In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set up directories
data_path = '../data'
plots_path = '../plots'
os.makedirs(plots_path, exist_ok=True)

# Load raw datasets
fraud_data = pd.read_csv(os.path.join(data_path, 'Fraud_Data.csv'))
creditcard_data = pd.read_csv(os.path.join(data_path, 'creditcard.csv'))

# Convert datetime columns
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'], infer_datetime_format=True, errors='coerce')
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'], infer_datetime_format=True, errors='coerce')

# Univariate Analysis
# Fraud Data: purchase_value
plt.figure(figsize=(10, 6))
sns.histplot(fraud_data['purchase_value'], bins=50, kde=True)
plt.title('Distribution of Purchase Value (Fraud Data)')
plt.xlabel('Purchase Value')
plt.ylabel('Frequency')
plt.savefig(os.path.join(plots_path, 'purchase_value_dist.png'))
plt.close()

# Fraud Data: age
plt.figure(figsize=(10, 6))
sns.histplot(fraud_data['age'], bins=30, kde=True)
plt.title('Distribution of Age (Fraud Data)')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.savefig(os.path.join(plots_path, 'age_dist.png'))
plt.close()

# Fraud Data: source
plt.figure(figsize=(10, 6))
sns.countplot(x='source', data=fraud_data)
plt.title('Distribution of Source (Fraud Data)')
plt.xlabel('Source')
plt.ylabel('Count')
plt.savefig(os.path.join(plots_path, 'source_dist.png'))
plt.close()

# Creditcard Data: Amount
plt.figure(figsize=(10, 6))
sns.histplot(creditcard_data['Amount'], bins=50, kde=True)
plt.title('Distribution of Amount (Creditcard Data)')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.savefig(os.path.join(plots_path, 'amount_dist.png'))
plt.close()

# Class Distribution
plt.figure(figsize=(10, 6))
sns.countplot(x='class', data=fraud_data)
plt.title('Class Distribution (Fraud Data)')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Count')
plt.savefig(os.path.join(plots_path, 'fraud_class_dist.png'))
plt.close()

plt.figure(figsize=(10, 6))
sns.countplot(x='Class', data=creditcard_data)
plt.title('Class Distribution (Creditcard Data)')
plt.xlabel('Class (0: Non-Fraud, 1: Fraud)')
plt.ylabel('Count')
plt.savefig(os.path.join(plots_path, 'creditcard_class_dist.png'))
plt.close()

# Bivariate Analysis
# Purchase Value vs Class (Fraud Data)
plt.figure(figsize=(10, 6))
sns.boxplot(x='class', y='purchase_value', data=fraud_data)
plt.title('Purchase Value vs Class (Fraud Data)')
plt.xlabel('Class')
plt.ylabel('Purchase Value')
plt.savefig(os.path.join(plots_path, 'purchase_value_vs_class.png'))
plt.close()

# Age vs Class (Fraud Data)
plt.figure(figsize=(10, 6))
sns.boxplot(x='class', y='age', data=fraud_data)
plt.title('Age vs Class (Fraud Data)')
plt.xlabel('Class')
plt.ylabel('Age')
plt.savefig(os.path.join(plots_path, 'age_vs_class.png'))
plt.close()

# Correlation Matrix (Creditcard Data)
plt.figure(figsize=(12, 8))
sns.heatmap(creditcard_data.corr(), cmap='coolwarm', annot=False)
plt.title('Correlation Matrix (Creditcard Data)')
plt.savefig(os.path.join(plots_path, 'creditcard_corr_matrix.png'))
plt.close()

# Load processed fraud data for post-processing analysis
processed_fraud = pd.read_csv(os.path.join(data_path, 'processed_fraud_train.csv'))

# Country Distribution
plt.figure(figsize=(10, 6))
sns.countplot(y='country', data=processed_fraud, order=processed_fraud['country'].value_counts().index[:10])
plt.title('Top 10 Countries (Fraud Data)')
plt.xlabel('Count')
plt.ylabel('Country')
plt.savefig(os.path.join(plots_path, 'country_dist.png'))
plt.close()

# Time Since Signup Distribution
plt.figure(figsize=(10, 6))
sns.histplot(processed_fraud['time_since_signup'], bins=50, kde=True)
plt.title('Distribution of Time Since Signup (Fraud Data)')
plt.xlabel('Time Since Signup (Hours)')
plt.ylabel('Frequency')
plt.savefig(os.path.join(plots_path, 'time_since_signup_dist.png'))
plt.close()

# Hour of Day vs Class
plt.figure(figsize=(10, 6))
sns.histplot(data=processed_fraud, x='hour_of_day', hue='class', multiple='stack')
plt.title('Hour of Day vs Class (Fraud Data)')
plt.xlabel('Hour of Day')
plt.ylabel('Count')
plt.savefig(os.path.join(plots_path, 'hour_of_day_vs_class.png'))
plt.close()

print('EDA plots saved to plots/ folder.')


  fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'], infer_datetime_format=True, errors='coerce')
  fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'], infer_datetime_format=True, errors='coerce')


KeyError: 'country'

<Figure size 1000x600 with 0 Axes>