In [None]:
# ---------------------------------------------
# Task 1: Complete EDA & Preprocessing for Fraud (E-commerce + Credit Card)
# ---------------------------------------------

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# -------------------------------
# E-COMMERCE DATA PROCESSING
# -------------------------------

# Load the raw e-commerce transactions and the IP-range -> country lookup table
fraud = pd.read_csv("../data/raw/Fraud_Data.csv")
ip_country = pd.read_csv("../data/raw/IpAddress_to_Country.csv")

# Handle missing values: median for the numeric age, most frequent value for
# the categoricals. The result is assigned back to the column rather than
# calling ``fillna(inplace=True)`` on a column selection: that form is chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves ``fraud``
# unchanged once Copy-on-Write is enabled.
fraud['age'] = fraud['age'].fillna(fraud['age'].median())
fraud['browser'] = fraud['browser'].fillna(fraud['browser'].mode()[0])
fraud['source'] = fraud['source'].fillna(fraud['source'].mode()[0])

# Remove exact duplicate rows
fraud.drop_duplicates(inplace=True)

# Fix data types: parse both timestamps and make the target an integer label
fraud['signup_time'] = pd.to_datetime(fraud['signup_time'])
fraud['purchase_time'] = pd.to_datetime(fraud['purchase_time'])
fraud['class'] = fraud['class'].astype(int)

# Class distribution (shows how imbalanced the target is)
print("E-commerce fraud class distribution:\n", fraud['class'].value_counts())

# Convert IP to integer
def ip_to_int(ip):
    """Convert an IPv4 address to its integer representation.

    Args:
        ip: Either a dotted string such as ``"192.168.0.1"`` or a value that
            is already numeric (int/float, including NumPy scalars).

    Returns:
        The address as a Python ``int``. Dotted input is read as base-256
        digits, most significant first; numeric input is truncated toward
        zero.

    Raises:
        ValueError: If a dotted component is not an integer literal, or if a
            numeric input is NaN.
    """
    # A CSV column of numeric IPs is parsed by pandas as int/float, and those
    # values have no ``.split``; they only need truncating to an integer.
    if not isinstance(ip, str):
        return int(ip)

    # Horner evaluation of the base-256 digits, left (most significant) first.
    value = 0
    for octet in ip.split('.'):
        value = value * 256 + int(octet)
    return value

# Integer form of every transaction's IP address
fraud['ip_int'] = fraud['ip_address'].apply(ip_to_int)

# Integer bounds of each country's IP range. The cast names a 64-bit dtype
# explicitly: plain ``int`` maps to the platform C long, which is 32-bit on
# Windows with NumPy < 2 and cannot hold IPv4 values above 2**31 - 1.
ip_country['lower'] = ip_country['lower_bound_ip_address'].astype('int64')
ip_country['upper'] = ip_country['upper_bound_ip_address'].astype('int64')

def find_country(ip):
    """Return the country whose IP range contains ``ip``.

    Scans the module-level ``ip_country`` table for rows where
    ``lower <= ip <= upper`` and returns the ``country`` of the first such
    row, or ``'Unknown'`` when no range matches.
    """
    in_range = ip_country['lower'].le(ip) & ip_country['upper'].ge(ip)
    matches = ip_country[in_range]
    if matches.empty:
        return 'Unknown'
    return matches['country'].values[0]

# Resolve each transaction's integer IP to a country name
fraud['country'] = fraud['ip_int'].apply(find_country)

# Time-based features derived from the signup and purchase timestamps
signup_to_purchase = fraud['purchase_time'] - fraud['signup_time']
fraud['time_since_signup'] = signup_to_purchase.dt.total_seconds()
purchase_dt = fraud['purchase_time'].dt
fraud['hour_of_day'] = purchase_dt.hour
fraud['day_of_week'] = purchase_dt.dayofweek

# Per-user activity: number of transactions and number of distinct devices
per_user = fraud.groupby('user_id')
txn_counts = per_user['user_id'].transform('count')
device_counts = per_user['device_id'].transform('nunique')
fraud['txn_count_user'] = txn_counts
fraud['unique_devices'] = device_counts

# One-hot encode the categorical features
categorical_cols = ['browser', 'source', 'sex', 'country']
fraud_encoded = pd.get_dummies(fraud, columns=categorical_cols, drop_first=True)

# Columns that must not reach the model: identifiers, raw timestamps and the
# dotted-string IP. SMOTE only accepts numeric input, so leaving the string /
# datetime columns in would make ``fit_resample`` fail; their information is
# already carried by the derived features (time_since_signup, hour_of_day,
# day_of_week, txn_count_user, unique_devices, ip_int, country dummies).
non_feature_cols = ['user_id', 'device_id', 'signup_time', 'purchase_time',
                    'ip_address']

# Separate features and target
X_fraud = fraud_encoded.drop(columns=['class'] + non_feature_cols)
y_fraud = fraud_encoded['class']

# Stratified train-test split (done BEFORE scaling, see below)
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_fraud, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42
)
# Work on explicit copies so the scaled values are written to independent frames
X_train_f = X_train_f.copy()
X_test_f = X_test_f.copy()

# Scale numeric columns. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
# NOTE: ``fraud_encoded`` itself is left unscaled.
num_cols = ['purchase_value', 'time_since_signup', 'txn_count_user']
scaler = StandardScaler()
X_train_f[num_cols] = scaler.fit_transform(X_train_f[num_cols])
X_test_f[num_cols] = scaler.transform(X_test_f[num_cols])

# Oversample the minority class on the training split only; the test split
# keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_f_res, y_train_f_res = smote.fit_resample(X_train_f, y_train_f)

# Save processed e-commerce data
X_train_f_res.to_csv("../data/processed/X_train_fraud.csv", index=False)
X_test_f.to_csv("../data/processed/X_test_fraud.csv", index=False)
y_train_f_res.to_csv("../data/processed/y_train_fraud.csv", index=False)
y_test_f.to_csv("../data/processed/y_test_fraud.csv", index=False)

print("\n✅ E-commerce data processed and saved.\n")

# -------------------------------
# CREDIT CARD DATA PROCESSING
# -------------------------------

# Read the raw credit card transactions
credit = pd.read_csv("../data/raw/creditcard.csv")

# Report how many rows fall in each class
credit_class_counts = credit['Class'].value_counts()
print("Credit card fraud class distribution:\n", credit_class_counts)

# Target is the 'Class' column; every other column is a feature
y_credit = credit['Class']
X_credit = credit.drop(columns=['Class'])

# Stratified train-test split (done BEFORE scaling, see below)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_credit, y_credit, test_size=0.2, stratify=y_credit, random_state=42
)
# Work on explicit copies so the scaled values are written to independent frames
X_train_c = X_train_c.copy()
X_test_c = X_test_c.copy()

# Scale Amount (Time is left as-is). The scaler is fitted on the training
# split only and then applied to the test split, so no test-set statistics
# leak into training. NOTE: ``X_credit`` itself is left unscaled.
scaler_cc = StandardScaler()
X_train_c['Amount'] = scaler_cc.fit_transform(X_train_c[['Amount']]).ravel()
X_test_c['Amount'] = scaler_cc.transform(X_test_c[['Amount']]).ravel()

# Handle class imbalance with SMOTE on the training split only; the test
# split keeps its original class distribution.
smote_cc = SMOTE(random_state=42)
X_train_c_res, y_train_c_res = smote_cc.fit_resample(X_train_c, y_train_c)

# Save processed credit card data
X_train_c_res.to_csv("../data/processed/X_train_credit.csv", index=False)
X_test_c.to_csv("../data/processed/X_test_credit.csv", index=False)
y_train_c_res.to_csv("../data/processed/y_train_credit.csv", index=False)
y_test_c.to_csv("../data/processed/y_test_credit.csv", index=False)

print("\n✅ Credit card data processed and saved.\n")
print("✅ Task 1 fully completed for both datasets!")

