# In [17]:
# ---------------------------------------------
# EDA & Preprocessing for E-commerce Fraud
# ---------------------------------------------

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Import helper functions from src/utils.py
from src.utils import ip_to_int, find_country, plot_correlation_heatmap, plot_bivariate_categorical

# -------------------------------
# Load datasets
# -------------------------------
# Raw inputs: the transaction table and the IP-range -> country lookup table.
fraud_csv_path = "../data/raw/Fraud_Data.csv"
ip_country_csv_path = "../data/raw/IpAddress_to_Country.csv"

fraud = pd.read_csv(fraud_csv_path)
ip_country = pd.read_csv(ip_country_csv_path)

# -------------------------------
# Data Cleaning
# -------------------------------
# Impute missing values: median for the numeric `age`, most frequent value
# for the categorical `browser` / `source`, and a sentinel for `ip_address`.
#
# Each filled column is assigned back to the frame. Calling
# `fraud[col].fillna(..., inplace=True)` operates on a temporary selection;
# pandas warns about this pattern and, with Copy-on-Write enabled, the parent
# DataFrame is never updated, so the imputation would be silently skipped.
fraud['age'] = fraud['age'].fillna(fraud['age'].median())
fraud['browser'] = fraud['browser'].fillna(fraud['browser'].mode()[0])
fraud['source'] = fraud['source'].fillna(fraud['source'].mode()[0])
# NOTE(review): the sentinel is a dotted-quad string; confirm ip_to_int accepts
# it alongside whatever dtype the CSV column is parsed as.
fraud['ip_address'] = fraud['ip_address'].fillna('0.0.0.0')

# Remove exact duplicate rows (the original index labels are kept).
fraud.drop_duplicates(inplace=True)

# Normalise dtypes: timestamps as datetime64, target label as integer.
fraud['signup_time'] = pd.to_datetime(fraud['signup_time'])
fraud['purchase_time'] = pd.to_datetime(fraud['purchase_time'])
fraud['class'] = fraud['class'].astype(int)

# -------------------------------
# Feature Engineering
# -------------------------------
# NOTE(review): the recorded output of this cell shows `ip_to_int` failing to
# import from src.utils; confirm the helper is defined there before running.

# Integer form of each transaction's IP address (via the project helper).
ip_as_int = fraud['ip_address'].apply(ip_to_int)
fraud['ip_int'] = ip_as_int

# Integer copies of the range bounds, stored as `lower` / `upper`.
for bound in ('lower', 'upper'):
    ip_country[bound] = ip_country[f'{bound}_bound_ip_address'].astype(int)


def _country_for_ip(ip_value):
    """Look up the country for one integer IP using the ip_country table."""
    return find_country(ip_value, ip_country)


# Row-by-row country lookup.
# NOTE(review): this calls find_country once per transaction; if it scans the
# range table each time, a sorted range join may be considerably faster.
fraud['country'] = fraud['ip_int'].apply(_country_for_ip)

# Time-based features: seconds elapsed between signup and purchase, plus the
# hour and weekday (Monday=0) on which the purchase happened.
signup_to_purchase = fraud['purchase_time'] - fraud['signup_time']
purchase_dt = fraud['purchase_time'].dt

fraud['time_since_signup'] = signup_to_purchase.dt.total_seconds()
fraud['hour_of_day'] = purchase_dt.hour
fraud['day_of_week'] = purchase_dt.dayofweek

# Per-user aggregates broadcast back to every row of that user:
# number of transactions and number of distinct devices.
per_user = fraud.groupby('user_id')
txn_counts = per_user['user_id'].transform('count')
device_counts = per_user['device_id'].transform('nunique')

fraud['txn_count_user'] = txn_counts
fraud['unique_devices'] = device_counts

# -------------------------------
# EDA: Univariate Analysis
# -------------------------------
# Row counts per target label, to show how imbalanced the classes are.
class_counts = fraud['class'].value_counts()
print("E-commerce fraud class distribution:\n", class_counts)

# -------------------------------
# EDA: Bivariate Analysis
# -------------------------------
# One plot per categorical feature, drawn by the project helper.
for col in ('browser', 'source', 'sex', 'country'):
    plot_bivariate_categorical(fraud, col)

# Correlation heatmap over the engineered frame (project helper).
plot_correlation_heatmap(fraud)

# -------------------------------
# Preprocessing for Modeling
# -------------------------------
# One-hot encode the categorical features (first level dropped per feature).
categorical_cols = ['browser', 'source', 'sex', 'country']
fraud_encoded = pd.get_dummies(fraud, columns=categorical_cols, drop_first=True)

# Prepare X, y: drop the target plus identifier / raw-timestamp columns that
# cannot be fed to the model directly.
non_numeric_cols = ['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time']
X_fraud = fraud_encoded.drop(columns=['class'] + non_numeric_cols)
y_fraud = fraud_encoded['class']

# Train-test split (stratified so both splits keep the class ratio).
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(
    X_fraud, y_fraud, test_size=0.2, stratify=y_fraud, random_state=42
)

# Scale AFTER splitting: the scaler learns mean/std from the training rows
# only and the test rows are merely transformed. Fitting on the full dataset
# before the split leaks test-set statistics into training.
num_cols = ['purchase_value', 'time_since_signup', 'txn_count_user']
scaler = StandardScaler()
# Work on explicit copies so the assignments below never write into X_fraud.
X_train_f = X_train_f.copy()
X_test_f = X_test_f.copy()
X_train_f[num_cols] = scaler.fit_transform(X_train_f[num_cols])
X_test_f[num_cols] = scaler.transform(X_test_f[num_cols])

# SMOTE resampling: oversample the minority class on the training split only,
# so the test split keeps the original class distribution.
smote = SMOTE(random_state=42)
X_train_f_res, y_train_f_res = smote.fit_resample(X_train_f, y_train_f)

# Save processed data
from pathlib import Path

# Create the output directory up front; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)

# Resampled training split and untouched test split, written without the index.
X_train_f_res.to_csv(processed_dir / "X_train_fraud.csv", index=False)
X_test_f.to_csv(processed_dir / "X_test_fraud.csv", index=False)
y_train_f_res.to_csv(processed_dir / "y_train_fraud.csv", index=False)
y_test_f.to_csv(processed_dir / "y_test_fraud.csv", index=False)

# The check mark was previously stored as mis-decoded UTF-8 bytes.
print("\n✅ E-commerce data processed and saved.\n")


# Cell output: ImportError: cannot import name 'ip_to_int' from 'src.utils' (c:\Users\sciec\Improved-Detection-Of-Fraud-Cases-For-e-commerce-And-Bank-Transactions\src\utils.py)