Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load raw data
print("Loading raw data...")
fraud_df = pd.read_csv('../data/raw/Fraud_Data.csv', parse_dates=['purchase_time', 'signup_time'])
ip_map_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')
creditcard_df = pd.read_csv('../data/raw/creditcard.csv')

print(f"Fraud data: {fraud_df.shape}")
print(f"IP mapping: {ip_map_df.shape}")
print(f"Credit card data: {creditcard_df.shape}")


Loading raw data...
Fraud data: (151112, 11)
IP mapping: (138846, 3)
Credit card data: (284807, 31)


Fraud Data Cleaning

In [2]:
print("=== Fraud Data Cleaning ===")

# Remove duplicates
initial_len = len(fraud_df)
fraud_df = fraud_df.drop_duplicates()
print(f"Removed {initial_len - len(fraud_df)} duplicate rows")

# Handle invalid ages
fraud_df = fraud_df[fraud_df['age'] >= 13]  # Minimum reasonable age
fraud_df = fraud_df[fraud_df['age'] <= 100]  # Maximum reasonable age

# Handle outliers in purchase_value
Q1 = fraud_df['purchase_value'].quantile(0.005)
Q3 = fraud_df['purchase_value'].quantile(0.995)
fraud_df = fraud_df[(fraud_df['purchase_value'] >= Q1) & (fraud_df['purchase_value'] <= Q3)]

print(f"Final fraud data shape: {fraud_df.shape}")

=== Fraud Data Cleaning ===
Removed 0 duplicate rows
Final fraud data shape: (150375, 11)


IP â†’ Country Mapping

In [9]:
def ip_to_int(ip):
    try:
        a, b, c, d = map(int, ip.split("."))
        return a * 256**3 + b * 256**2 + c * 256 + d
    except:
        return np.nan

fraud_df["ip_int"] = fraud_df["ip_address"].apply(ip_to_int)

ip_map_df["lower_bound_ip_address"] = pd.to_numeric(
    ip_map_df["lower_bound_ip_address"], errors="coerce"
)
ip_map_df["upper_bound_ip_address"] = pd.to_numeric(
    ip_map_df["upper_bound_ip_address"], errors="coerce"
)

fraud_df = fraud_df.sort_values("ip_int")
ip_map_df = ip_map_df.sort_values("lower_bound_ip_address")

fraud_df = pd.merge_asof(
    fraud_df,
    ip_map_df,
    left_on="ip_int",
    right_on="lower_bound_ip_address",
    direction="backward"
)

fraud_df = fraud_df[
    (fraud_df["ip_int"] >= fraud_df["lower_bound_ip_address"]) &
    (fraud_df["ip_int"] <= fraud_df["upper_bound_ip_address"])
]

print(f"Mapped countries: {fraud_df['country'].nunique()}")


Mapped countries: 0


Transaction Pattern Features

In [10]:
fraud_df = fraud_df.sort_values(["user_id", "purchase_time"])

fraud_df["transactions_per_user"] = fraud_df.groupby("user_id")["user_id"].transform("count")

stats = fraud_df.groupby("user_id")["purchase_value"].agg(
    avg_purchase_value="mean",
    std_purchase_value="std",
    total_purchase_value="sum"
).fillna(0)

fraud_df = fraud_df.merge(stats, on="user_id", how="left")

fraud_df["value_vs_user_avg"] = (
    fraud_df["purchase_value"] / fraud_df["avg_purchase_value"].replace(0, np.nan)
).fillna(0)

high_value_thresh = fraud_df["purchase_value"].quantile(0.9)
fraud_df["is_high_value"] = (fraud_df["purchase_value"] >= high_value_thresh).astype(int)

fraud_df["is_value_spike"] = (
    fraud_df["purchase_value"] >
    (fraud_df["avg_purchase_value"] + 2 * fraud_df["std_purchase_value"])
).astype(int)

Risk Scores (NO LEAKAGE)