In [33]:
# ============================================================
# Environment setup
# ============================================================
from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import Tuple, List

import numpy as np
import pandas as pd

from sklearn.preprocessing import OneHotEncoder

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Prefer IPython's rich display; degrade to plain print when IPython is missing.
try:
    from IPython.display import display
except ImportError:
    display = print

# ----------------------------
# Reproducibility
# ----------------------------
RNG_SEED: int = 42

# Seed NumPy, then torch (CPU), then torch (all CUDA devices) with the same value.
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(RNG_SEED)

# Pin cuDNN to deterministic mode and switch off its benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# ----------------------------
# Paths
# ----------------------------
# Resolve the project root: step one level up when the kernel was started
# inside a directory named "notebooks", otherwise use the working directory.
_launch_dir = Path.cwd()
PROJECT_ROOT = _launch_dir.parent if _launch_dir.name == "notebooks" else _launch_dir

# NOTE: despite its name, DATASET_DIR is the path of the CSV file itself.
DATASET_DIR = PROJECT_ROOT / "data/dataset.csv"
MODELS_DIR = PROJECT_ROOT / "models"
PYTHON_DIR = PROJECT_ROOT / "python"

# Create models/ up front; an already existing directory is not an error.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Register python/ on the import path exactly once.
_python_dir_str = str(PYTHON_DIR)
if _python_dir_str not in sys.path:
    sys.path.append(_python_dir_str)

# ----------------------------
# Device
# ----------------------------
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print(f"Using device: {DEVICE}")

Using device: cuda


In [4]:
# ----------------------------
# Load dataset
# ----------------------------
# Read the whole CSV in one pass (low_memory=False avoids chunk-wise dtype
# inference) and strip surrounding whitespace from every column name.
df = pd.read_csv(DATASET_DIR, low_memory=False).rename(columns=str.strip)

In [5]:
# ----------------------------
# Quick Inspect Data
# ----------------------------
def quick_inspect_data(df: pd.DataFrame) -> None:
    """Print a short overview of a DataFrame.

    Shows, in order: the shape, the list of column names, the first rows
    (via ``display``) and the ``DataFrame.info()`` report.

    Args:
        df: Frame to summarise. It is only read, never modified.

    Returns:
        None. Everything is written to the cell output / stdout.
    """
    print(f"Shapes: {df.shape}")

    print("Columns: ")
    print(df.columns.tolist(), "\n")

    print("First rows: ")
    display(df.head())

    print("Info about the dataset: ")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
# Print the overview (shape, columns, head, info) for the loaded frame.
quick_inspect_data(df)

Shapes: (299695, 17)
Columns: 
['transaction_id', 'user_id', 'account_age_days', 'total_transactions_user', 'avg_amount_user', 'amount', 'country', 'bin_country', 'channel', 'merchant_category', 'promo_used', 'avs_match', 'cvv_result', 'three_ds_flag', 'transaction_time', 'shipping_distance_km', 'is_fraud'] 

First rows: 


Unnamed: 0,transaction_id,user_id,account_age_days,total_transactions_user,avg_amount_user,amount,country,bin_country,channel,merchant_category,promo_used,avs_match,cvv_result,three_ds_flag,transaction_time,shipping_distance_km,is_fraud
0,1,1,141,47,147.93,84.75,FR,FR,web,travel,0,1,1,1,2024-01-06T04:09:39Z,370.95,0
1,2,1,141,47,147.93,107.9,FR,FR,web,travel,0,0,0,0,2024-01-09T20:13:47Z,149.62,0
2,3,1,141,47,147.93,92.36,FR,FR,app,travel,1,1,1,1,2024-01-12T06:20:11Z,164.08,0
3,4,1,141,47,147.93,112.47,FR,FR,web,fashion,0,1,1,1,2024-01-15T17:00:04Z,397.4,0
4,5,1,141,47,147.93,132.91,FR,US,web,electronics,0,1,1,1,2024-01-17T01:27:31Z,935.28,0


Info about the dataset: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299695 entries, 0 to 299694
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   transaction_id           299695 non-null  int64  
 1   user_id                  299695 non-null  int64  
 2   account_age_days         299695 non-null  int64  
 3   total_transactions_user  299695 non-null  int64  
 4   avg_amount_user          299695 non-null  float64
 5   amount                   299695 non-null  float64
 6   country                  299695 non-null  object 
 7   bin_country              299695 non-null  object 
 8   channel                  299695 non-null  object 
 9   merchant_category        299695 non-null  object 
 10  promo_used               299695 non-null  int64  
 11  avs_match                299695 non-null  int64  
 12  cvv_result               299695 non-null  int64  
 13  three_ds_flag            299695 no

In [8]:
# ============================================================
# Train / Validation / Test split (stratified, no helpers)
# ============================================================

TARGET_COL: str = "is_fraud"

# Work on a copy without the raw transaction identifier, then separate the
# label column (y_df) from the remaining feature columns (X_df).
df_model = df.drop(columns=["transaction_id"])
y_df = df_model[TARGET_COL]
X_df = df_model.drop(columns=[TARGET_COL])

# Settings shared by both splitting steps.
_split_kwargs = dict(random_state=RNG_SEED, shuffle=True)

# ----------------------------
# Step 1: hold out 15% of the full data as validation (stratified on the label)
# ----------------------------
X_train_temp, X_val, y_train_temp, y_val = train_test_split(
    X_df,
    y_df,
    test_size=0.15,
    stratify=y_df,
    **_split_kwargs,
)

# ----------------------------
# Step 2: split the remaining 85% into train / test (stratified).
# For the test set to be 15% of the FULL data, the fraction taken from the
# remainder must be 0.15 / 0.85 (approx. 0.17647).
# ----------------------------
test_size_rel = 0.15 / 0.85

X_train, X_test, y_train, y_test = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=test_size_rel,
    stratify=y_train_temp,
    **_split_kwargs,
)

# ----------------------------
# Sanity check: shapes + class balance
# ----------------------------
def _fraud_ratio(y):
    return float((y == 1).mean() * 100.0)


n_total = len(y_df)

# (label, features) pairs in reporting order; labels are padded to a common
# width so the numbers line up in the printed report.
_feature_splits = (("Train:", X_train), ("Val:", X_val), ("Test:", X_test))
_target_splits = (("Full:", y_df), ("Train:", y_train), ("Val:", y_val), ("Test:", y_test))

print("Split shapes:")
for _label, _X_part in _feature_splits:
    print(f"  {_label:<6} {_X_part.shape[0]} rows")

print("\nSplit percentages (of full data):")
for _label, _X_part in _feature_splits:
    print(f"  {_label:<6} {_X_part.shape[0] / n_total * 100:.2f}%")

print("\nFraud ratio per split:")
for _label, _y_part in _target_splits:
    print(f"  {_label:<6} {_fraud_ratio(_y_part):5.2f}%")


Split shapes:
  Train: 209785 rows
  Val:   44955 rows
  Test:  44955 rows

Split percentages (of full data):
  Train: 70.00%
  Val:   15.00%
  Test:  15.00%

Fraud ratio per split:
  Full:   2.21%
  Train:  2.21%
  Val:    2.21%
  Test:   2.21%


In [12]:
# Peek at the training split: shapes, the first five rows, and column dtypes.
_train_shapes_msg = f"train data shape X: {X_train.shape} , shape y: {y_train.shape}"
print(_train_shapes_msg)
display(X_train.head())  # head() defaults to 5 rows
X_train.info()

train data shape X: (209785, 15) , shape y: (209785,)


Unnamed: 0,user_id,account_age_days,total_transactions_user,avg_amount_user,amount,country,bin_country,channel,merchant_category,promo_used,avs_match,cvv_result,three_ds_flag,transaction_time,shipping_distance_km
33273,668,869,57,405.04,363.85,NL,TR,web,grocery,0,1,1,1,2024-06-03T08:21:36Z,943.93
25170,507,928,42,49.78,147.58,RO,RO,app,grocery,0,1,0,0,2024-01-07T20:04:48Z,442.9
37254,748,1314,58,42.29,28.01,NL,NL,web,gaming,0,1,1,1,2024-08-21T14:51:02Z,9.85
239229,4793,1232,44,80.27,95.53,NL,NL,app,travel,0,0,1,1,2024-10-18T11:18:55Z,116.42
18836,379,1741,42,804.16,1016.36,IT,IT,web,travel,0,0,1,1,2024-01-03T04:14:06Z,429.14


<class 'pandas.core.frame.DataFrame'>
Index: 209785 entries, 33273 to 105894
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  209785 non-null  int64  
 1   account_age_days         209785 non-null  int64  
 2   total_transactions_user  209785 non-null  int64  
 3   avg_amount_user          209785 non-null  float64
 4   amount                   209785 non-null  float64
 5   country                  209785 non-null  object 
 6   bin_country              209785 non-null  object 
 7   channel                  209785 non-null  object 
 8   merchant_category        209785 non-null  object 
 9   promo_used               209785 non-null  int64  
 10  avs_match                209785 non-null  int64  
 11  cvv_result               209785 non-null  int64  
 12  three_ds_flag            209785 non-null  int64  
 13  transaction_time         209785 non-null  object 
 14  shipp

In [34]:
# ============================================================
# Categorical encoding with OneHotEncoder
# ============================================================

# Columns for one-hot encoding
# Columns expanded into one indicator column per category
CATEGORICAL_OHE_COLS = ["country", "bin_country", "merchant_category"]

# 'channel' is collapsed into a single integer column via a fixed mapping
CHANNEL_COL = "channel"
CHANNEL_MAPPING = {"web": 0, "app": 1}

# ----------------------------
# Work on copies so the raw splits stay untouched
# ----------------------------
X_train_enc, X_val_enc, X_test_enc = X_train.copy(), X_val.copy(), X_test.copy()

# ----------------------------
# Replace 'channel' by its integer code in every split.
# NOTE: a value missing from CHANNEL_MAPPING maps to NaN, which makes the
# int8 cast raise instead of silently passing through.
# ----------------------------
for df_split in (X_train_enc, X_val_enc, X_test_enc):
    channel_codes = df_split[CHANNEL_COL].map(CHANNEL_MAPPING)
    df_split[CHANNEL_COL] = channel_codes.astype("int8")

# ----------------------------
# Fit the encoder on the TRAIN split only; categories that appear only in
# val/test are ignored at transform time (handle_unknown="ignore").
# ----------------------------
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,  # dense numpy array instead of a sparse matrix
    dtype=np.int8,
).fit(X_train_enc[CATEGORICAL_OHE_COLS])

ohe_feature_names = ohe.get_feature_names_out(CATEGORICAL_OHE_COLS)

# ----------------------------
# Transform each split and rebuild clean feature matrices
# ----------------------------
def build_encoded_frame(
    X_source: pd.DataFrame,
    encoder: OneHotEncoder,
    cat_cols: List[str],
    ohe_feature_names: np.ndarray,
    preserve_index: bool = False,
) -> pd.DataFrame:
    """
    Build a fully encoded feature DataFrame for a given split.

    The columns listed in ``cat_cols`` are replaced by their one-hot
    encoding; every other column of ``X_source`` is kept as-is and placed
    first, followed by the one-hot columns.

    Args:
        X_source: Split to encode. It is not modified.
        encoder: Already fitted encoder; only ``encoder.transform`` is called.
        cat_cols: Names of the columns to one-hot encode.
        ohe_feature_names: Column names for the encoded block, in the same
            order as the columns returned by ``encoder.transform``.
        preserve_index: When False (default) the result gets a fresh
            0..n-1 index. When True the index of ``X_source`` is kept, so
            the encoded features stay label-aligned with any target Series
            that still carries the original index.

    Returns:
        Encoded DataFrame with one row per row of ``X_source``.
    """
    # One-hot encode categorical subset
    cat_ohe = encoder.transform(X_source[cat_cols])

    # An encoder configured for sparse output returns a sparse matrix;
    # densify it so it can be wrapped in a regular DataFrame below.
    if hasattr(cat_ohe, "toarray"):
        cat_ohe = cat_ohe.toarray()

    # Numeric + non-OHE columns (everything except categorical OHE cols)
    X_num = X_source.drop(columns=cat_cols)
    if not preserve_index:
        X_num = X_num.reset_index(drop=True)

    # Give both parts the same index so the column-wise concat lines rows up
    # one-to-one and never introduces NaN through index alignment.
    X_ohe = pd.DataFrame(cat_ohe, columns=ohe_feature_names, index=X_num.index)

    # Combine numeric and one-hot encoded features
    return pd.concat([X_num, X_ohe], axis=1)


# Encode train / val / test with the same fitted encoder and column list so
# all three frames are built by the identical procedure.
X_train_encoded, X_val_encoded, X_test_encoded = (
    build_encoded_frame(
        X_source=_split_frame,
        encoder=ohe,
        cat_cols=CATEGORICAL_OHE_COLS,
        ohe_feature_names=ohe_feature_names,
    )
    for _split_frame in (X_train_enc, X_val_enc, X_test_enc)
)

# ----------------------------
# Quick schema check
# ----------------------------
print("Encoded shapes:")
for _name, _encoded in (
    ("X_train_encoded:", X_train_encoded),
    ("X_val_encoded:", X_val_encoded),
    ("X_test_encoded:", X_test_encoded),
):
    print(f"  {_name:<16} {_encoded.shape}")


Encoded shapes:
  X_train_encoded: (209785, 37)
  X_val_encoded:   (44955, 37)
  X_test_encoded:  (44955, 37)
