In [1]:
# ============================================================
# Environment setup
# ============================================================
from __future__ import annotations

import os
import sys
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

try:
    from IPython.display import display
except ImportError:  
    display = print 

# ----------------------------
# Reproducibility
# ----------------------------
# A single seed is shared by PyTorch (CPU generator and every CUDA device) and NumPy.
RNG_SEED: int = 42

torch.manual_seed(RNG_SEED)
torch.cuda.manual_seed_all(RNG_SEED)
np.random.seed(RNG_SEED)

# Request deterministic cuDNN behaviour and turn cuDNN benchmarking off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# ----------------------------
# Paths
# ----------------------------

# The project root is the working directory, or its parent when the kernel
# was started from inside a "notebooks" folder.
PROJECT_ROOT = Path.cwd()
PROJECT_ROOT = PROJECT_ROOT.parent if PROJECT_ROOT.name == "notebooks" else PROJECT_ROOT

DATASET_DIR = PROJECT_ROOT.joinpath("data/dataset.csv")  # path of the CSV file itself
MODELS_DIR = PROJECT_ROOT.joinpath("models")
PYTHON_DIR = PROJECT_ROOT.joinpath("python")

# Create the models folder up front; a pre-existing folder is not an error.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Put the local python/ folder on the import path exactly once.
if sys.path.count(str(PYTHON_DIR)) == 0:
    sys.path.append(str(PYTHON_DIR))

# ----------------------------
# Device
# ----------------------------
# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

Using device: cuda


In [4]:
# ----------------------------
# Load dataset
# ----------------------------
# low_memory=False asks pandas to infer each column's dtype from the whole
# file instead of chunk by chunk.
df = pd.read_csv(DATASET_DIR, low_memory=False)

# Trim leading/trailing whitespace from every column name.
df = df.rename(columns=str.strip)

In [5]:
# ----------------------------
# Quick Inspect Data
# ----------------------------
def quick_inspect_data(df: pd.DataFrame) -> None:
    """Print a quick overview of ``df``: shape, column names, first rows, and ``info()``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarize. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to stdout / the notebook output area.
    """
    print(f"Shapes: {df.shape}")

    print("Columns: ")
    print(df.columns.tolist(), "\n")

    print("First rows: ")
    display(df.head())
    print("Info about the dataset: ")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would append a stray "None" line to the output.
    df.info()
# Print the overview (shape, column names, first rows, info) for the loaded frame.
quick_inspect_data(df)

Shapes: (299695, 17)
Columns: 
['transaction_id', 'user_id', 'account_age_days', 'total_transactions_user', 'avg_amount_user', 'amount', 'country', 'bin_country', 'channel', 'merchant_category', 'promo_used', 'avs_match', 'cvv_result', 'three_ds_flag', 'transaction_time', 'shipping_distance_km', 'is_fraud'] 

First rows: 


Unnamed: 0,transaction_id,user_id,account_age_days,total_transactions_user,avg_amount_user,amount,country,bin_country,channel,merchant_category,promo_used,avs_match,cvv_result,three_ds_flag,transaction_time,shipping_distance_km,is_fraud
0,1,1,141,47,147.93,84.75,FR,FR,web,travel,0,1,1,1,2024-01-06T04:09:39Z,370.95,0
1,2,1,141,47,147.93,107.9,FR,FR,web,travel,0,0,0,0,2024-01-09T20:13:47Z,149.62,0
2,3,1,141,47,147.93,92.36,FR,FR,app,travel,1,1,1,1,2024-01-12T06:20:11Z,164.08,0
3,4,1,141,47,147.93,112.47,FR,FR,web,fashion,0,1,1,1,2024-01-15T17:00:04Z,397.4,0
4,5,1,141,47,147.93,132.91,FR,US,web,electronics,0,1,1,1,2024-01-17T01:27:31Z,935.28,0


Info about the dataset: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299695 entries, 0 to 299694
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   transaction_id           299695 non-null  int64  
 1   user_id                  299695 non-null  int64  
 2   account_age_days         299695 non-null  int64  
 3   total_transactions_user  299695 non-null  int64  
 4   avg_amount_user          299695 non-null  float64
 5   amount                   299695 non-null  float64
 6   country                  299695 non-null  object 
 7   bin_country              299695 non-null  object 
 8   channel                  299695 non-null  object 
 9   merchant_category        299695 non-null  object 
 10  promo_used               299695 non-null  int64  
 11  avs_match                299695 non-null  int64  
 12  cvv_result               299695 non-null  int64  
 13  three_ds_flag            299695 no

In [8]:
# ============================================================
# Train / Validation / Test split (stratified, no helpers)
# ============================================================

TARGET_COL: str = "is_fraud"

# Split fractions, expressed as shares of the FULL dataset.
# Whatever is left (1 - VAL_FRACTION - TEST_FRACTION) becomes the training set.
VAL_FRACTION: float = 0.15
TEST_FRACTION: float = 0.15
assert VAL_FRACTION > 0 and TEST_FRACTION > 0, "split fractions must be positive"
assert VAL_FRACTION + TEST_FRACTION < 1, "validation + test must leave room for training data"

# Work on a copy without the raw transaction ID column.
# NOTE(review): user_id is still kept as a feature here — confirm that is intended.
df_model = df.drop(columns=["transaction_id"])

# X_df holds the features, y_df the target.
X_df = df_model.drop(columns=[TARGET_COL])
y_df = df_model[TARGET_COL]

# ----------------------------
# Step 1: take VAL_FRACTION of the full data as validation (stratified)
# ----------------------------
X_train_temp, X_val, y_train_temp, y_val = train_test_split(
    X_df,
    y_df,
    test_size=VAL_FRACTION,  # global share -> validation
    stratify=y_df,
    random_state=RNG_SEED,
    shuffle=True,
)

# ----------------------------
# Step 2: split the remainder into train / test (stratified)
# The second split only sees (1 - VAL_FRACTION) of the data, so the global
# test share has to be rescaled to a share of that remainder:
#   test_size_rel = TEST_FRACTION / (1 - VAL_FRACTION) = 0.15 / 0.85 ≈ 0.17647
# ----------------------------
test_size_rel = TEST_FRACTION / (1.0 - VAL_FRACTION)

X_train, X_test, y_train, y_test = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=test_size_rel,
    stratify=y_train_temp,
    random_state=RNG_SEED,
    shuffle=True,
)

# Every row must land in exactly one of the three splits.
assert len(X_train) + len(X_val) + len(X_test) == len(X_df), "split sizes do not add up"

# ----------------------------
# Sanity check: shapes + class balance
# ----------------------------
def _fraud_ratio(y):
    return float((y == 1).mean() * 100.0)


# Row counts: the full dataset first, then each split.
n_total = len(y_df)
n_train, n_val, n_test = len(X_train), len(X_val), len(X_test)

# Absolute size of each split.
print(
    "Split shapes:",
    f"  Train: {n_train} rows",
    f"  Val:   {n_val} rows",
    f"  Test:  {n_test} rows",
    sep="\n",
)

# Size of each split relative to the full dataset.
print(
    "\nSplit percentages (of full data):",
    f"  Train: {n_train / n_total * 100:.2f}%",
    f"  Val:   {n_val / n_total * 100:.2f}%",
    f"  Test:  {n_test / n_total * 100:.2f}%",
    sep="\n",
)

# Positive-class share of the full target and of each split's target.
print(
    "\nFraud ratio per split:",
    f"  Full:  {_fraud_ratio(y_df):5.2f}%",
    f"  Train: {_fraud_ratio(y_train):5.2f}%",
    f"  Val:   {_fraud_ratio(y_val):5.2f}%",
    f"  Test:  {_fraud_ratio(y_test):5.2f}%",
    sep="\n",
)


Split shapes:
  Train: 209785 rows
  Val:   44955 rows
  Test:  44955 rows

Split percentages (of full data):
  Train: 70.00%
  Val:   15.00%
  Test:  15.00%

Fraud ratio per split:
  Full:   2.21%
  Train:  2.21%
  Val:    2.21%
  Test:   2.21%
