<h3>Setup</h3>

In [1]:
# Imports
import pandas as pd
import numpy as np

import kagglehub
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

In [2]:
# Load Kaggle dataset
def load_kaggle_csv(dataset, filename):
    path = Path(kagglehub.dataset_download(dataset))
    return pd.read_csv(path / filename)

df = load_kaggle_csv(
    "chitwanmanchanda/fraudulent-transactions-data",
    "Fraud.csv"
)

# Sort by time
df = df.sort_values('step').reset_index(drop=True)



<h3>Feature Engineering</h3>

In [3]:
# Determine if merchant or customer
df["orig_is_customer"] = df["nameOrig"].str.startswith("C").astype("int8")
df["dest_is_customer"] = df["nameDest"].str.startswith("C").astype("int8")

In [4]:
# Calculate change in account value (customers only)
df["orig_delta"] = 0.0
df["dest_delta"] = 0.0

df.loc[df["orig_is_customer"] == 1, "orig_delta"] = (
    df.loc[df["orig_is_customer"] == 1, "newbalanceOrig"] -
    df.loc[df["orig_is_customer"] == 1, "oldbalanceOrg"]
)

df.loc[df["dest_is_customer"] == 1, "dest_delta"] = (
    df.loc[df["dest_is_customer"] == 1, "newbalanceDest"] -
    df.loc[df["dest_is_customer"] == 1, "oldbalanceDest"]
)

In [5]:
# Get account deltas
df["net_delta"] = df["dest_delta"] - df["orig_delta"]
df["orig_abs_delta"] = df["orig_delta"].abs()
df["dest_abs_delta"] = df["dest_delta"].abs()

In [6]:
# Calculate outflow
df["orig_outflow"] = (-df["orig_delta"]).clip(lower=0)

In [7]:
# Calculate ratio (add small num to avoid div by 0)
df["orig_amount_ratio"] = df["amount"] / (df["oldbalanceOrg"] + 1e-9)
df["dest_amount_ratio"] = df["amount"] / (df["oldbalanceDest"] + 1e-9)

In [8]:
# Flag for transactions where origin is a customer and outflow > 90% of balance
df["large_outflow_flag"] = (df["orig_amount_ratio"] > 0.9).astype("int8")

In [9]:
# Customer to customer and customer to merchant flags
df["cust_to_cust"] = ((df["orig_is_customer"]==1) & (df["dest_is_customer"]==1)).astype("int8")
df["cust_to_merchant"] = ((df["orig_is_customer"]==1) & (df["dest_is_customer"]==0)).astype("int8")

In [10]:
# Apply one hot encording for transaction type
df = pd.get_dummies(df, columns=['type'], drop_first=True)

In [13]:
# Drop name and flag columns
df = df.drop(columns=[
    'nameOrig', 'nameDest',
    'isFlaggedFraud'
])

<h3>Machine Learning</h3>

In [15]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

split_index = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"X_train shape {X_train.shape}")
print(f"X_test shape {X_test.shape}")

X_train shape (5090096, 23)
X_test shape (1272524, 23)


In [16]:
X_train

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,orig_is_customer,dest_is_customer,orig_delta,dest_delta,...,orig_outflow,orig_amount_ratio,dest_amount_ratio,large_outflow_flag,cust_to_cust,cust_to_merchant,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,170136.00,160296.36,0.00,0.00,1,0,-9839.64,0.00,...,9839.64,5.783397e-02,9.839640e+12,0,0,1,0,0,1,0
1,1,5157.05,1667.92,0.00,0.00,0.00,1,0,-1667.92,0.00,...,1667.92,3.091905e+00,5.157050e+12,1,0,1,0,0,1,0
2,1,5746.44,0.00,0.00,0.00,0.00,1,0,0.00,0.00,...,-0.00,5.746440e+12,5.746440e+12,1,0,1,0,0,1,0
3,1,5607.36,5202.00,0.00,0.00,0.00,1,0,-5202.00,0.00,...,5202.00,1.077924e+00,5.607360e+12,1,0,1,0,0,1,0
4,1,6360.79,3731.00,0.00,0.00,0.00,1,0,-3731.00,0.00,...,3731.00,1.704849e+00,6.360790e+12,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5090091,355,52303.23,0.00,0.00,4120244.16,4172547.39,1,1,0.00,52303.23,...,-0.00,5.230323e+13,1.269421e-02,1,1,0,1,0,0,0
5090092,355,81480.34,133209.87,51729.53,246169.78,327650.12,1,1,-81480.34,81480.34,...,81480.34,6.116689e-01,3.309925e-01,0,1,0,1,0,0,0
5090093,355,5274.01,5021.00,0.00,0.00,0.00,1,0,-5021.00,0.00,...,5021.00,1.050390e+00,5.274010e+12,1,0,1,0,0,1,0
5090094,355,6651.26,0.00,0.00,0.00,0.00,1,0,0.00,0.00,...,-0.00,6.651260e+12,6.651260e+12,1,0,1,0,0,1,0


In [17]:
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric='aucpr',
    use_label_encoder=False,
    n_jobs=-1
)

model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='aucpr', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=300,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [18]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9999992141602044

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1268270
           1       1.00      1.00      1.00      4254

    accuracy                           1.00   1272524
   macro avg       1.00      1.00      1.00   1272524
weighted avg       1.00      1.00      1.00   1272524


Confusion Matrix:
 [[1268270       0]
 [      1    4253]]
