<h3>Setup</h3>

In [12]:
# Imports
import kagglehub
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier

In [17]:
# Load Kaggle dataset
def load_kaggle_csv(dataset, filename):
    path = Path(kagglehub.dataset_download(dataset))
    return pd.read_csv(path / filename)

df = load_kaggle_csv(
    "chitwanmanchanda/fraudulent-transactions-data",
    "Fraud.csv"
)

# Sort by time
df = df.sort_values('step').reset_index(drop=True)



<h3>Feature Engineering</h3>

In [18]:
# Calculate change in balance for both accounts
df['orig_delta'] = df['newbalanceOrig'] - df['oldbalanceOrg']
df['dest_delta'] = df['newbalanceDest'] - df['oldbalanceDest']

# Calculate change in balance for origin
mask_orig_customer = ~df['nameOrig'].str.startswith('M')
df['orig_delta'] = 0  # default 0 for merchants
df.loc[mask_orig_customer, 'orig_delta'] = (
    df.loc[mask_orig_customer, 'newbalanceOrig'] - 
    df.loc[mask_orig_customer, 'oldbalanceOrg']
)

# Calculate change in balance for destination
mask_dest_customer = ~df['nameDest'].str.startswith('M')
df['dest_delta'] = 0  # default 0 for merchants
df.loc[mask_dest_customer, 'dest_delta'] = (
    df.loc[mask_dest_customer, 'newbalanceDest'] - 
    df.loc[mask_dest_customer, 'oldbalanceDest']
)

# Initialize previous fraud flag
df['orig_prev_fraud'] = 0
df['dest_prev_fraud'] = 0

# Origin previous fraud
df['orig_prev_fraud'] = df.groupby('nameOrig')['isFraud'].cumsum().shift(fill_value=0)
df['orig_prev_fraud'] = (df['orig_prev_fraud'] > 0).astype(int)

# Destination previous fraud
df['dest_prev_fraud'] = df.groupby('nameDest')['isFraud'].cumsum().shift(fill_value=0)
df['dest_prev_fraud'] = (df['dest_prev_fraud'] > 0).astype(int)

# Apply one hot encording for transaction type
pd.get_dummies(df, columns=['type'], drop_first=True)

# Drop balance figures and answer column
df = df.drop(columns=[
    'oldbalanceOrg', 'newbalanceOrig',
    'oldbalanceDest', 'newbalanceDest',
    'type','isFlaggedFraud',
    'nameOrig', 'nameDest',
])

In [19]:
df.head()

Unnamed: 0,step,amount,isFraud,orig_delta,dest_delta,orig_prev_fraud,dest_prev_fraud
0,1,9839.64,0,-9839.64,0.0,0,0
1,1,5157.05,0,-1667.92,0.0,0,0
2,1,5746.44,0,0.0,0.0,0,0
3,1,5607.36,0,-5202.0,0.0,0,0
4,1,6360.79,0,-3731.0,0.0,0,0


<h3>Machine Learning</h3>

In [20]:
X = df.drop('isFraud', axis=1)
y = df['isFraud']

split_index = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"X_train shape {X_train.shape}")
print(f"X_test shape {X_test.shape}")

X_train shape (5090096, 6)
X_test shape (1272524, 6)


In [21]:
X_train

Unnamed: 0,step,amount,orig_delta,dest_delta,orig_prev_fraud,dest_prev_fraud
0,1,9839.64,-9839.64,0.00,0,0
1,1,5157.05,-1667.92,0.00,0,0
2,1,5746.44,0.00,0.00,0,0
3,1,5607.36,-5202.00,0.00,0,0
4,1,6360.79,-3731.00,0.00,0,0
...,...,...,...,...,...,...
5090091,355,52303.23,0.00,52303.23,0,0
5090092,355,81480.34,-81480.34,81480.34,0,0
5090093,355,5274.01,-5021.00,0.00,0,0
5090094,355,6651.26,0.00,0.00,0,0


In [22]:
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    eval_metric='aucpr',
    use_label_encoder=False,
    n_jobs=-1
)

model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='aucpr', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=300,
              n_jobs=-1, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [23]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9968110621096341

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00   1268270
           1       0.51      0.89      0.65      4254

    accuracy                           1.00   1272524
   macro avg       0.76      0.94      0.82   1272524
weighted avg       1.00      1.00      1.00   1272524


Confusion Matrix:
 [[1264697    3573]
 [    485    3769]]
