<h3>Setup</h3>

In [1]:
# Imports
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [12]:
# Load Kaggle dataset
def load_kaggle_csv(dataset, filename):
    """Download a Kaggle dataset with kagglehub and read one CSV file from it.

    Args:
        dataset: Dataset handle passed to ``kagglehub.dataset_download``.
        filename: Name of the CSV file, relative to the path that
            ``kagglehub.dataset_download`` returns.

    Returns:
        The parsed file as a pandas DataFrame.
    """
    dataset_dir = Path(kagglehub.dataset_download(dataset))
    csv_path = dataset_dir / filename
    return pd.read_csv(csv_path)

# Read the fraud transactions CSV, then sort by the `step` (time) column
# and renumber the index from 0 so row position follows that order.
df = (
    load_kaggle_csv("chitwanmanchanda/fraudulent-transactions-data", "Fraud.csv")
    .sort_values(by='step')
    .reset_index(drop=True)
)



<h3>Feature Engineering</h3>

In [13]:
# Flag each party as customer vs. merchant: names starting with "C" are
# treated as customers and stored as an int8 0/1 indicator.
for flag_col, name_col in (
    ("orig_is_customer", "nameOrig"),
    ("dest_is_customer", "nameDest"),
):
    df[flag_col] = df[name_col].str.startswith("C").astype("int8")

In [14]:
# Change in account balance (new - old) for customer accounts only;
# rows whose party is not flagged as a customer get 0.0.
orig_customer_mask = df["orig_is_customer"] == 1
dest_customer_mask = df["dest_is_customer"] == 1

df["orig_delta"] = np.where(
    orig_customer_mask,
    df["newbalanceOrig"] - df["oldbalanceOrg"],
    0.0,
)
df["dest_delta"] = np.where(
    dest_customer_mask,
    df["newbalanceDest"] - df["oldbalanceDest"],
    0.0,
)

In [15]:
# Outflow from the origin account: the size of a balance decrease,
# floored at 0 so balance increases do not count as outflow.
negated_delta = -df["orig_delta"]
df["orig_outflow"] = negated_delta.clip(lower=0)

In [16]:
# One-hot encode the transaction type. drop_first=True omits the indicator
# for the first category, so that category is represented by all zeros.
df = pd.get_dummies(
    data=df,
    columns=['type'],
    drop_first=True,
)

In [9]:
# Remove the raw account-name columns and the isFlaggedFraud column
columns_to_drop = ['nameOrig', 'nameDest', 'isFlaggedFraud']
df = df.drop(columns_to_drop, axis="columns")

<h3>Machine Learning</h3>

In [None]:
# Separate the target column from the feature columns
y = df['isFraud']
X = df.drop(columns='isFraud')

# Ordered (unshuffled) split by row position: the first 80% of rows are
# used for training and the remaining rows for testing.
split_index = int(len(df) * 0.8)
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]

print(f"X_train shape {X_train.shape}")
print(f"X_test shape {X_test.shape}")

In [None]:
# Preview only the first rows of the training features rather than
# rendering the whole frame as cell output.
X_train.head()

In [None]:
# Class counts in the training labels; their ratio up-weights the positive
# class (isFraud == 1) relative to the negative class during training.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
# Guard the ratio below: with no positive rows it would divide by zero.
assert n_positive > 0, "training split contains no positive (isFraud == 1) rows"

model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=n_negative / n_positive,
    eval_metric='aucpr',
    # subsample / colsample_bytree sample rows and columns at random;
    # pinning the seed makes repeated runs produce the same model.
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the threshold-based metrics below
y_pred = model.predict(X_test)
# Positive-class probabilities for the threshold-free ranking metric
y_score = model.predict_proba(X_test)[:, 1]

# Accuracy alone can look high on an imbalanced target, so also report
# PR-AUC (average precision), which matches the model's 'aucpr' eval metric.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("PR-AUC (average precision):", average_precision_score(y_test, y_score))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))