In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, average_precision_score


In [2]:
train_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'training',
        'train_data_smote.csv'
    )
)
train_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,912596.0,912596.0,912596.0,912596.0,912596.0,912596.0,912596.0,912596.0
mean,0.276363,0.12006,0.661775,-0.000996,-0.091016,-0.146868,0.280089,0.5
std,1.562936,1.352223,1.668151,1.001063,0.966615,0.767241,0.862601,0.5
min,-0.411272,-0.173725,-0.642476,-2.725359,-0.73498,-0.334684,-1.364948,0.0
25%,-0.346943,-0.162567,-0.400878,0.366924,-0.73498,-0.334684,0.732629,0.0
50%,-0.230446,-0.137796,0.061592,0.366924,-0.73498,-0.334684,0.732629,0.5
75%,0.168064,-0.044833,1.199416,0.366924,1.360581,-0.334684,0.732629,1.0
max,88.793791,407.976778,93.844571,0.366924,1.360581,2.987897,0.732629,1.0


In [3]:
fraud_count = train_data[train_data['fraud'] == 1].shape[0]
print(f"Total number of transactions: {train_data.shape[0]}")
print(f"Number of fraud cases: {fraud_count}")

Total number of transactions: 912596
Number of fraud cases: 456298


In [4]:
# Prepare data for anomaly detection (unsupervised)
X = train_data.drop(columns=['fraud'])

model = IsolationForest(contamination=0.01, random_state=42)
model.fit(X)
scores = model.decision_function(X)
anomalies = model.predict(X)

print("Anomaly scores:", scores)
print("Anomalies detected:", np.sum(anomalies == -1))

Anomaly scores: [0.23191732 0.30145371 0.30320764 ... 0.13786112 0.28944865 0.27928107]
Anomalies detected: 9126


In [5]:
test_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'testing',
        'card_transdata_part2.csv'
    )
)
test_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,26.502509,5.028611,1.821705,0.88173,0.350066,0.100486,0.650378,0.087402
std,65.783222,22.195372,2.76452,0.322928,0.476991,0.300647,0.476851,0.282424
min,0.004874,0.000298,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.876566,0.296218,0.474426,1.0,0.0,0.0,0.0,0.0
50%,9.956764,1.000745,0.997461,1.0,0.0,0.0,1.0,0.0
75%,25.605358,3.365425,2.095418,1.0,1.0,0.0,1.0,0.0
max,10632.723672,2724.273459,168.137909,1.0,1.0,1.0,1.0,1.0


In [6]:
# 1. Prepare test data
X_test = test_data.drop(columns=['fraud'])

# 2. Use the trained model to predict anomalies in the test set
test_scores = model.decision_function(X_test)
test_anomalies = model.predict(X_test)

# 3. (Optional) Evaluate against true labels
y_test = test_data['fraud']

# Convert Isolation Forest labels to binary: -1 (anomaly) → 1 (fraud), 1 (normal) → 0
y_pred = (test_anomalies == -1).astype(int)

print("Anomaly Scores:", test_scores)
print("Anomalies Detected in Test Set:", np.sum(y_pred))
print(classification_report(y_test, y_pred, digits=4))
print("ROC AUC:", roc_auc_score(y_test, test_scores * -1))
print("PR AUC:", average_precision_score(y_test, test_scores * -1))

Anomaly Scores: [ 0.02834737 -0.03741469 -0.01139275 ... -0.04072483 -0.0488478
  0.17141902]
Anomalies Detected in Test Set: 245558
              precision    recall  f1-score   support

         0.0     0.9609    0.5358    0.6880    456299
         1.0     0.1375    0.7726    0.2334     43701

    accuracy                         0.5565    500000
   macro avg     0.5492    0.6542    0.4607    500000
weighted avg     0.8890    0.5565    0.6483    500000

ROC AUC: 0.7141623155980398
PR AUC: 0.19431640579709764


In [20]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc

In [21]:
validation_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'testing',
        'final_heldout_testing.csv'
    )
)
validation_data.describe()

new_test_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'testing',
        'final_split_testing.csv'
    )
)
new_test_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,0.760834,1.311892,0.50825,-0.118504,0.350292,0.100288,-0.349556,0.0874
std,2.969346,7.293331,1.709334,0.323205,0.477062,0.300384,0.476831,0.282421
min,-0.456953,-0.325963,-0.612626,-1.0,0.0,0.0,-1.0,0.0
25%,-0.279957,-0.229598,-0.323073,0.0,0.0,0.0,-1.0,0.0
50%,-0.000382,-0.000813,-0.000256,0.0,0.0,0.0,0.0,0.0
75%,0.721089,0.766483,0.678434,0.0,1.0,0.0,0.0,0.0
max,403.482162,887.288666,89.584904,0.0,1.0,1.0,0.0,1.0


In [22]:
X_val = validation_data.drop(columns=['fraud'])
y_val = validation_data['fraud']

val_scores = model.decision_function(X_val)

In [23]:
# Flip score sign because IsolationForest: lower = more anomalous
precision, recall, thresholds = precision_recall_curve(y_val, -val_scores)
pr_auc = auc(recall, precision)

# Pick best threshold (e.g., maximize F1, or tune for high recall)
best_idx = (2 * precision * recall / (precision + recall)).argmax()
best_thresh = thresholds[best_idx]

print(f"Best threshold: {best_thresh:.4f}, PR AUC: {pr_auc:.4f}")

Best threshold: -0.2257, PR AUC: 0.1766


In [24]:
X_test = test_data.drop(columns=['fraud'])
y_test = test_data['fraud']

test_scores = model.decision_function(X_test)
y_pred_test = (test_scores < -best_thresh).astype(int)  # flip again

print("Anomaly Scores:", test_scores)
print("Anomalies Detected in Test Set:", np.sum(y_pred_test))
print(classification_report(y_test, y_pred_test, digits=4))

Anomaly Scores: [0.28438209 0.24164638 0.26492327 ... 0.26033694 0.25320323 0.26353297]
Anomalies Detected in Test Set: 211775
              precision    recall  f1-score   support

         0.0     0.9728    0.6145    0.7532    456299
         1.0     0.1693    0.8204    0.2807     43701

    accuracy                         0.6325    500000
   macro avg     0.5710    0.7174    0.5169    500000
weighted avg     0.9025    0.6325    0.7119    500000

