In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, average_precision_score


In [3]:
train_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'training',
        'final_anomaly_training.csv'
    )
)
train_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,26.755075,5.044428,1.826659,0.881342,0.350732,0.10073,0.650726,0.087404
std,64.995798,29.036143,2.834224,0.323386,0.4772,0.300971,0.476741,0.282426
min,0.024144,0.000118,0.005741,0.0,0.0,0.0,0.0,0.0
25%,3.879739,0.297038,0.476991,1.0,0.0,0.0,0.0,0.0
50%,9.979266,0.996609,0.997881,1.0,0.0,0.0,1.0,0.0
75%,25.89253,3.347069,2.097257,1.0,1.0,0.0,1.0,0.0
max,5797.972589,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [4]:
fraud_count = train_data[train_data['fraud'] == 1].shape[0]
print(f"Total number of transactions: {train_data.shape[0]}")
print(f"Number of fraud cases: {fraud_count}")

Total number of transactions: 500000
Number of fraud cases: 43702


In [5]:
# Prepare data for anomaly detection (unsupervised)
X = train_data.drop(columns=['fraud'])

model = IsolationForest(contamination=0.01, random_state=42)
model.fit(X)
scores = model.decision_function(X)
anomalies = model.predict(X)

print("Anomaly scores:", scores)
print("Anomalies detected:", np.sum(anomalies == -1))

Anomaly scores: [ 0.27383664  0.29810147  0.29025661 ...  0.29881162  0.19377354
 -0.00958613]
Anomalies detected: 5000


In [6]:
test_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'testing',
        'card_transdata_part2.csv'
    )
)
test_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,26.502509,5.028611,1.821705,0.88173,0.350066,0.100486,0.650378,0.087402
std,65.783222,22.195372,2.76452,0.322928,0.476991,0.300647,0.476851,0.282424
min,0.004874,0.000298,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.876566,0.296218,0.474426,1.0,0.0,0.0,0.0,0.0
50%,9.956764,1.000745,0.997461,1.0,0.0,0.0,1.0,0.0
75%,25.605358,3.365425,2.095418,1.0,1.0,0.0,1.0,0.0
max,10632.723672,2724.273459,168.137909,1.0,1.0,1.0,1.0,1.0


In [7]:
# 1. Prepare test data
X_test = test_data.drop(columns=['fraud'])

# 2. Use the trained model to predict anomalies in the test set
test_scores = model.decision_function(X_test)
test_anomalies = model.predict(X_test)

# 3. (Optional) Evaluate against true labels
y_test = test_data['fraud']

# Convert Isolation Forest labels to binary: -1 (anomaly) → 1 (fraud), 1 (normal) → 0
y_pred = (test_anomalies == -1).astype(int)

print("Anomaly Scores:", test_scores)
print("Anomalies Detected in Test Set:", np.sum(y_pred))
print(classification_report(y_test, y_pred, digits=4))
print("ROC AUC:", roc_auc_score(y_test, test_scores * -1))
print("PR AUC:", average_precision_score(y_test, test_scores * -1))

Anomaly Scores: [0.28438209 0.24164638 0.26492327 ... 0.26033694 0.25320323 0.26353297]
Anomalies Detected in Test Set: 4921
              precision    recall  f1-score   support

         0.0     0.9141    0.9918    0.9514    456299
         1.0     0.2394    0.0270    0.0485     43701

    accuracy                         0.9075    500000
   macro avg     0.5767    0.5094    0.4999    500000
weighted avg     0.8551    0.9075    0.8725    500000

ROC AUC: 0.7532556429150293
PR AUC: 0.17904329282479078


In [8]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve, auc

In [9]:
validation_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'testing',
        'final_heldout_testing.csv'
    )
)
validation_data.describe()

new_test_data = pd.read_csv(
    os.path.join(
        os.path.dirname(os.getcwd()),
        'data',
        'testing',
        'final_split_testing.csv'
    )
)
new_test_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,26.48876,5.027214,1.82133,0.881496,0.350292,0.100288,0.650444,0.0874
std,64.520306,22.384742,2.770818,0.323205,0.477062,0.300384,0.476831,0.282421
min,0.027726,0.000298,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.873639,0.296062,0.473762,1.0,0.0,0.0,0.0,0.0
50%,9.948467,0.998249,0.997046,1.0,0.0,0.0,1.0,0.0
75%,25.625167,3.35324,2.097197,1.0,1.0,0.0,1.0,0.0
max,8777.13642,2724.273459,146.213898,1.0,1.0,1.0,1.0,1.0


In [10]:
X_val = validation_data.drop(columns=['fraud'])
y_val = validation_data['fraud']

val_scores = model.decision_function(X_val)

In [11]:
# Flip score sign because IsolationForest: lower = more anomalous
precision, recall, thresholds = precision_recall_curve(y_val, -val_scores)
pr_auc = auc(recall, precision)

# Pick best threshold (e.g., maximize F1, or tune for high recall)
best_idx = (2 * precision * recall / (precision + recall)).argmax()
best_thresh = thresholds[best_idx]

print(f"Best threshold: {best_thresh:.4f}, PR AUC: {pr_auc:.4f}")

Best threshold: -0.2258, PR AUC: 0.1765


In [12]:
X_test = test_data.drop(columns=['fraud'])
y_test = test_data['fraud']

test_scores = model.decision_function(X_test)
y_pred_test = (test_scores < -best_thresh).astype(int)  # flip again

print("Anomaly Scores:", test_scores)
print("Anomalies Detected in Test Set:", np.sum(y_pred_test))
print(classification_report(y_test, y_pred_test, digits=4))

Anomaly Scores: [0.28438209 0.24164638 0.26492327 ... 0.26033694 0.25320323 0.26353297]
Anomalies Detected in Test Set: 211807
              precision    recall  f1-score   support

         0.0     0.9726    0.6143    0.7530    456299
         1.0     0.1691    0.8194    0.2803     43701

    accuracy                         0.6322    500000
   macro avg     0.5708    0.7168    0.5166    500000
weighted avg     0.9024    0.6322    0.7117    500000



In [13]:
import joblib

dirpath = os.path.join(os.path.dirname(os.getcwd()), "results")
joblib.dump(model, os.path.join(dirpath, "isolated_forest_model.joblib"))

['c:\\Users\\abudi\\OneDrive\\Documents\\Uni Work\\Capstone Project\\fraud-detection-capstone\\results\\isolated_forest_model.joblib']