In [28]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import joblib
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer

In [5]:
# load data for modelling
df = pd.read_csv('../data/processed/fraud_detection_features.csv')
df.head()

Unnamed: 0,transaction_id,account_id,customer_id,transaction_datetime,amount,is_fraud,is_weekend,is_night,prev_txn_time,time_since_last_transaction,avg_amount_7d,count_transactions_7d
0,TXN_163851,ACC_03070,CUST_0001,2025-08-01 06:45:20,9641.65,0,0,0,,,9641.65,1
1,TXN_026097,ACC_04410,CUST_0001,2025-08-01 09:10:04,9382.34,0,0,0,2025-08-01 06:45:20,3.0,9382.34,1
2,TXN_060803,ACC_03070,CUST_0001,2025-08-01 19:45:13,4707.01,0,0,0,2025-08-01 09:10:04,10.0,7174.33,2
3,TXN_025799,ACC_04410,CUST_0001,2025-08-02 23:41:55,10676.77,0,1,1,2025-08-01 19:45:13,28.0,10029.555,2
4,TXN_151201,ACC_03070,CUST_0001,2025-08-03 03:29:39,4438.37,0,1,1,2025-08-02 23:41:55,4.0,6262.343333,3


In [6]:
df.columns

Index(['transaction_id', 'account_id', 'customer_id', 'transaction_datetime',
       'amount', 'is_fraud', 'is_weekend', 'is_night', 'prev_txn_time',
       'time_since_last_transaction', 'avg_amount_7d',
       'count_transactions_7d'],
      dtype='object')

In [9]:
df.dtypes

transaction_id                         object
account_id                             object
customer_id                            object
transaction_datetime           datetime64[ns]
amount                                float64
is_fraud                                int64
is_weekend                              int64
is_night                                int64
prev_txn_time                  datetime64[ns]
time_since_last_transaction           float64
avg_amount_7d                         float64
count_transactions_7d                   int64
dtype: object

In [10]:
# drop is_fraud column from anomaly df
df = df.drop(columns=['is_fraud'])

In [11]:
df_orig = pd.read_csv("../data/processed/merged_customer_data.csv")
df_orig.head()

Unnamed: 0.1,Unnamed: 0,transaction_id,account_id,customer_id,customer_segment,transaction_datetime,transaction_channel,amount,merchant_category,transaction_location,is_fraud,account_type
0,0,TXN_000001,ACC_03664,CUST_1688,Personal,2025-08-27 21:46:32,POS,19668.61,Fuel Station,Kaduna,0,Savings
1,1,TXN_000002,ACC_04332,CUST_0006,Personal,2025-09-27 19:29:47,USSD,43698.55,Bill Payment,Lagos,0,Salary
2,2,TXN_000003,ACC_03483,CUST_0812,Personal,2025-08-06 00:56:03,Web,23839.03,Bill Payment,Abuja,0,Savings
3,3,TXN_000004,ACC_04321,CUST_1048,Corporate,2025-09-03 04:21:20,Web,19447.08,Bill Payment,Lagos,0,Current
4,4,TXN_000005,ACC_03388,CUST_1329,SME,2025-09-22 18:34:38,POS,46728.97,POS Payment,Onitsha,0,Savings


In [12]:
df_orig["transaction_datetime"] = pd.to_datetime(df_orig["transaction_datetime"])
df["transaction_datetime"] = pd.to_datetime(df["transaction_datetime"])

In [13]:
# let merge the two dataframes to include all features
# Merge on transaction_id (primary key)
df_final = df_orig.merge(
    df.drop(columns=["amount", "account_id", "customer_id", "transaction_datetime"]),
    on="transaction_id",
    how="left"
)

In [14]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,transaction_id,account_id,customer_id,customer_segment,transaction_datetime,transaction_channel,amount,merchant_category,transaction_location,is_fraud,account_type,is_weekend,is_night,prev_txn_time,time_since_last_transaction,avg_amount_7d,count_transactions_7d
0,0,TXN_000001,ACC_03664,CUST_1688,Personal,2025-08-27 21:46:32,POS,19668.61,Fuel Station,Kaduna,0,Savings,0,1,2025-08-27 13:43:57,8.0,8286.9325,4
1,1,TXN_000002,ACC_04332,CUST_0006,Personal,2025-09-27 19:29:47,USSD,43698.55,Bill Payment,Lagos,0,Salary,1,0,2025-09-27 16:42:49,3.0,21890.22,9
2,2,TXN_000003,ACC_03483,CUST_0812,Personal,2025-08-06 00:56:03,Web,23839.03,Bill Payment,Abuja,0,Savings,0,1,2025-08-05 15:46:30,9.0,24005.475,4
3,3,TXN_000004,ACC_04321,CUST_1048,Corporate,2025-09-03 04:21:20,Web,19447.08,Bill Payment,Lagos,0,Current,0,1,2025-08-31 09:25:35,67.0,13785.516667,3
4,4,TXN_000005,ACC_03388,CUST_1329,SME,2025-09-22 18:34:38,POS,46728.97,POS Payment,Onitsha,0,Savings,0,0,2025-09-22 12:21:15,6.0,18297.19375,8


In [15]:
# drop the Unnamed: 0 column if it exists
if 'Unnamed: 0' in df_final.columns:
    df_final = df_final.drop(columns=['Unnamed: 0'])

In [16]:
df_final.head()

Unnamed: 0,transaction_id,account_id,customer_id,customer_segment,transaction_datetime,transaction_channel,amount,merchant_category,transaction_location,is_fraud,account_type,is_weekend,is_night,prev_txn_time,time_since_last_transaction,avg_amount_7d,count_transactions_7d
0,TXN_000001,ACC_03664,CUST_1688,Personal,2025-08-27 21:46:32,POS,19668.61,Fuel Station,Kaduna,0,Savings,0,1,2025-08-27 13:43:57,8.0,8286.9325,4
1,TXN_000002,ACC_04332,CUST_0006,Personal,2025-09-27 19:29:47,USSD,43698.55,Bill Payment,Lagos,0,Salary,1,0,2025-09-27 16:42:49,3.0,21890.22,9
2,TXN_000003,ACC_03483,CUST_0812,Personal,2025-08-06 00:56:03,Web,23839.03,Bill Payment,Abuja,0,Savings,0,1,2025-08-05 15:46:30,9.0,24005.475,4
3,TXN_000004,ACC_04321,CUST_1048,Corporate,2025-09-03 04:21:20,Web,19447.08,Bill Payment,Lagos,0,Current,0,1,2025-08-31 09:25:35,67.0,13785.516667,3
4,TXN_000005,ACC_03388,CUST_1329,SME,2025-09-22 18:34:38,POS,46728.97,POS Payment,Onitsha,0,Savings,0,0,2025-09-22 12:21:15,6.0,18297.19375,8


In [17]:
df_final.columns

Index(['transaction_id', 'account_id', 'customer_id', 'customer_segment',
       'transaction_datetime', 'transaction_channel', 'amount',
       'merchant_category', 'transaction_location', 'is_fraud', 'account_type',
       'is_weekend', 'is_night', 'prev_txn_time',
       'time_since_last_transaction', 'avg_amount_7d',
       'count_transactions_7d'],
      dtype='object')

In [18]:
# parse datetime
df_final["transaction_datetime"] = pd.to_datetime(df["transaction_datetime"])
df_final["hour"] = df["transaction_datetime"].dt.hour
df_final["weekday"] = df["transaction_datetime"].dt.weekday

In [24]:
df_final.time_since_last_transaction.dtypes

dtype('float64')

In [20]:
df_final.columns

Index(['transaction_id', 'account_id', 'customer_id', 'customer_segment',
       'transaction_datetime', 'transaction_channel', 'amount',
       'merchant_category', 'transaction_location', 'is_fraud', 'account_type',
       'is_weekend', 'is_night', 'prev_txn_time',
       'time_since_last_transaction', 'avg_amount_7d', 'count_transactions_7d',
       'hour', 'weekday'],
      dtype='object')

In [30]:
df_final.isnull().sum()

transaction_id                    0
account_id                        0
customer_id                       0
customer_segment                  0
transaction_datetime              0
transaction_channel               0
amount                            0
merchant_category                 0
transaction_location              0
is_fraud                          0
account_type                      0
is_weekend                        0
is_night                          0
prev_txn_time                  1795
time_since_last_transaction    1795
avg_amount_7d                     0
count_transactions_7d             0
hour                              0
weekday                           0
dtype: int64

In [25]:
# Data preprocessing for modeling using pipelines object and column transformer
categorical_features = ['customer_segment', 'transaction_channel', 'merchant_category', 'transaction_location', 'account_type']
numerical_features = ['amount', 'is_weekend', 'is_night', 'time_since_last_transaction', 'avg_amount_7d', 'count_transactions_7d', 'hour', 'weekday']

In [42]:
# preprocessing pipeline
numerical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(np.log1p, validate=False)),
    ("scale", StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_pipeline, categorical_features),
        ("numerical", numerical_pipeline, numerical_features)
    ]
)


In [43]:


X = df_final[categorical_features + numerical_features]
y = df_final["is_fraud"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [44]:
# Model training with RandomForestClassifier and handling class imbalance
pos_weight = (y_train == 0).sum() / (y_train == 1).sum()  # For handling class imbalance



In [46]:
from xgboost import XGBClassifier

xgb_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.07,
        subsample=0.9,
        colsample_bytree=0.9,
        scale_pos_weight=pos_weight,
        eval_metric="logloss"
    ))
])




In [47]:
xgb_pipeline.fit(X_train, y_train)

In [49]:
rf_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    ))
])


In [51]:
rf_pipeline.fit(X_train, y_train)

In [50]:
def get_feature_names(preprocessor):
    output_features = []

    # categorical OneHotEncoder features
    cat_pipe = preprocessor.named_transformers_["categorical"]
    ohe = cat_pipe.named_steps["onehot"]
    cat_names = ohe.get_feature_names_out(categorical_features)
    output_features.extend(cat_names)

    # numerical features
    output_features.extend(numerical_features)

    return output_features


In [53]:
# getting the feature importance

xgb_model = xgb_pipeline.named_steps["model"]
feature_names = get_feature_names(xgb_pipeline.named_steps["preprocess"])

xgb_importance = pd.DataFrame({
    "feature": feature_names,
    "importance": xgb_model.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\nXGBoost Feature Importance:")
print(xgb_importance.head(5))



XGBoost Feature Importance:
                    feature  importance
42                   amount    0.924381
46            avg_amount_7d    0.030431
44                 is_night    0.010802
5   transaction_channel_POS    0.005677
3   transaction_channel_ATM    0.004336


In [54]:
rf_model = rf_pipeline.named_steps["model"]
rf_importance = pd.DataFrame({
    "feature": feature_names,
    "importance": rf_model.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\nRandomForest Feature Importance:")
print(rf_importance.head(5))



RandomForest Feature Importance:
                               feature  importance
42                              amount    0.360502
46                       avg_amount_7d    0.271290
44                            is_night    0.098277
35       transaction_location_Tor Node    0.035687
14  merchant_category_International Tr    0.030475


In [66]:
# Building the Isolation forest model for the unsupervise task
#These are behavioral + temporal features → excellent for anomaly detection.
from sklearn.ensemble import IsolationForest

iso_features = [
    'amount',
    'is_weekend',
    'is_night',
    'time_since_last_transaction',
    'avg_amount_7d',
    'count_transactions_7d',
    'hour',
    'weekday'
]


In [68]:
X_iso = df_final[iso_features]

In [69]:
# Preprocessing (impute → log → scale)
# ---------------------------------------------------------
iso_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("log", FunctionTransformer(np.log1p, validate=False)),
    ("scale", StandardScaler())
])

X_processed = iso_pipeline.fit_transform(X_iso)

In [None]:
# Train Isolation Forest
# ---------------------------------------------------------
iso_model = IsolationForest(
    n_estimators=300,
    contamination=0.08,      # assume 2% anomalies
    max_samples='auto',
    random_state=42,
    n_jobs=-1
)

In [71]:
iso_model.fit(X_processed)

In [73]:
# Predict anomalies
# ---------------------------------------------------------
df_final["iso_score"] = iso_model.decision_function(X_processed)  # anomaly score
df_final["iso_pred"] = iso_model.predict(X_processed)             # -1 = anomaly, 1 = normal

# Convert to fraud format: 1 = anomaly, 0 = normal
df_final["iso_pred"] = df_final["iso_pred"].replace({1:0, -1:1})

# ---------------------------------------------------------
# Evaluate against real fraud labels (if available)
# ---------------------------------------------------------
print(confusion_matrix(df_final["is_fraud"], df_final["iso_pred"]))
print(classification_report(df_final["is_fraud"], df_final["iso_pred"]))

[[176253   2247]
 [   147   1353]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    178500
           1       0.38      0.90      0.53      1500

    accuracy                           0.99    180000
   macro avg       0.69      0.94      0.76    180000
weighted avg       0.99      0.99      0.99    180000



In [74]:
# Model Evaluation Using the Hold Set- for the both models
from sklearn.metrics import (
    classification_report, confusion_matrix,
    roc_auc_score, precision_recall_curve, auc
)

# ---------------------------------------
# XGBoost Predictions
# ---------------------------------------
xgb_y_pred = xgb_pipeline.predict(X_test)
xgb_y_prob = xgb_pipeline.predict_proba(X_test)[:, 1]

# Confusion matrix
print("XGBoost Confusion Matrix:")
print(confusion_matrix(y_test, xgb_y_pred))

# Full classification report
print("\nXGBoost Classification Report:")
print(classification_report(y_test, xgb_y_pred))

# ROC-AUC
xgb_roc = roc_auc_score(y_test, xgb_y_prob)
print("XGBoost ROC-AUC:", xgb_roc)

# PR-AUC
precision, recall, _ = precision_recall_curve(y_test, xgb_y_prob)
xgb_pr_auc = auc(recall, precision)
print("XGBoost PR-AUC:", xgb_pr_auc)


XGBoost Confusion Matrix:
[[35699     1]
 [    0   300]]

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     35700
           1       1.00      1.00      1.00       300

    accuracy                           1.00     36000
   macro avg       1.00      1.00      1.00     36000
weighted avg       1.00      1.00      1.00     36000

XGBoost ROC-AUC: 1.0
XGBoost PR-AUC: 0.9999999999999999


Meaning of the above metric
* Normal transactions correctly predicted: 35,699

* Fraud transactions correctly predicted: 300

* False positives: 1

* False negatives: 0

This means that the model caught every single fraud case
*It almost never raised false alarms


| Metric                | Score | Interpretation                               |
| --------------------- | ----- | -------------------------------------------- |
| **Precision (fraud)** | 1.00  | Every fraud predicted was actually fraud     |
| **Recall (fraud)**    | 1.00  | Model never missed a fraud                   |
| **F1-score (fraud)**  | 1.00  | Perfect balance                              |
| **ROC-AUC**           | 1.00  | Model perfectly separates fraud vs non-fraud |
| **PR-AUC**            | 1.00  | Outstanding fraud ranking capability         |


This result could mean:

* The patterns in your training data are very clean and separable, meaning fraud cases are highly distinct.

XGBoost captured the fraud patterns almost perfectly.

Model might be overfitting, but given fraud datasets often have strong behavioral signals, very high scores can also be legitimate.

XGBoost performs near-perfectly, making it an extremely strong supervised fraud classifier.


In [75]:
# for the isolation forest

# Extract Isolation Forest features
iso_test = X_test[iso_features]

# Preprocess test data
iso_test_processed = iso_pipeline.transform(iso_test)

# Predict anomalies (-1 = anomaly, 1 = normal)
iso_raw_pred = iso_model.predict(iso_test_processed)

# Convert to fraud format (1 = fraud/anomaly, 0 = normal)
iso_y_pred = np.where(iso_raw_pred == -1, 1, 0)

# Anomaly scores (lower = more suspicious)
iso_scores = iso_model.decision_function(iso_test_processed)
iso_prob = -iso_scores  # invert so higher = more fraud-like


In [76]:
print("Isolation Forest Confusion Matrix:")
print(confusion_matrix(y_test, iso_y_pred))

print("\nIsolation Forest Classification Report:")
print(classification_report(y_test, iso_y_pred))

# ROC-AUC for anomaly scores
iso_roc = roc_auc_score(y_test, iso_prob)
print("Isolation Forest ROC-AUC:", iso_roc)

# PR-AUC
precision_iso, recall_iso, _ = precision_recall_curve(y_test, iso_prob)
iso_pr_auc = auc(recall_iso, precision_iso)
print("Isolation Forest PR-AUC:", iso_pr_auc)


Isolation Forest Confusion Matrix:
[[35240   460]
 [   25   275]]

Isolation Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     35700
           1       0.37      0.92      0.53       300

    accuracy                           0.99     36000
   macro avg       0.69      0.95      0.76     36000
weighted avg       0.99      0.99      0.99     36000

Isolation Forest ROC-AUC: 0.9941987861811392
Isolation Forest PR-AUC: 0.6993969305315425


The above performance metric means that


True negatives (normal): 35,240

True positives (fraud detected): 275

False negatives (missed fraud): 25

False positives: 460


| Metric                | Score | Interpretation                           |
| --------------------- | ----- | ---------------------------------------- |
| **Precision (fraud)** | 0.37  | Many flagged transactions were not fraud |
| **Recall (fraud)**    | 0.92  | Strong ability to catch fraud (92%)      |
| **F1 Fraud**          | 0.53  | Moderate                                 |
| **ROC-AUC**           | 0.994 | Excellent separation ability             |
| **PR-AUC**            | 0.61  | Moderate anomaly ranking                 |


Isolation Forest caught 92% of fraud → very good for an unsupervised model.

But precision is low (0.37) → it raises many false alarms.

This is expected:
Unsupervised models try to detect unusual patterns, not true fraud.

The ROC-AUC of 0.994 shows it assigns anomaly scores well, even though classification threshold-based metrics are weaker. we can look at increasing the  threshold

In [77]:
print("\n===== MODEL COMPARISON =====")
print(f"XGBoost ROC-AUC:        {xgb_roc:.4f}")
print(f"Isolation Forest ROC-AUC: {iso_roc:.4f}")

print(f"XGBoost PR-AUC:         {xgb_pr_auc:.4f}")
print(f"Isolation Forest PR-AUC: {iso_pr_auc:.4f}")



===== MODEL COMPARISON =====
XGBoost ROC-AUC:        1.0000
Isolation Forest ROC-AUC: 0.9942
XGBoost PR-AUC:         1.0000
Isolation Forest PR-AUC: 0.6994


| Metric          | XGBoost (Supervised) | Isolation Forest (Unsupervised) |
| --------------- | -------------------- | ------------------------------- |
| Fraud Recall    | **1.00**             | 0.92                            |
| Fraud Precision | **1.00**             | 0.37                            |
| Fraud F1        | **1.00**             | 0.53                            |
| ROC-AUC         | **1.00**             | 0.99                            |
| PR-AUC          | **1.00**             | 0.61                            |
| False Positives | **1**                | 460                             |
| False Negatives | **0**                | 25                              |


XGBoost is superior because it is trained with labeled fraud data, it achieves near-perfect performance.

Isolation Forest is useful but it performs well for anomaly detection

BUT its precision is low → many false alarms

Recommended for detecting new, unseen fraud patterns, not for main fraud decision-making

#### Recommended Strategy (Industry Best Practice)
→ Use XGBoost for primary fraud detection

High recall + high precision = reliable alerts.

→ Use Isolation Forest as an additional anomaly signal

Add iso_score as a feature input to XGBoost OR as a secondary fraud risk signal.

This hybrid approach Catches more fraud patterns not seen before and reduce false negatives to strengthen the fraud defense system

In [79]:
# saving my models
import pickle

# Assume objects exist: xgb_pipeline, iso_model, iso_pipeline, feature_names, iso_features

# Save supervised model pipeline
joblib.dump(xgb_pipeline, "../models/xgb_pipeline.pkl")

# Save unsupervised model
joblib.dump(iso_model, "../models/isolation_forest_model.pkl")

# Save the isolation forest preprocessing pipeline
joblib.dump(iso_pipeline, "../models/isolation_forest_preprocess.pkl")

# Save feature names for supervised model
with open("../feature_store/xgb_feature_names.pkl", "wb") as f:
    pickle.dump(feature_names, f)

# Save ISO features list
with open("../feature_store/iso_features.pkl", "wb") as f:
    pickle.dump(iso_features, f)

"/mnt/data/model_files_saved"


'/mnt/data/model_files_saved'

In [64]:
import shap
import numpy as np

# Transform test data using preprocessing pipeline
preprocessed_X = xgb_pipeline.named_steps["preprocess"].transform(X_test)

# Use a small background sample for kernel explainer
background = preprocessed_X[:200]

explainer = shap.KernelExplainer(
    model.predict_proba, 
    background
)

# Compute SHAP values on a subset
shap_values = explainer.shap_values(preprocessed_X[:200])

# Plot summary
shap.summary_plot(
    shap_values,
    preprocessed_X[:200],
    feature_names=feature_names,
    max_display=20
)


Using 200 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
  4%|▎         | 7/200 [01:58<54:20, 16.89s/it]


KeyboardInterrupt: 