In [1]:
import pandas as pd
import numpy as np
import joblib
import shap
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Loads the Dataset
try:
    df = pd.read_csv('../data/transactions.csv')
except FileNotFoundError:
    # Print the hint, then re-raise: swallowing the error here would leave `df`
    # undefined and the cleaning step below would die with an unrelated NameError.
    print("Error: Make sure 'transactions.csv' is in the '../data/' directory.")
    raise
else:
    print("Transactions dataset loaded successfully!")
    print(f"Original shape: {df.shape}")

# Data Cleaning
# Drops the columns listed below and removes duplicate rows, as done in the XGBoost notebook.
# NOTE: duplicates are detected on the columns that remain after the drop, so rows that
# differed only in the dropped columns are collapsed into one.
df_cleaned = df.drop(columns=['step', 'nameOrig', 'nameDest', 'isFlaggedFraud'])
df_cleaned = df_cleaned.drop_duplicates()
print(f"Shape after cleaning: {df_cleaned.shape}")

Transactions dataset loaded successfully!
Original shape: (6362620, 11)
Shape after cleaning: (6264740, 7)


In [2]:
def preprocess_transactions_data(df):
    """Build model-ready features from a PaySim-style transactions frame.

    The input is copied, so ``df`` itself is left untouched. When a 'type'
    column is present it is replaced by one-hot indicator columns (prefix
    ``type_``, first category dropped). Two balance-discrepancy columns,
    'errorBalanceOrig' and 'errorBalanceDest', are then appended.

    Returns:
        ``(X, y)`` when the frame has an 'isFraud' column, where ``y`` is that
        column and ``X`` is every other column; otherwise the processed frame
        on its own.
    """
    frame = df.copy()

    # Swap the categorical 'type' column for its indicator columns.
    if 'type' in frame.columns:
        indicators = pd.get_dummies(frame['type'], prefix='type', drop_first=True)
        frame = pd.concat([frame.drop(columns='type'), indicators], axis=1)

    # Balance discrepancies: how far the recorded balances are from what the
    # transaction amount alone would imply on each side.
    origin_gap = frame['oldbalanceOrg'] - frame['amount'] - frame['newbalanceOrig']
    destination_gap = frame['oldbalanceDest'] + frame['amount'] - frame['newbalanceDest']
    frame['errorBalanceOrig'] = origin_gap
    frame['errorBalanceDest'] = destination_gap

    # Without a target column there is nothing to split off.
    if 'isFraud' not in frame.columns:
        return frame

    target = frame['isFraud']
    features = frame.drop(columns='isFraud')
    return features, target

# Applies preprocessing to the cleaned frame.
# The two-name unpacking relies on df_cleaned still having an 'isFraud' column;
# without it the function returns a single DataFrame and this line would not
# unpack as intended.
X, y = preprocess_transactions_data(df_cleaned)
print("\nData preprocessing complete.")
print("Features shape:", X.shape)


Data preprocessing complete.
Features shape: (6264740, 11)


In [3]:
# Holds out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Defines the two base models for our ensemble
print("\nDefining base models for the ensemble...")
clf1 = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,  # Use all available CPU cores
)
clf2 = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

print("Models defined and data is split.")


Defining base models for the ensemble...
Models defined and data is split.


In [4]:
# Soft-voting ensemble over the two base models; the weights follow the order of
# the estimator list, i.e. 0.7 for the random forest and 0.3 for the logistic regression.
base_estimators = [('rf', clf1), ('lr', clf2)]
ensemble_transactions_model = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
    weights=[0.7, 0.3],
)

# Fit on the training partition
print("\nTraining the ensemble model... (This will take several minutes on the large dataset)")
ensemble_transactions_model.fit(X_train, y_train)
print("Training complete.")

# Score the held-out partition and print the per-class report
print("Ensemble Model Evaluation on Transactions Data:")
y_pred = ensemble_transactions_model.predict(X_test)
report_labels = ['Not Fraud (0)', 'Fraud (1)']
print(classification_report(y_test, y_pred, target_names=report_labels))


Training the ensemble model... (This will take several minutes on the large dataset)
Training complete.
Ensemble Model Evaluation on Transactions Data:
               precision    recall  f1-score   support

Not Fraud (0)       1.00      1.00      1.00   1251318
    Fraud (1)       1.00      0.99      1.00      1630

     accuracy                           1.00   1252948
    macro avg       1.00      1.00      1.00   1252948
 weighted avg       1.00      1.00      1.00   1252948



In [5]:
# Models directory
model_dir = Path('../models')
# joblib.dump does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
model_dir.mkdir(parents=True, exist_ok=True)

# Saves the trained ensemble model
model_path = model_dir / 'ensemble_transactions_model.joblib'
joblib.dump(ensemble_transactions_model, model_path)
print(f"\nEnsemble model saved successfully to: {model_path}")

# For SHAP, the ensemble is explained through its dominant member (RandomForest).
# Reuse the forest that was fitted inside the VotingClassifier instead of training
# a second one: the explainer then describes exactly the model that was saved, and
# the expensive refit on the full training set is avoided.
# (`clf1` itself is not used here because VotingClassifier fits clones of the
# estimators it is given; the fitted copies live in `named_estimators_`.)
rf_for_shap = ensemble_transactions_model.named_estimators_['rf']

# Creates and saves the SHAP TreeExplainer
explainer = shap.TreeExplainer(rf_for_shap)
explainer_path = model_dir / 'shap_explainer_ensemble_transactions.joblib'
joblib.dump(explainer, explainer_path)
print(f"SHAP explainer for the ensemble saved successfully to: {explainer_path}")


Ensemble model saved successfully to: ..\models\ensemble_transactions_model.joblib
SHAP explainer for the ensemble saved successfully to: ..\models\shap_explainer_ensemble_transactions.joblib
