In [1]:
import pandas as pd
import numpy as np
import joblib
import shap
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Loads the dataset from the data folder
data_path = '../data/creditcard.csv'
try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Name the file this cell actually reads, then re-raise: continuing without
    # `df` would only fail one line later with a less helpful NameError.
    print("Error: Make sure 'creditcard.csv' is in the '../data/' directory.")
    raise
else:
    print("Credit card dataset loaded successfully!")

# Preview the first rows of the loaded frame
display(df.head())

Credit card dataset loaded successfully!


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
def preprocess_credit_card_data(df, scaler=None, return_scaler=False):
    """Builds the feature matrix and target vector from a transactions frame.

    Works on a copy of ``df``, so the caller's frame is not modified. The
    'Amount' column is scaled into a new 'scaled_Amount' column, then the raw
    'Time' and 'Amount' columns are dropped.

    Args:
        df: DataFrame with at least the 'Time', 'Amount' and 'Class' columns.
        scaler: Optional, already-fitted scaler exposing ``transform``. When
            given it is applied without refitting, so other data can be scaled
            with previously learned statistics. When None (the default) a new
            StandardScaler is fitted on ``df['Amount']``.
        return_scaler: When True, the scaler that was used is returned as a
            third element so it can be stored and reused. The default (False)
            keeps the two-value return.

    Returns:
        ``(X, y)``, or ``(X, y, scaler)`` when ``return_scaler`` is True.
        ``X`` holds every remaining column except 'Class'; ``y`` is the
        'Class' column.
    """
    df_processed = df.copy()

    # Scalers operate on 2-D input, hence the single-column (n, 1) reshape
    amounts = df_processed['Amount'].values.reshape(-1, 1)

    if scaler is None:
        # Default path: learn the scaling from this frame
        scaler = StandardScaler()
        scaled_amounts = scaler.fit_transform(amounts)
    else:
        # Reuse path: apply the supplied scaler as-is, never refit it
        scaled_amounts = scaler.transform(amounts)

    # Flatten back to one value per row before storing it as a column
    df_processed['scaled_Amount'] = np.asarray(scaled_amounts).ravel()

    # Drops original 'Amount' and 'Time'
    df_processed = df_processed.drop(['Time', 'Amount'], axis=1)

    # Separates features (X) and target (y)
    X = df_processed.drop('Class', axis=1)
    y = df_processed['Class']

    if return_scaler:
        return X, y, scaler
    return X, y

# Turn the loaded frame into features (X) and target (y), then report the result
X, y = preprocess_credit_card_data(df)
print("Data preprocessing complete.")
feature_matrix_shape = X.shape
print("Features shape:", feature_matrix_shape)

Data preprocessing complete.
Features shape: (284807, 29)


In [3]:
# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions the same in the train and test parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Ratio of class-0 to class-1 labels in the training split.
# Note: neither estimator below reads this value; both handle the
# imbalance through class_weight='balanced' instead.
scale_pos_weight = y_train.eq(0).sum() / y_train.eq(1).sum()

# First base model: a 100-tree random forest, intended as the stronger
# member for complex patterns (its voting weight is set where the
# ensemble is built)
clf1 = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=100,
)

# Second base model: logistic regression, covering linear patterns
clf2 = LogisticRegression(
    class_weight='balanced',
    random_state=42,
)

print("Models defined and data is split.")

Models defined and data is split.


In [4]:
# Soft-voting ensemble of the two base models, weighted
# 70% RandomForest / 30% LogisticRegression
base_estimators = [('rf', clf1), ('lr', clf2)]
voting_weights = [0.7, 0.3]
creditcard_ensemble_model = VotingClassifier(
    estimators=base_estimators,
    weights=voting_weights,
    voting='soft',
)

# Fit on the training split, with progress messages around the slow step
print(
    "Training the ensemble model...",
    "This may take a while... Please be patient.",
    sep="\n",
)
creditcard_ensemble_model.fit(X_train, y_train)
print("Training complete.")

# Score the fitted ensemble on the held-out test split
print("Ensemble Model Evaluation:")
y_pred = creditcard_ensemble_model.predict(X_test)
class_labels = ['Not Fraud (0)', 'Fraud (1)']
evaluation_report = classification_report(y_test, y_pred, target_names=class_labels)
print(evaluation_report)

Training the ensemble model...
This may take a while... Please be patient.
Training complete.
Ensemble Model Evaluation:
               precision    recall  f1-score   support

Not Fraud (0)       1.00      1.00      1.00     56864
    Fraud (1)       0.93      0.82      0.87        98

     accuracy                           1.00     56962
    macro avg       0.96      0.91      0.93     56962
 weighted avg       1.00      1.00      1.00     56962



In [5]:
# Saves the trained ensemble model
model_path = '../models/creditcard_ensemble_model.joblib'
# Create the target folder first so the dump does not fail when it is missing
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(creditcard_ensemble_model, model_path)
print(f"Ensemble model saved successfully to: {model_path}")

# For SHAP, the ensemble is explained through its dominant member (RandomForest).
# The forest already fitted inside the ensemble is reused rather than training
# a second, identically configured one: this skips a redundant fit and ties
# the explainer to the exact model object that was saved above.
rf_for_shap = creditcard_ensemble_model.named_estimators_['rf']

# Creates and saves the SHAP TreeExplainer
shap_explainer_ensemble_creditcard = shap.TreeExplainer(rf_for_shap)
shap_explainer_ensemble_creditcard_path = '../models/shap_explainer_ensemble_creditcard.joblib'
Path(shap_explainer_ensemble_creditcard_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(shap_explainer_ensemble_creditcard, shap_explainer_ensemble_creditcard_path)
print(f"SHAP explainer for the ensemble saved successfully to: {shap_explainer_ensemble_creditcard_path}")

Ensemble model saved successfully to: ../models/creditcard_ensemble_model.joblib
SHAP explainer for the ensemble saved successfully to: ../models/shap_explainer_ensemble_creditcard.joblib
