In [14]:
# Data manipulation and numerical operations
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Mutual information and feature selection
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.metrics import mutual_info_score

# Statistical computations (optional for advanced use cases)
from scipy.stats import entropy

# Handling imbalanced data
from imblearn.ensemble import BalancedRandomForestClassifier

# Model explainability
from alibi.explainers.ale import ALE, plot_ale
from sklearn.inspection import partial_dependence, PartialDependenceDisplay

# Permutation Importance
from sklearn.inspection import permutation_importance

# SHAP Values
import shap

# Suppress FutureWarnings
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.




In [15]:
## Read in the Data

folder_path = "./data/"
#train_identity = pd.read_csv(f"{folder_path}train_identity.csv")
#train_transaction = pd.read_csv(f"{folder_path}train_transaction.csv")

In [None]:
## Combine the datasets
#train = pd.merge(train_transaction, train_identity, on="TransactionID", how="left")

## Replace inf and -inf values with NaN in the dataset
#train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [16]:
df_train = pd.read_csv(f"{folder_path}train_selected_features.csv")

In [None]:
X = df_train.drop(columns=['isFraud'])  # Features (all columns except 'isFraud')
y = df_train['isFraud']   

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
model = scaler.fit(X_train)
x_train_scaled = model.transform(X_train)

x_test_scaled = model.transform(X_test)


print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")



Training data shape: (472432, 15)
Testing data shape: (118108, 15)


In [9]:
# Step 5: Train a Balanced Random Forest model
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=42)
brf.fit(x_train_scaled, y_train)

# Step 6: Evaluate the model
y_pred = brf.predict(x_test_scaled)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, brf.predict_proba(X_test)[:, 1]))

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.92      0.95    113866
           1       0.28      0.84      0.42      4242

    accuracy                           0.92    118108
   macro avg       0.63      0.88      0.69    118108
weighted avg       0.97      0.92      0.94    118108

Confusion Matrix:
 [[104554   9312]
 [   684   3558]]




ROC AUC Score: 0.6223372776703963


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [12]:
models = []

models.append(("LogisticRegression",LogisticRegression()))
models.append(("SVC",SVC()))
models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
rf2 = RandomForestClassifier(n_estimators=100, criterion='gini',
                                max_depth=10, random_state=0, max_features=None)
models.append(("RandomForest2",rf2))
models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', random_state=0)))
models.append(("GaussianNB",GaussianNB()))


results = []
names = []
for name,model in models:
    result = cross_val_score(model, x_train_scaled,y_train,  cv=3)
    names.append(name)
    results.append(result)

for i in range(len(names)):
    print(names[i],results[i].mean())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


LogisticRegression 0.9662469940969051
SVC 0.9675805195062906
LinearSVC 0.9656924173155567
KNeighbors 0.9668735394093244
DecisionTree 0.9677731403889194
RandomForest 0.9818090219459886
RandomForest2 0.9746566699106763
MLPClassifier 0.9716805801404096
GaussianNB 0.9567620305811434


In [None]:
# Create a copy of the test set and sample a smaller subset
X_test_copy = X_test.copy()
X_test_sample = X_test_copy.sample(10_000, random_state=42)  # Sample 2000 rows

# Initialize ALE explainer with all features
ale = ALE(lambda x: brf.predict_proba(x)[:, 1], feature_names=X_test.columns)

# Compute ALE explanation on the sampled dataset
ale_explanation = ale.explain(X_test_sample.values)

# Plot ALE for the top 15 features
for i, feature in enumerate(X_test.columns):
    feature_idx = [list(X_test.columns).index(feature)]  # Make feature_idx a list
    plot_ale(ale_explanation, features=feature_idx)  # Use plot_ale for visualization
    plt.title(f"ALE Plot for {feature}")
    plt.xlabel(feature)
    plt.ylabel("Effect on Prediction")
    plt.tight_layout()
    plt.show()