In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from xgboost import XGBClassifier

# Set visual style: apply seaborn's "whitegrid" theme once, up front, so the
# figures drawn in later cells share the same look.
sns.set_theme(style="whitegrid")

In [None]:
# Name of the label column. NOTE(review): the notebook assumes the processed
# file stores the target under 'class' -- confirm against the data pipeline.
TARGET_COLUMN = 'class'

# Read the feature-engineered dataset from disk.
df = pd.read_csv('../data/processed/fraud_processed.csv')

# Pull out the label vector first, then keep every remaining column as a feature.
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

In [None]:
def _report_class_balance(banner, labels, show_counts=True):
    """Print `banner`, then the class proportions of `labels`, then (optionally) the raw counts."""
    print(banner)
    print(labels.value_counts(normalize=True))
    if show_counts:
        print(labels.value_counts())


# Step 1 -- hold out 20% of the rows as a test set, stratifying on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Step 2 -- record the training-set class balance before any resampling.
_report_class_balance(
    "--- [LOG] Class Distribution BEFORE Resampling (Training Set) ---", y_train
)

# Step 3 -- oversample with SMOTE, fitted on the training split only.
# SMOTE synthesises additional minority-class rows, so no majority-class data
# has to be discarded to balance the classes.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Step 4 -- record the training-set class balance after SMOTE.
_report_class_balance(
    "\n--- [LOG] Class Distribution AFTER SMOTE (Training Set Only) ---", y_train_res
)

# Step 5 -- the test split was never passed to SMOTE; print its proportions to
# show it still has the original, imbalanced distribution.
_report_class_balance(
    "\n--- [LOG] Test Set Distribution (Untouched) ---", y_test, show_counts=False
)

In [None]:
# Initialize XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Wrap SMOTE + XGBoost in an imbalanced-learn pipeline.
# Why: running cross-validation on data that was oversampled BEFORE the folds
# were cut leaks information -- synthetic minority rows interpolated from
# validation-fold samples end up in the training folds, and every validation
# fold is artificially balanced. That inflates the CV F1 and biases the
# hyperparameter choice. Inside a pipeline, SMOTE is re-fit on the training
# portion of each fold only, and each validation fold keeps the real,
# imbalanced class distribution.
search_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('clf', xgb),
])

# Define Hyperparameter Space
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9]
}

# Pipeline parameters are addressed as "<step name>__<parameter>".
pipeline_param_dist = {f'clf__{name}': values for name, values in param_dist.items()}

# Randomized Search with 5-Fold Stratified CV
cv = StratifiedKFold(n_splits=5)
random_search = RandomizedSearchCV(
    search_pipeline, param_distributions=pipeline_param_dist, n_iter=10,
    scoring='f1', cv=cv, verbose=1, random_state=42
)

# Fit on the ORIGINAL (un-resampled) training split; oversampling now happens
# inside each fold via the pipeline.
random_search.fit(X_train, y_train)

# With the default refit=True, the best pipeline is retrained on all of X_train
# (SMOTE followed by XGBoost). Samplers only act during fit, so the final
# classifier step alone is what predicts; exposing it keeps `best_model` a
# fitted XGBClassifier for the evaluation and SHAP cells.
best_model = random_search.best_estimator_.named_steps['clf']

# Strip the "clf__" prefix so the report shows plain XGBoost parameter names.
best_params = {
    name.split('__', 1)[1]: value
    for name, value in random_search.best_params_.items()
}
print(f"\nBest Parameters: {best_params}")

In [None]:
# Score the held-out test set: positive-class scores (column 1 of
# predict_proba) and hard class predictions.
y_probs = best_model.predict_proba(X_test)[:, 1]
y_pred = best_model.predict(X_test)

# Each entry: (label to print, metric function, model output it is scored against).
# The threshold-based metrics use the hard predictions; AUC-ROC uses the scores.
metric_specs = [
    ("Precision", precision_score, y_pred),
    ("Recall", recall_score, y_pred),
    ("F1-Score", f1_score, y_pred),
    ("AUC-ROC", roc_auc_score, y_probs),
]
for label, metric_fn, model_output in metric_specs:
    print(f"{label}: {metric_fn(y_test, model_output):.4f}")

# Confusion matrix drawn as an annotated heatmap (integer cell counts).
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Calculate SHAP values for every row of the test set using the tree explainer.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# Global Summary Plot
plt.title("SHAP Feature Importance")
shap.summary_plot(shap_values, X_test)

# Local Explanation (Force Plot for a True Positive)
# Row 0 of an imbalanced test set is almost certainly a legitimate transaction,
# so look up a genuine true positive instead: a row whose label is 1 AND whose
# prediction is 1. Predictions are recomputed here so this cell depends only on
# `best_model`, `X_test` and `y_test`.
test_predictions = np.asarray(best_model.predict(X_test))
true_positive_rows = np.flatnonzero((np.asarray(y_test) == 1) & (test_predictions == 1))

if true_positive_rows.size:
    explain_row = int(true_positive_rows[0])
else:
    # Fall back to the first row so the cell still renders a plot.
    explain_row = 0
    print("[WARN] No true positives found in the test set; explaining row 0 instead.")

# Positional indexing (iloc / array row) keeps the SHAP row and the feature row
# aligned regardless of the DataFrame's index labels.
shap.initjs()
shap.force_plot(
    explainer.expected_value,
    shap_values[explain_row, :],
    X_test.iloc[explain_row, :],
)