In [1]:
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import shap

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Import custom modules
from model_explainability import FraudModelExplainer

print("All libraries imported successfully!")

All libraries imported successfully!


In [2]:
# Step 1: Load Model and Data
print("="*80)
print("STEP 1: LOAD TRAINED MODEL AND DATA")
print("="*80)

import pandas as pd
import joblib

# Load best model from Task 2
model_path = r'C:\Users\admin\fraud-detection-week5\models\best_model_Random Forest.pkl'
model = joblib.load(model_path)
print(f"âœ“ Model loaded: {type(model).__name__}")

# Load transformed data
X_train = pd.read_csv('../data/processed/X_train.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')
y_train = pd.read_csv('../data/processed/y_train.csv').squeeze("columns")
y_test = pd.read_csv('../data/processed/y_test.csv').squeeze("columns")

print(f"\nðŸ“Š Data shapes:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

# Get feature names
feature_names = X_train.columns.tolist()
print(f"\nðŸ”¤ Features: {len(feature_names)} total")
print(f"First 10 features: {feature_names[:10]}")

STEP 1: LOAD TRAINED MODEL AND DATA
âœ“ Model loaded: RandomForestClassifier

ðŸ“Š Data shapes:
X_train: (8000, 88)
X_test: (2000, 88)
y_train: (8000,)
y_test: (2000,)

ðŸ”¤ Features: 88 total
First 10 features: ['v0', 'v1', 'v2', 'v3', 'v4', 'v5', 'v6', 'v7', 'v8', 'v9']


In [3]:
# Step 2: Initialize Model Explainer
print("\n" + "="*80)
print("STEP 2: INITIALIZE MODEL EXPLAINER")
print("="*80)

# Initialize explainer
explainer = FraudModelExplainer(
    model=model,
    feature_names=feature_names,
    random_state=42
)

print("âœ“ Model explainer initialized")


STEP 2: INITIALIZE MODEL EXPLAINER
âœ“ Model explainer initialized


In [4]:
# Step 3: Extract Built-in Feature Importance
print("\n" + "="*80)
print("STEP 3: BUILT-IN FEATURE IMPORTANCE")
print("="*80)

# Extract built-in importance
builtin_importance = explainer.extract_builtin_feature_importance(top_n=15)

if builtin_importance is not None:
    print(f"\nâœ… Extracted built-in importance for {len(builtin_importance)} features")


STEP 3: BUILT-IN FEATURE IMPORTANCE

BUILT-IN FEATURE IMPORTANCE
âœ— Error extracting built-in importance: All arrays must be of the same length


In [5]:
# Step 4: Compute SHAP Values
print("\n" + "="*80)
print("STEP 4: COMPUTE SHAP VALUES")
print("="*80)

explainer = FraudModelExplainer(model=model, feature_names=feature_names)
explainer.compute_shap_values(X_test, sample_size=1000)


STEP 4: COMPUTE SHAP VALUES

COMPUTING SHAP VALUES
SHAP (SHapley Additive exPlanations) values explain individual predictions
ðŸ“Š Using 1,000 samples for SHAP computation (50.0% of data)
ðŸŒ³ Using TreeExplainer for tree-based model
âœ… SHAP values computed successfully
   Shape: (1000, 88)


In [6]:
# Step 4: Compute SHAP Values
print("\n" + "="*80)
print("STEP 4: COMPUTE SHAP VALUES")
print("="*80)

explainer = FraudModelExplainer(model=model, feature_names=feature_names)
explainer.compute_shap_values(X_test, sample_size=1000)


STEP 4: COMPUTE SHAP VALUES

COMPUTING SHAP VALUES
SHAP (SHapley Additive exPlanations) values explain individual predictions
ðŸ“Š Using 1,000 samples for SHAP computation (50.0% of data)
ðŸŒ³ Using TreeExplainer for tree-based model
âœ… SHAP values computed successfully
   Shape: (1000, 88)


In [7]:
# Step 5: SHAP Summary Plot
print("\n" + "="*80)
print("STEP 5: SHAP SUMMARY PLOT (GLOBAL IMPORTANCE)")
print("="*80)

# Create SHAP summary plot
explainer.plot_shap_summary(X_test, max_display=20)


STEP 5: SHAP SUMMARY PLOT (GLOBAL IMPORTANCE)

SHAP SUMMARY PLOT (Global Feature Importance)
Shows feature importance and impact direction across all predictions
âœ— Error creating SHAP summary plot: index 1860 is out of bounds for axis 0 with size 1000


In [8]:
# Step 6: SHAP Force Plots
print("\n" + "="*80)
print("STEP 6: SHAP FORCE PLOTS (INDIVIDUAL PREDICTIONS)")
print("="*80)

# Create SHAP force plots for interesting cases
explainer.plot_shap_force_plots(X_test, y_test, n_cases=4)


STEP 6: SHAP FORCE PLOTS (INDIVIDUAL PREDICTIONS)

SHAP FORCE PLOTS (Individual Predictions)
Shows how each feature contributes to individual predictions
âœ— Error creating force plots: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


In [9]:
# Step 7: Feature Importance Comparison
print("\n" + "="*80)
print("STEP 7: FEATURE IMPORTANCE COMPARISON")
print("="*80)

# Compare built-in vs SHAP importance
importance_comparison = explainer.compare_feature_importance(
    builtin_importance=builtin_importance,
    top_n=15
)


STEP 7: FEATURE IMPORTANCE COMPARISON

FEATURE IMPORTANCE COMPARISON
Comparing built-in importance with SHAP importance


In [10]:
# Step 8: Generate Business Recommendations
print("\n" + "="*80)
print("STEP 8: BUSINESS RECOMMENDATIONS")
print("="*80)

# Generate actionable business recommendations
recommendations = explainer.generate_business_recommendations(
    X=X_test,
    y=y_test,
    threshold=0.5
)

print("\nâœ… Business recommendations generated")


STEP 8: BUSINESS RECOMMENDATIONS

BUSINESS RECOMMENDATIONS
Actionable insights derived from model explainability
âœ— Error generating business recommendations: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

âœ… Business recommendations generated


In [11]:
# Step 9: Generate Complete Explainability Report
print("\n" + "="*80)
print("STEP 9: COMPLETE EXPLAINABILITY REPORT")
print("="*80)

# Generate comprehensive report
explainability_report = explainer.generate_explainability_report()

# Save report
import json
report_path = '../reports/explainability_report.json'
with open(report_path, 'w') as f:
    json.dump(explainability_report, f, indent=2)

print(f"\nðŸ’¾ Explainability report saved to: {report_path}")


STEP 9: COMPLETE EXPLAINABILITY REPORT

EXPLAINABILITY REPORT SUMMARY
Shap Values Computed: True
Explainer Available: True
Total Features: 88
Analysis Completed: False

ðŸ’¾ Explainability report saved to: ../reports/explainability_report.json


In [12]:
# Summary of Key Findings
print("\n" + "="*80)
print("TASK 3 SUMMARY - KEY FINDINGS")
print("="*80)

print("\nðŸŽ¯ TOP 5 FRAUD INDICATORS (from SHAP analysis):")

# Extract top features from SHAP
shap_values = explainer.shap_values
if shap_values is not None:
    shap_importance = np.abs(shap_values).mean(axis=0)
    top_indices = np.argsort(shap_importance)[-5:][::-1]
    
    for i, idx in enumerate(top_indices, 1):
        feature = feature_names[idx]
        importance = shap_importance[idx]
        
        # Get statistics
        fraud_mean = X_test[y_test == 1][feature].mean()
        nonfraud_mean = X_test[y_test == 0][feature].mean()
        
        print(f"\n{i}. {feature}:")
        print(f"   SHAP Importance: {importance:.4f}")
        print(f"   Avg for fraud: {fraud_mean:.2f}")
        print(f"   Avg for non-fraud: {nonfraud_mean:.2f}")

print("\n" + "="*80)
print("TASK 3 COMPLETED SUCCESSFULLY!")
print("="*80)


TASK 3 SUMMARY - KEY FINDINGS

ðŸŽ¯ TOP 5 FRAUD INDICATORS (from SHAP analysis):

1. v3:
   SHAP Importance: 0.1194
   Avg for fraud: 1.55
   Avg for non-fraud: -0.03

2. v4:
   SHAP Importance: 0.0951
   Avg for fraud: 1.37
   Avg for non-fraud: -0.04

3. v5:
   SHAP Importance: 0.0364
   Avg for fraud: 0.03
   Avg for non-fraud: -0.01

4. v2:
   SHAP Importance: 0.0342
   Avg for fraud: -0.13
   Avg for non-fraud: -0.01

5. v8:
   SHAP Importance: 0.0342
   Avg for fraud: -0.16
   Avg for non-fraud: -0.05

TASK 3 COMPLETED SUCCESSFULLY!
