# Model Explainability with SHAP

This notebook interprets the best fraud detection models using SHAP, compares SHAP importance with the models' built-in feature importances, and provides actionable business recommendations.

## 1. Load Data and Models

Load processed data and trained models from previous steps.

In [None]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import joblib

# Load the processed datasets written by the previous notebook steps.
cc_data = pd.read_csv('../data/processed/creditcard_processed.csv')
fraud_data = pd.read_csv('../data/processed/fraud_data_processed.csv')

# Load trained models (update path if needed).
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# model files that come from a trusted source.
rf_cc = joblib.load('../models/rf_cc.joblib')
rf_fraud = joblib.load('../models/rf_fraud.joblib')

# Load test feature matrices (update path if needed).
X_test_cc = pd.read_csv('../data/processed/X_test_cc.csv')
X_test_fraud = pd.read_csv('../data/processed/X_test_fraud.csv')

# Load test labels. read_csv always returns a DataFrame, so a single-column
# label file would come back 2-D with shape (n, 1). Squeezing the column axis
# yields a 1-D Series, which keeps element-wise comparisons against model
# predictions (shape (n,)) from broadcasting into an (n, n) matrix.
# Frames with more than one column are returned unchanged by squeeze.
y_test_cc = pd.read_csv('../data/processed/y_test_cc.csv').squeeze(axis='columns')
y_test_fraud = pd.read_csv('../data/processed/y_test_fraud.csv').squeeze(axis='columns')

## 2. Feature Importance Baseline

Extract and visualize the 10 most important features of each ensemble model, ranked by the model's built-in `feature_importances_`.

In [None]:
TOP_N_FEATURES = 10  # maximum number of features shown per chart


def plot_top_importances(importances, feature_names, title, top_n=TOP_N_FEATURES):
    """Draw a horizontal bar chart of the largest built-in feature importances.

    Parameters
    ----------
    importances : array-like of float
        One importance score per feature.
    feature_names : sequence of str
        Feature names aligned position-by-position with ``importances``.
    title : str
        Title of the figure.
    top_n : int, default TOP_N_FEATURES
        Maximum number of bars; fewer are drawn when fewer features exist.

    Returns
    -------
    numpy.ndarray
        Positions of the charted features, most important first.
    """
    importances = np.asarray(importances)
    # Sort descending and keep at most `top_n` positions.
    top_idx = np.argsort(importances)[::-1][:top_n]
    # Bar positions follow the number of features actually selected, so the
    # chart still renders when the model has fewer than `top_n` features.
    positions = range(len(top_idx))

    plt.figure(figsize=(8,6))
    plt.title(title)
    # Reverse the order so the most important feature ends up at the top.
    plt.barh(positions, importances[top_idx][::-1], align='center')
    plt.yticks(positions, [feature_names[i] for i in top_idx][::-1])
    plt.xlabel('Importance')
    plt.show()
    return top_idx


# Feature importance for credit card model
feature_importances_cc = rf_cc.feature_importances_
features_cc = X_test_cc.columns
indices_cc = plot_top_importances(
    feature_importances_cc,
    features_cc,
    'Top 10 Feature Importances (Credit Card Model)',
)

# Feature importance for fraud model
feature_importances_fraud = rf_fraud.feature_importances_
features_fraud = X_test_fraud.columns
indices_fraud = plot_top_importances(
    feature_importances_fraud,
    features_fraud,
    'Top 10 Feature Importances (Fraud Model)',
)

## 3. SHAP Analysis

Generate SHAP summary plots (global feature importance) and force plots for individual predictions.

In [None]:
def _per_class_shap_values(values):
    """Return SHAP values in a form where ``values[k]`` is class ``k``'s matrix.

    Older SHAP releases return a list with one (n_samples, n_features) array
    per class for multi-output tree models. SHAP 0.45+ (per its release
    notes) returns a single (n_samples, n_features, n_outputs) array instead,
    where ``values[1]`` would select *sample* 1 rather than class 1.
    A 3-D array is therefore split into a per-class list; any other input
    (legacy list, 2-D array) is returned unchanged.
    """
    if isinstance(values, np.ndarray) and values.ndim == 3:
        return [values[:, :, k] for k in range(values.shape[2])]
    return values


# SHAP for credit card model
explainer_cc = shap.TreeExplainer(rf_cc)
shap_values_cc = _per_class_shap_values(explainer_cc.shap_values(X_test_cc))

# SHAP summary plots (global importance); index 1 selects the second class's values
shap.summary_plot(shap_values_cc[1], X_test_cc, plot_type='bar', show=True)
shap.summary_plot(shap_values_cc[1], X_test_cc, show=True)

# SHAP for fraud model
explainer_fraud = shap.TreeExplainer(rf_fraud)
shap_values_fraud = _per_class_shap_values(explainer_fraud.shap_values(X_test_fraud))

# SHAP summary plots (global importance); index 1 selects the second class's values
shap.summary_plot(shap_values_fraud[1], X_test_fraud, plot_type='bar', show=True)
shap.summary_plot(shap_values_fraud[1], X_test_fraud, show=True)

### SHAP Force Plots for Individual Predictions

Visualize force plots for one true positive, one false positive, and one false negative for each model.

In [None]:
# Helper to find indices for TP, FP, FN
from sklearn.metrics import confusion_matrix

def get_indices(y_true, y_pred):
    """Locate true-positive, false-positive and false-negative rows.

    Both inputs are flattened to 1-D before comparison. Without this, a
    column-shaped ``y_true`` of shape (n, 1) compared against predictions of
    shape (n,) broadcasts to an (n, n) mask and yields wrong, repeated indices.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (0 or 1); shape (n,) or (n, 1).
    y_pred : array-like
        Predicted binary labels (0 or 1); shape (n,) or (n, 1).

    Returns
    -------
    tuple of numpy.ndarray
        ``(tp, fp, fn)`` -- positional row indices, each in ascending order.

    Raises
    ------
    ValueError
        If the flattened inputs do not have the same length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of samples, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1
    tp = np.flatnonzero(actual_pos & predicted_pos)
    fp = np.flatnonzero((y_true == 0) & predicted_pos)
    fn = np.flatnonzero(actual_pos & (y_pred == 0))
    return tp, fp, fn

def _first_or_none(indices):
    """Return the first entry of ``indices``, or None when it is empty."""
    return indices[0] if len(indices) > 0 else None


def _plot_force_examples(explainer, shap_values, X, example_indices):
    """Draw one matplotlib force plot per available example row.

    Uses index 1 of ``explainer.expected_value`` and of ``shap_values``.
    Entries of ``example_indices`` that are None (no such example was found)
    are skipped.
    """
    for idx in example_indices:
        if idx is None:
            continue
        shap.force_plot(explainer.expected_value[1], shap_values[1][idx], X.iloc[idx], matplotlib=True)


# Labels are flattened to 1-D before comparison: a label frame loaded from CSV
# has shape (n, 1), which would otherwise broadcast against the (n,) prediction
# vector and produce incorrect row indices.

# Credit card model
y_pred_cc = rf_cc.predict(X_test_cc)
tp_cc, fp_cc, fn_cc = get_indices(np.ravel(y_test_cc.values), y_pred_cc)

# Fraud model
y_pred_fraud = rf_fraud.predict(X_test_fraud)
tp_fraud, fp_fraud, fn_fraud = get_indices(np.ravel(y_test_fraud.values), y_pred_fraud)

# Pick one example from each category (None when the category is empty)
idx_tp_cc = _first_or_none(tp_cc)
idx_fp_cc = _first_or_none(fp_cc)
idx_fn_cc = _first_or_none(fn_cc)

idx_tp_fraud = _first_or_none(tp_fraud)
idx_fp_fraud = _first_or_none(fp_fraud)
idx_fn_fraud = _first_or_none(fn_fraud)

# SHAP force plots for credit card model (order: TP, FP, FN)
_plot_force_examples(explainer_cc, shap_values_cc, X_test_cc, (idx_tp_cc, idx_fp_cc, idx_fn_cc))

# SHAP force plots for fraud model (order: TP, FP, FN)
_plot_force_examples(explainer_fraud, shap_values_fraud, X_test_fraud, (idx_tp_fraud, idx_fp_fraud, idx_fn_fraud))

## 4. Interpretation

Compare SHAP importance with built-in feature importance, identify top 5 drivers, and explain findings.

In [None]:
def _rank_by_mean_abs_shap(class_shap_values, count=5):
    """Return positions of the ``count`` features with the largest mean |SHAP|.

    The absolute SHAP values are averaged over axis 0 and the resulting
    per-feature scores are sorted in descending order.
    """
    mean_abs = np.abs(class_shap_values).mean(axis=0)
    descending = np.argsort(mean_abs)[::-1]
    return descending[:count]


def _print_top5(heading, feature_names, builtin_idx, shap_idx):
    """Print the top-5 built-in and top-5 SHAP feature names under ``heading``."""
    builtin_names = [feature_names[i] for i in builtin_idx[:5]]
    shap_names = [feature_names[i] for i in shap_idx]
    print(heading)
    print('Top 5 built-in:', builtin_names)
    print('Top 5 SHAP:', shap_names)


# Rank features by SHAP for each model, then show both rankings side by side
shap_top_cc = _rank_by_mean_abs_shap(shap_values_cc[1])
shap_top_fraud = _rank_by_mean_abs_shap(shap_values_fraud[1])

_print_top5('Credit Card Model:', features_cc, indices_cc, shap_top_cc)
_print_top5('\nFraud Model:', features_fraud, indices_fraud, shap_top_fraud)

# Discuss surprising findings
# Example: If a feature is highly ranked by SHAP but not by built-in, note it here

## 5. Business Recommendations

Based on SHAP insights, provide actionable recommendations for fraud detection.

**Example Recommendations:**

1. Transactions with high values in [Top SHAP Feature] should trigger additional verification steps.
2. Accounts with frequent transactions within short time intervals (see [Time-related Feature]) should be flagged for review.
3. Transactions from new devices or locations (see [Location/IP Feature]) should require multi-factor authentication.

_Connect each recommendation to the specific SHAP feature and explain the rationale._