# SHAP Explainability Notebook — Loan Risk Assessment

This notebook trains a RandomForest model and uses **SHAP** to explain model predictions. It is intended for inclusion in your GitHub repository as an explainability/demo notebook.

**Dataset path:** `/mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv`

Run the cells sequentially, from top to bottom. If SHAP is not installed, uncomment the `pip install` line in Cell 1 and run that cell first.

In [None]:
# Cell 1: (Optional) Install SHAP if not present
# The line below is left commented out on purpose. If importing shap fails in
# your environment, remove the leading "# " from it and run this cell.
# !pip install shap
install_hint = 'If SHAP is not installed, uncomment the pip install line above and run this cell.'
print(install_hint)

In [None]:
# Cell 2: Imports and config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
try:
    import shap
except Exception as e:
    print('shap import error (if missing, install shap):', e)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Plot defaults shared by every figure in the notebook: a 10x5 inch canvas and
# seaborn's white-grid theme.
plt.rcParams.update({'figure.figsize': (10, 5)})
sns.set(style='whitegrid')

# Location of the input CSV; echoed so the reader can confirm which file is used.
DATA_PATH = r"/mnt/data/Data_08_Simulated Loan Risk Assessment Data.csv"
print('Data path:', DATA_PATH)

In [None]:
# Cell 3: Load dataset and basic checks
# Read the CSV into a DataFrame, then show its dimensions and first rows.
df = pd.read_csv(DATA_PATH)
print('Shape:', df.shape)
display(df.head())

# Split the columns by dtype. Numeric columns will be imputed and scaled later;
# object / category / bool columns will be one-hot encoded.
numeric_frame = df.select_dtypes(include=np.number)
categorical_frame = df.select_dtypes(include=['object', 'category', 'bool'])
num_cols = list(numeric_frame.columns)
cat_cols = list(categorical_frame.columns)
print('Numeric cols:', num_cols)
print('Categorical cols:', cat_cols)

In [None]:
# Cell 4: Choose target and prepare y
# Prefer the 'loan_status' column. Otherwise fall back to the first column with
# at most 10 distinct values, on the assumption that it holds a class label.
if 'loan_status' in df.columns:
    target = 'loan_status'
else:
    target = next((c for c in df.columns if df[c].nunique() <= 10), None)
print('Using target:', target)
if target is None:
    raise ValueError('No suitable target found in dataset. Please set target manually.')

# Fill missing labels with the most frequent label, then encode as integer codes.
# sort=True ties each code to the sorted label values rather than to the order in
# which labels happen to appear in the file. A 0/1 target therefore keeps 0 -> 0
# and 1 -> 1, and the class treated as "positive" (code 1) by the binary metrics
# and SHAP plots further down no longer depends on row order.
target_filled = df[target].fillna(df[target].mode()[0])
y, target_classes = pd.factorize(target_filled, sort=True)
print('Target classes (position = encoded value):', list(target_classes))
print('Target value counts:', pd.Series(y).value_counts())

In [None]:
# Cell 5: Basic preprocessing pipeline
# The dtype-based column lists were built before the target was chosen, so one of
# them still contains the target column. X (below) no longer has that column, and
# ColumnTransformer raises when asked to select a column that is missing, so the
# target is removed from both lists here. Re-running this cell is harmless.
num_cols = [c for c in num_cols if c != target]
cat_cols = [c for c in cat_cols if c != target]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`,
# and 1.4 removed the old name. Try the new name first and fall back to the old
# one so the notebook runs on both older and current releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric features: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical features: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder),
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

# Full model: preprocessing followed by a random forest with a fixed seed.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))])

X = df.drop(columns=[target]).copy()
print('Feature matrix shape before transform:', X.shape)

In [None]:
# Cell 6: Train-test split and fit model
# Hold out 20% of the rows, stratified on the label, and fit the full pipeline
# on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
clf.fit(X_train, y_train)

# Binary problems get positive-class probabilities and ROC AUC; anything else is
# scored with macro-averaged F1 and no ROC AUC.
is_binary = len(np.unique(y)) == 2

y_pred = clf.predict(X_test)
if hasattr(clf.named_steps['classifier'], 'predict_proba') and is_binary:
    y_proba = clf.predict_proba(X_test)[:, 1]
else:
    y_proba = None

f1_average = 'binary' if is_binary else 'macro'
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average=f1_average))
if y_proba is not None:
    print('ROC AUC:', roc_auc_score(y_test, y_proba))

In [None]:
# Cell 7: Prepare data for SHAP (get feature names after preprocessing)
# TreeExplainer is applied to the bare random forest, so it needs the matrices
# the forest actually sees: the output of the fitted preprocessor.
pre = clf.named_steps['preprocessor']
X_train_mat = pre.transform(X_train)
X_test_mat = pre.transform(X_test)

# Rebuild column names in the order the ColumnTransformer emits them: numeric
# columns first, then the one-hot expanded categorical columns.
num_features = list(num_cols)
cat_feature_names = []
if cat_cols:
    ohe = pre.named_transformers_['cat'].named_steps['onehot']
    try:
        cat_feature_names = list(ohe.get_feature_names_out(cat_cols))
    except AttributeError:
        # Older sklearn versions only provide get_feature_names. Only the
        # missing-method case is caught so that genuine errors still surface.
        cat_feature_names = list(ohe.get_feature_names(cat_cols))
feature_names = num_features + cat_feature_names

# Fail here with a clear message rather than later inside a SHAP plot or the
# DataFrame export, both of which need exactly one name per matrix column.
if len(feature_names) != X_test_mat.shape[1]:
    raise ValueError(
        'Feature name count (%d) does not match the number of transformed columns (%d).'
        % (len(feature_names), X_test_mat.shape[1])
    )
print('Total features after preprocessing:', len(feature_names))

In [None]:
# Cell 8: SHAP explainer (TreeExplainer for RF) and summary plot
# shap is imported again here because the earlier import only reports a failure
# and carries on; importing here stops the notebook if shap is missing.
import shap
rf_model = clf.named_steps['classifier']

explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_mat)
print('Computed SHAP values.')

# Older SHAP releases return a list with one (n_samples, n_features) array per
# class; newer releases (0.45+) return a single array of shape
# (n_samples, n_features, n_classes). Convert the new layout back to the
# per-class list so the class selection below, and the later cells that test
# `isinstance(shap_values, list)`, behave the same on either version.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

# For a binary classifier, explain the class encoded as 1; otherwise keep
# whatever the explainer returned.
if isinstance(shap_values, list) and len(shap_values) == 2:
    shap_vals_use = shap_values[1]
else:
    shap_vals_use = shap_values

# Summary bar plot
shap.summary_plot(shap_vals_use, X_test_mat, feature_names=feature_names, plot_type='bar', show=True)

In [None]:
# Cell 9: SHAP dot summary and dependence plots for top features
import numpy as np

# Dot summary: one point per sample and feature, showing the SHAP value.
shap.summary_plot(shap_vals_use, X_test_mat, feature_names=feature_names, plot_type='dot', show=True)

# Rank features by mean absolute SHAP value over the test rows and keep the
# five largest, strongest first.
mean_abs_shap = np.abs(shap_vals_use).mean(axis=0)
descending_order = np.argsort(mean_abs_shap)[::-1]
top_idx = descending_order[:5]
top_features = [feature_names[idx] for idx in top_idx]
print('Top features by mean(|SHAP|):', top_features)

# Dependence plots for the three strongest features only.
for feature_name in top_features[:3]:
    shap.dependence_plot(feature_name, shap_vals_use, X_test_mat, feature_names=feature_names, show=True)

In [None]:
# Cell 10: SHAP force plot for a single sample and saving SHAP values
i = 0  # row of the test matrix to explain
try:
    # Use the class-1 base value when the explainer produced a two-class list;
    # otherwise pass the explainer's expected value through unchanged.
    if isinstance(shap_values, list) and len(shap_values) == 2:
        base_value = explainer.expected_value[1]
    else:
        base_value = explainer.expected_value
    force_plot = shap.force_plot(base_value, shap_vals_use[i], X_test_mat[i], feature_names=feature_names)
    display(force_plot)
except Exception as e:
    # Best effort: report the failure and still export the SHAP values below.
    print('Force plot may require the JS visualization to be enabled in the notebook. Error:', e)

# Write the SHAP values to CSV, one row per test sample and one column per feature.
shap_df = pd.DataFrame(shap_vals_use, columns=feature_names)
shap_df.to_csv('/mnt/data/shap_values_test.csv', index=False)
print('Saved SHAP values to /mnt/data/shap_values_test.csv')

## Notes

- If `shap` is missing, uncomment the pip install line in Cell 1. For GitHub, consider adding `requirements.txt` listing `shap`, `scikit-learn`, `pandas`, `numpy`, `matplotlib`, `seaborn`.
- This notebook uses a RandomForest model; you can swap in LightGBM or XGBoost for potentially better performance, and the SHAP workflow stays essentially the same.

---

You can download this notebook and push it to your GitHub repository.