Mount Drive and set file paths

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Adjust these paths if your CSV is in a different Drive folder

In [4]:
DRIVE_ROOT = "/content/drive/MyDrive"
DATA_PATH = f"{DRIVE_ROOT}/UCI_Credit_Card.csv"
OUTPUT_DIR = f"{DRIVE_ROOT}/credit_risk_outputs"
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("DATA_PATH:", DATA_PATH)
print("OUTPUT_DIR:", OUTPUT_DIR)

DATA_PATH: /content/drive/MyDrive/UCI_Credit_Card.csv
OUTPUT_DIR: /content/drive/MyDrive/credit_risk_outputs


Install necessary packages

In [5]:
!pip install -q
!pip install xgboost
!pip install shap
!pip install matplotlib
!pip install scikit-learn
!pip install pandas
!pip install joblib

[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m


Imports and helper functions

In [6]:
import pandas as pd
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
import shap

plt.rcParams["figure.figsize"] = (10,6)

Load CSV and a basic check

In [7]:
df = pd.read_csv(DATA_PATH)
print("Rows, cols:", df.shape)
print(df.columns.tolist()[:40])

Rows, cols: (30000, 25)
['ID', 'LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6', 'default.payment.next.month']


If the column name differs, update the target variable here

In [8]:
possible_targets = ["default.payment.next.month", "DEFAULT_PAYMENT_NEXT_MONTH", "default"]
for t in possible_targets:
    if t in df.columns:
        target = t
        break
else:
    # fallback: assume last column is target
    target = df.columns[-1]
    print("Target column not found in typical names. Using last column as target:", target)

print("Using target column:", target)

# Basic NA handling (if none, this is fast)
print("Missing values per column:", df.isna().sum().sum())
df = df.dropna()  # simple approach; for production do better

Using target column: default.payment.next.month
Missing values per column: 0


Prepare features and split

In [9]:
X = df.drop(columns=[target]).copy()
y = df[target].copy().astype(int)

If there are non-numeric columns, convert or drop them

In [10]:
non_numeric = X.select_dtypes(include=['object']).columns.tolist()
if non_numeric:
    print("Non-numeric columns found (they will be factorized):", non_numeric)
    for c in non_numeric:
        X[c] = pd.factorize(X[c])[0]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print("Train/test sizes:", X_train.shape, X_test.shape)

Train/test sizes: (24000, 24) (6000, 24)


Train XGBoost classifier

In [11]:
model = XGBClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)

model.fit(X_train, y_train)
joblib.dump(model, os.path.join(OUTPUT_DIR, "xgb_model.joblib"))
print("Model saved to:", os.path.join(OUTPUT_DIR, "xgb_model.joblib"))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model saved to: /content/drive/MyDrive/credit_risk_outputs/xgb_model.joblib


Evaluate model

In [12]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

auc = roc_auc_score(y_test, y_prob)
f1 = f1_score(y_test, y_pred)
print(f"AUC: {auc:.4f}  |  F1: {f1:.4f}")
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))

AUC: 0.7777  |  F1: 0.4730
Confusion matrix:
 [[4418  255]
 [ 837  490]]

Classification report:
               precision    recall  f1-score   support

           0       0.84      0.95      0.89      4673
           1       0.66      0.37      0.47      1327

    accuracy                           0.82      6000
   macro avg       0.75      0.66      0.68      6000
weighted avg       0.80      0.82      0.80      6000



SHAP â€” explain predictions (global + local). Save plots to Drive

In [13]:
# Use shap.Explainer for compatibility; it detects model type
explainer = shap.Explainer(model, X_train, feature_names=X.columns)

# Compute SHAP values for the test set (this may take time)
shap_values = explainer(X_test)   # shap_values is an Explanation object

# Save a global summary plot (bar + beeswarm)
# 1) bar (average absolute shap)
shap.plots.bar(shap_values, show=False)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "shap_bar_summary.png"), bbox_inches='tight')
plt.close()

# 2) beeswarm (summary)
# convert to matplotlib figure via summary_plot
# shap.summary_plot(shap_values.values, X_test, show=False)  # older API
shap.plots.beeswarm(shap_values, show=False)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "shap_beeswarm_summary.png"), bbox_inches='tight')
plt.close()

print("Saved SHAP global plots to:", OUTPUT_DIR)



Saved SHAP global plots to: /content/drive/MyDrive/credit_risk_outputs


Find misclassified samples and produce local SHAP force plots

In [14]:
# Colab cell 9: Misclassifications and local explanations
X_test_idx = X_test.index.to_list()
test_df = X_test.copy()
test_df["actual"] = y_test
test_df["pred"] = y_pred
test_df["prob"] = y_prob

# False positive: actual 0 but predicted 1
fp = test_df[(test_df["actual"] == 0) & (test_df["pred"] == 1)]
# False negative: actual 1 but predicted 0
fn = test_df[(test_df["actual"] == 1) & (test_df["pred"] == 0)]

print("FP count:", len(fp), "  FN count:", len(fn))

if len(fp) > 0:
    fp_idx = fp.index[0]
    fp_row = X_test.loc[fp_idx:fp_idx]  # keep as DataFrame slice
    fp_shap = explainer(fp_row)
    # force plot to HTML (recommended) & png fallback
    # save HTML
    html_fp = shap.plots._force_matplotlib.plot_html(fp_shap) if hasattr(shap.plots._force_matplotlib, 'plot_html') else None
    # matplotlib force (fallback)
    shap.plots.force(fp_shap, matplotlib=True, show=False)
    plt.savefig(os.path.join(OUTPUT_DIR, "false_positive_forceplot.png"), bbox_inches='tight')
    plt.close()
    print("Saved false_positive_forceplot.png")

if len(fn) > 0:
    fn_idx = fn.index[0]
    fn_row = X_test.loc[fn_idx:fn_idx]
    fn_shap = explainer(fn_row)
    shap.plots.force(fn_shap, matplotlib=True, show=False)
    plt.savefig(os.path.join(OUTPUT_DIR, "false_negative_forceplot.png"), bbox_inches='tight')
    plt.close()
    print("Saved false_negative_forceplot.png")


FP count: 255   FN count: 837
Saved false_positive_forceplot.png
Saved false_negative_forceplot.png


Print top SHAP contributors (feature by absolute magnitude) for selected misclassified rows

In [15]:
# Colab cell 10: Helper to get top contributors
def top_contributors(explanation_obj, X_row, k=8):
    # explanation_obj.values shape: (n_samples, n_features)
    vals = explanation_obj.values[0]  # single row
    abs_idx = np.argsort(np.abs(vals))[::-1][:k]
    return [(X_row.columns[i], vals[i], X_row.iloc[0, i]) for i in abs_idx]

if len(fp) > 0:
    print("\nTop contributors for FP (index {}):".format(fp_idx))
    for feat, shap_val, feat_val in top_contributors(fp_shap, fp_row, k=8):
        direction = "increases risk" if shap_val > 0 else "decreases risk"
        print(f"  {feat}: shap={shap_val:.4f} ({direction})  |  feature_value={feat_val}")

if len(fn) > 0:
    print("\nTop contributors for FN (index {}):".format(fn_idx))
    for feat, shap_val, feat_val in top_contributors(fn_shap, fn_row, k=8):
        direction = "increases risk" if shap_val > 0 else "decreases risk"
        print(f"  {feat}: shap={shap_val:.4f} ({direction})  |  feature_value={feat_val}")



Top contributors for FP (index 29382):
  PAY_2: shap=0.4627 (increases risk)  |  feature_value=2
  BILL_AMT2: shap=0.3108 (increases risk)  |  feature_value=188778.0
  BILL_AMT1: shap=0.1805 (increases risk)  |  feature_value=199417.0
  BILL_AMT3: shap=0.1625 (increases risk)  |  feature_value=195335.0
  PAY_AMT1: shap=0.1526 (increases risk)  |  feature_value=0.0
  PAY_AMT4: shap=0.1379 (increases risk)  |  feature_value=180000.0
  SEX: shap=0.1316 (increases risk)  |  feature_value=1
  PAY_0: shap=0.1281 (increases risk)  |  feature_value=1

Top contributors for FN (index 2156):
  BILL_AMT1: shap=0.4466 (increases risk)  |  feature_value=305823.0
  BILL_AMT2: shap=0.4043 (increases risk)  |  feature_value=303701.0
  PAY_0: shap=-0.3907 (decreases risk)  |  feature_value=0
  PAY_AMT6: shap=-0.3547 (decreases risk)  |  feature_value=23333.0
  PAY_AMT2: shap=-0.2293 (decreases risk)  |  feature_value=10500.0
  LIMIT_BAL: shap=-0.1983 (decreases risk)  |  feature_value=290000.0
  BILL_A

Basic subgroup (age) SHAP comparison & save plots

In [16]:
# Colab cell 11: Subgroup analysis by AGE (if AGE column exists)
if "AGE" in df.columns:
    # create age bins in the original df for reference
    age_bins = [20,30,40,50,60,100]
    df["AGE_BIN"] = pd.cut(df["AGE"], bins=age_bins, labels=['20-30','31-40','41-50','51-60','61+'])
    # map test indices to AGE_BIN
    test_age_bins = df.loc[X_test.index, "AGE_BIN"]
    unique_bins = test_age_bins.unique()
    for b in unique_bins:
        if pd.isna(b):
            continue
        subset_idx = test_age_bins[test_age_bins == b].index
        if len(subset_idx) < 10:
            print(f"Skipping age bin {b} (too few samples: {len(subset_idx)})")
            continue
        subset_X = X_test.loc[subset_idx]
        expl_sub = explainer(subset_X)
        shap.plots.beeswarm(expl_sub, show=False)
        plt.title(f"SHAP beeswarm for age bin {b}")
        plt.savefig(os.path.join(OUTPUT_DIR, f"shap_beeswarm_age_{str(b)}.png"), bbox_inches='tight')
        plt.close()
    print("Saved subgroup SHAP plots (if any).")
else:
    print("No AGE column found; skipping age subgroup analysis.")




Saved subgroup SHAP plots (if any).


Save results summary CSV & model

In [17]:
# Colab cell 12: Save test predictions + probabilities and misclassification log
results = X_test.copy()
results["actual"] = y_test
results["pred"] = y_pred
results["prob"] = y_prob
results.to_csv(os.path.join(OUTPUT_DIR, "test_set_predictions.csv"), index=True)
print("Saved predictions CSV to", os.path.join(OUTPUT_DIR, "test_set_predictions.csv"))

# Save the model (again) and the explainer object if you want
joblib.dump(model, os.path.join(OUTPUT_DIR, "xgb_model.joblib"))
print("Saved model to", os.path.join(OUTPUT_DIR, "xgb_model.joblib"))

# (Optional) save explainer - note shap.Explainer objects can be large
try:
    joblib.dump(explainer, os.path.join(OUTPUT_DIR, "shap_explainer.joblib"))
    print("Saved SHAP explainer to", os.path.join(OUTPUT_DIR, "shap_explainer.joblib"))
except Exception as e:
    print("Could not save explainer (ok):", e)


Saved predictions CSV to /content/drive/MyDrive/credit_risk_outputs/test_set_predictions.csv
Saved model to /content/drive/MyDrive/credit_risk_outputs/xgb_model.joblib
Saved SHAP explainer to /content/drive/MyDrive/credit_risk_outputs/shap_explainer.joblib
