In [1]:
# --- ONE-CELL COMPLETE PROJECT SCRIPT (NO ESCAPED QUOTES, NO ERRORS) ---
# Works for realistic_customer_risk_dataset.csv or any similar dataset.

import os
import pandas as pd
import numpy as np
import zipfile
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
import shap
import joblib

# =====================================================================
# STEP 1 — LOAD DATA (UPLOAD IF NEEDED)
# =====================================================================
DATAFILE = "realistic_customer_risk_dataset.csv"

if not Path(DATAFILE).exists():
    from google.colab import files
    print("Upload realistic_customer_risk_dataset.csv")
    uploaded = files.upload()
    DATAFILE = list(uploaded.keys())[0]

print("Using:", DATAFILE)

df = pd.read_csv(DATAFILE)
df.columns = df.columns.str.lower().str.strip()

# Auto-detect target
possible_targets = ["target", "loan_status", "default"]
target = None
for t in possible_targets:
    if t in df.columns:
        target = t

if target is None:
    # fallback: find binary column
    for c in df.columns:
        if set(df[c].dropna().unique()).issubset({0,1}):
            target = c
            break

if target is None:
    raise ValueError("Cannot detect binary target column. Manually set 'target'.")

print("Target detected:", target)

# Convert object -> category
for c in df.select_dtypes(include="object").columns:
    df[c] = df[c].astype("category")

X = df.drop(columns=[target])
y = df[target].astype(int)

numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["category"]).columns.tolist()

# =====================================================================
# STEP 2 — BUILD MODEL PIPELINE
# =====================================================================
num_t = SimpleImputer(strategy="median")
cat_t = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

pre = ColumnTransformer([
    ("num", num_t, numeric_cols),
    ("cat", cat_t, categorical_cols)
])

model = Pipeline([
    ("prep", pre),
    ("clf", XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        eval_metric="logloss",
        use_label_encoder=False
    ))
])

# =====================================================================
# STEP 3 — TRAIN
# =====================================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

model.fit(X_train, y_train)

# =====================================================================
# STEP 4 — EVALUATE
# =====================================================================
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:,1]

print("\n=== MODEL PERFORMANCE ===")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# =====================================================================
# STEP 5 — SHAP EXPLAINABILITY
# =====================================================================
trained_xgb = model.named_steps["clf"]
X_test_trans = model.named_steps["prep"].transform(X_test)

explainer = shap.TreeExplainer(trained_xgb)
shap_values = explainer.shap_values(X_test_trans)

# Get feature names
feature_names = []
feature_names += numeric_cols

# One-hot names
ohe = model.named_steps["prep"].named_transformers_["cat"].named_steps["ohe"]
ohe_names = ohe.get_feature_names_out(categorical_cols)
feature_names += list(ohe_names)

# =====================================================================
# STEP 6 — SAVE OUTPUTS
# =====================================================================
OUT = Path("outputs")
EDGE = OUT / "edge_cases"
OUT.mkdir(exist_ok=True)
EDGE.mkdir(exist_ok=True)

# Global SHAP bar plot
plt.figure(figsize=(8,6))
shap.summary_plot(shap_values, X_test_trans, feature_names=feature_names, plot_type="bar", show=False)
plt.savefig(OUT/"shap_bar.png", dpi=200, bbox_inches="tight")
plt.clf()

# Beeswarm
plt.figure(figsize=(10,7))
shap.summary_plot(shap_values, X_test_trans, feature_names=feature_names, show=False)
plt.savefig(OUT/"shap_beeswarm.png", dpi=200, bbox_inches="tight")
plt.clf()

# =====================================================================
# STEP 7 — LOCAL SHAP (5 cases)
# =====================================================================
fn = np.where((y_test==1) & (y_pred==0))[0][:3]
fp = np.where((y_test==0) & (y_pred==1))[0][:2]
selected = list(fn) + list(fp)
selected = selected[:5]

for i, idx in enumerate(selected):
    row = X_test.iloc[[idx]]
    row_t = model.named_steps["prep"].transform(row)

    # PNG force plot
    shap.force_plot(
        explainer.expected_value,
        explainer.shap_values(row_t),
        matplotlib=True,
        feature_names=feature_names,
        show=False
    )
    plt.savefig(EDGE/f"force_case_{i+1}.png", dpi=200, bbox_inches="tight")
    plt.clf()

    # HTML force plot
    try:
        fobj = shap.force_plot(
            explainer.expected_value,
            explainer.shap_values(row_t),
            feature_names=feature_names
        )
        shap.save_html(str(EDGE/f"force_case_{i+1}.html"), fobj)
    except:
        with open(EDGE/f"force_case_{i+1}.html","w") as f:
            f.write("Force plot unavailable.")

# =====================================================================
# STEP 8 — FINAL ANALYSIS FILE
# =====================================================================
analysis_text = f"""
# FINAL ANALYSIS

Accuracy: {(y_pred==y_test).mean():.4f}
ROC AUC: {roc_auc_score(y_test, y_proba):.4f}

Confusion Matrix:
{confusion_matrix(y_test, y_pred)}

Top SHAP global plots saved in outputs/.
"""

with open("final_analysis.md", "w") as f:
    f.write(analysis_text)

# =====================================================================
# STEP 9 — ZIP EVERYTHING
# =====================================================================
zip_path = "project_bundle.zip"
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk("outputs"):
        for file in files:
            path = os.path.join(root, file)
            zipf.write(path, path)

print("\nZipped project saved as:", zip_path)

Upload realistic_customer_risk_dataset.csv


Saving realistic_customer_risk_dataset.csv to realistic_customer_risk_dataset.csv
Using: realistic_customer_risk_dataset.csv
Target detected: target


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== MODEL PERFORMANCE ===
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       212
           1       0.69      0.65      0.67        88

    accuracy                           0.81       300
   macro avg       0.77      0.76      0.77       300
weighted avg       0.81      0.81      0.81       300

Confusion Matrix:
 [[186  26]
 [ 31  57]]
ROC AUC: 0.8748927958833619

Zipped project saved as: project_bundle.zip


<Figure size 800x950 with 0 Axes>

<Figure size 800x950 with 0 Axes>

<Figure size 2000x300 with 0 Axes>

<Figure size 2000x300 with 0 Axes>

<Figure size 2000x300 with 0 Axes>

<Figure size 2000x300 with 0 Axes>

<Figure size 2000x300 with 0 Axes>