# Credit Risk Prediction

LendingClub dataset — three approaches to binary default classification.

| Version | Models | Key Features |
|---------|--------|--------------|
| `'a'` | LR + XGBoost + ANN | Paper replication, MSE/ROC comparison |
| `'b'` | ANN + XGBoost + Random Forest | Full EDA, extensive feature engineering, outlier removal |
| `'c'` | ANN with StratifiedKFold CV | Refined features, cross-validation, threshold analysis |

**To switch versions:** change `VERSION` in the cell below and re-run all cells.


In [None]:
VERSION = 'c'  # Options: 'a', 'b', 'c'

# Fail fast on typos: the cells below handle 'c' in a bare `else` branch, so an
# unrecognised value (e.g. 'B') would otherwise silently run the 'c' pipeline.
if VERSION not in ("a", "b", "c"):
    raise ValueError(f"Unknown VERSION {VERSION!r}; expected 'a', 'b' or 'c'.")


In [None]:
import pandas as pd
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_squared_error, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc, precision_recall_curve,
    ConfusionMatrixDisplay, RocCurveDisplay
)
import xgboost as xgb
from xgboost import XGBClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


## Data Loading

In [None]:
# Download the LendingClub dataset through kagglehub and read the loan table
# from the CSV located under the returned path.
dataset_path = kagglehub.dataset_download("jeandedieunyandwi/lending-club-dataset")
loans_csv = f"{dataset_path}/lending_club_loan_two.csv"
df = pd.read_csv(loans_csv)

# Sanity check: table dimensions and the raw target distribution.
n_rows_cols = df.shape
print(f"Dataset shape: {n_rows_cols}")
print(df["loan_status"].value_counts())


## Preprocessing & Feature Engineering

In [None]:
# ── VERSION a ─────────────────────────────────────────────────────────────
# Binary target (1 = Charged Off), at most SAMPLE_SIZE rows drawn from the
# training split, a fixed feature list, one-hot encoding and MinMax scaling.
if VERSION == "a":
    # Keep the two resolved statuses and encode them: Fully Paid -> 0, Charged Off -> 1.
    resolved_mask = df["loan_status"].isin(["Fully Paid", "Charged Off"])
    df_clean = df[resolved_mask].copy()
    df_clean["loan_status_binary"] = df_clean["loan_status"].map({"Fully Paid": 0, "Charged Off": 1})

    X_clean = df_clean.drop(columns="loan_status_binary")
    y_clean = df_clean["loan_status_binary"]
    X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
        X_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean
    )

    # Work on a sample of the training rows when the split is larger than SAMPLE_SIZE.
    SAMPLE_SIZE = 60000
    train_indices = (
        X_train_full.sample(n=SAMPLE_SIZE, random_state=42).index
        if len(X_train_full) > SAMPLE_SIZE
        else X_train_full.index
    )
    df_sampled = df_clean.loc[train_indices].copy()

    X_sampled = df_sampled.drop(columns="loan_status_binary")
    y_sampled = df_sampled["loan_status_binary"]

    FEATURE_COLUMNS = ["loan_amnt", "funded_amnt", "funded_amnt_inv", "term", "int_rate",
                       "installment", "grade", "sub_grade", "emp_title", "emp_length",
                       "annual_inc", "application_type"]
    available_cols = [col for col in FEATURE_COLUMNS if col in X_sampled.columns]

    # Strip the textual unit from string-typed columns and cast them to float.
    for text_col, unit in (("term", " months"), ("int_rate", "%")):
        if text_col in X_sampled.columns and X_sampled[text_col].dtype == "object":
            X_sampled[text_col] = X_sampled[text_col].str.replace(unit, "", regex=False).astype(float)

    # Employment length labels -> number of years ("n/a" becomes NaN).
    emp_length_years = {
        "< 1 year": "0",
        "1 year": "1",
        **{f"{years} years": str(years) for years in range(2, 10)},
        "10+ years": "10",
        "n/a": np.nan,
    }
    if "emp_length" in X_sampled.columns and X_sampled["emp_length"].dtype == "object":
        X_sampled["emp_length"] = X_sampled["emp_length"].replace(emp_length_years).astype(float)

    # Final feature set: numeric columns from the feature list plus the two categoricals.
    categorical_features = ["grade", "application_type"]
    numeric_dtypes = ["float64", "int64"]
    final_features = [col for col in available_cols if X_sampled[col].dtype in numeric_dtypes]
    final_features += [c for c in categorical_features if c in X_sampled.columns]

    # Drop rows with any missing feature or target value, then separate X and y again.
    X_final = X_sampled[final_features].copy()
    combined_df = pd.concat([X_final, y_sampled], axis=1).dropna()
    X_final = combined_df.drop(columns="loan_status_binary")
    y_final = combined_df["loan_status_binary"]

    dummy_cols = [c for c in categorical_features if c in X_final.columns]
    X_encoded = pd.get_dummies(X_final, columns=dummy_cols, drop_first=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X_encoded, y_final, test_size=0.2, random_state=42, stratify=y_final
    )

    # The scaler is fitted on the training rows only and then applied to the test rows.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)
    print(f"Train/Test: {X_train_scaled.shape[0]} / {X_test_scaled.shape[0]} | Features: {X_train_scaled.shape[1]}")


# ── VERSION b ─────────────────────────────────────────────────────────────
elif VERSION == "b":
    data = df[df["loan_status"].isin(["Fully Paid", "Charged Off"])].copy()

    # EDA
    print(f"Shape: {data.shape}")
    print(data["loan_status"].value_counts())

    plt.figure(figsize=(12, 8))
    sns.heatmap(data.select_dtypes(include=np.number).corr(), annot=False, cmap="viridis")
    plt.title("Correlation Heatmap"); plt.tight_layout(); plt.show()

    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    grade_order = sorted(data.grade.unique().tolist())
    sns.countplot(x="grade", data=data, hue="loan_status", order=grade_order)
    plt.title("Loan Status by Grade")
    plt.subplot(1, 2, 2)
    sub_grade_order = sorted(data.sub_grade.unique().tolist())
    g = sns.countplot(x="sub_grade", data=data, hue="loan_status", order=sub_grade_order)
    g.set_xticklabels(g.get_xticklabels(), rotation=90)
    plt.title("Loan Status by Sub-Grade")
    plt.tight_layout(); plt.show()

    # Binary features
    data["pub_rec"] = data["pub_rec"].apply(lambda x: 0 if x == 0.0 else 1)
    data["mort_acc"] = data["mort_acc"].apply(lambda x: 0 if x == 0.0 else (1 if x >= 1.0 else x))
    data["pub_rec_bankruptcies"] = data["pub_rec_bankruptcies"].apply(lambda x: 0 if x == 0.0 else (1 if x >= 1.0 else x))

    # Target: 1=Fully Paid, 0=Charged Off
    data["loan_status"] = data["loan_status"].map({"Fully Paid": 1, "Charged Off": 0})

    # Handle missing values
    data.drop(["emp_title", "emp_length", "title"], axis=1, inplace=True, errors="ignore")
    data["mort_acc"] = pd.to_numeric(data["mort_acc"], errors="coerce")
    total_acc_avg = data.groupby("total_acc")["mort_acc"].mean()
    data["mort_acc"] = data.apply(
        lambda x: total_acc_avg[x["total_acc"]] if np.isnan(x["mort_acc"]) else x["mort_acc"], axis=1
    ).round()
    data.dropna(inplace=True)
    print(f"Shape after cleaning: {data.shape}")

    # Feature engineering
    data["term"] = data["term"].apply(lambda t: int(t.strip().replace(" months", "")))
    data.drop("grade", axis=1, inplace=True)
    dummies = ["sub_grade", "verification_status", "purpose", "initial_list_status", "application_type"]
    data = pd.get_dummies(data, columns=dummies + ["home_ownership"], drop_first=True)
    data["zip_code"] = data["address"].apply(lambda x: x[-5:])
    data = pd.get_dummies(data, columns=["zip_code"], drop_first=True)
    data.drop(["address", "issue_d"], axis=1, inplace=True, errors="ignore")
    data["earliest_cr_line"] = pd.to_datetime(data["earliest_cr_line"]).dt.year
    print(f"Shape after feature engineering: {data.shape}")

    # Split + outlier removal on train
    train_b, test_b = train_test_split(data, test_size=0.33, random_state=42)
    train_b = train_b[
        (train_b["annual_inc"] <= 250000) & (train_b["dti"] <= 50) &
        (train_b["open_acc"] <= 40)       & (train_b["total_acc"] <= 80) &
        (train_b["revol_util"] <= 120)    & (train_b["revol_bal"] <= 250000)
    ]
    scaler_b = MinMaxScaler()
    X_tr_b = np.array(scaler_b.fit_transform(train_b.drop("loan_status", axis=1))).astype(np.float32)
    y_tr_b = np.array(train_b["loan_status"]).astype(np.float32)
    X_te_b = np.array(scaler_b.transform(test_b.drop("loan_status", axis=1))).astype(np.float32)
    y_te_b = np.array(test_b["loan_status"]).astype(np.float32)

    # Balance: undersample majority, upsample minority
    y_series = pd.Series(y_tr_b)
    maj_cls = y_series.value_counts().idxmax()
    min_cls = y_series.value_counts().idxmin()
    rng = np.random.default_rng(42)
    maj_idx = y_series[y_series == maj_cls].index.to_numpy()
    min_idx = y_series[y_series == min_cls].index.to_numpy()
    maj_under = rng.choice(maj_idx, size=len(maj_idx) // 2, replace=False)
    min_over  = rng.choice(min_idx, size=len(maj_idx) // 2, replace=True)
    bal_idx   = rng.permutation(np.concatenate([maj_under, min_over]))
    X_train_scaled = X_tr_b[bal_idx]
    y_train        = y_tr_b[bal_idx]
    X_test_scaled  = X_te_b
    y_test         = y_te_b
    scores_dict    = {}
    print(f"Balanced train: {X_train_scaled.shape[0]} | Test: {X_test_scaled.shape[0]} | Features: {X_train_scaled.shape[1]}")


# ── VERSION c ─────────────────────────────────────────────────────────────
else:  # VERSION == "c"
    df_c = df.copy()
    df_c["loan_status_binary"] = (df_c["loan_status"] == "Charged Off").astype(int)
    X_all = df_c.drop(["loan_status", "loan_status_binary"], axis=1)
    y_all = df_c["loan_status_binary"]
    X_train_full, _, y_train_full, _ = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
    )

    SAMPLE_SIZE = 60000
    if len(X_train_full) > SAMPLE_SIZE:
        idx = X_train_full.sample(n=SAMPLE_SIZE, random_state=42).index
        X = df_c.loc[idx].drop(["loan_status", "loan_status_binary"], axis=1)
        y = df_c.loc[idx, "loan_status_binary"]
    else:
        X, y = X_train_full, y_train_full

    FEATURE_COLUMNS = ["loan_amnt", "funded_amnt", "funded_amnt_inv", "term", "int_rate",
                       "installment", "grade", "sub_grade", "emp_title", "emp_length",
                       "annual_inc", "application_type"]
    X = X[[col for col in FEATURE_COLUMNS if col in X.columns]]

    if "term" in X.columns and X["term"].dtype == "object":
        X["term"] = X["term"].str.replace(" months", "", regex=False).astype(float)
    if "int_rate" in X.columns and X["int_rate"].dtype == "object":
        X["int_rate"] = X["int_rate"].str.replace("%", "", regex=False).astype(float)
    if "emp_length" in X.columns and X["emp_length"].dtype == "object":
        emp_map = {"< 1 year": "0", "1 year": "1", "2 years": "2", "3 years": "3",
                   "4 years": "4", "5 years": "5", "6 years": "6", "7 years": "7",
                   "8 years": "8", "9 years": "9", "10+ years": "10", "n/a": np.nan}
        X["emp_length"] = X["emp_length"].replace(emp_map).astype(float)

    categorical_features = ["grade", "application_type"]
    numeric_features = [col for col in X.columns if X[col].dtype in ["float64", "int64"]]
    X = X[numeric_features + [c for c in categorical_features if c in X.columns]]
    combined = pd.concat([X, y], axis=1).dropna()
    X = combined.drop("loan_status_binary", axis=1)
    y = combined["loan_status_binary"]

    X = pd.get_dummies(X, columns=[c for c in categorical_features if c in X.columns], drop_first=True)
    X["annual_inc_to_loan_ratio"] = X["annual_inc"] / (X["loan_amnt"] + 1e-6)
    selected = ["loan_amnt", "term", "int_rate", "emp_length", "annual_inc", "annual_inc_to_loan_ratio"]
    ohe_cols = [col for col in X.columns if col.startswith("grade_") or col.startswith("application_type_")]
    selected += [f for f in ohe_cols if f in X.columns]
    X = X[[f for f in selected if f in X.columns]]

    maj = y.value_counts().idxmax()
    min_ = y.value_counts().idxmin()
    rus = RandomUnderSampler(sampling_strategy={maj: y.value_counts()[min_]}, random_state=42)
    X_down, y_down = rus.fit_resample(X, y)
    ros = RandomOverSampler(sampling_strategy={min_: len(y_down[y_down == maj])}, random_state=42)
    X_balanced, y_balanced = ros.fit_resample(X_down, y_down)
    combined = pd.concat([X_balanced, y_balanced], axis=1).sample(frac=1, random_state=42).reset_index(drop=True)
    X_balanced = combined.drop("loan_status_binary", axis=1)
    y_balanced = combined["loan_status_binary"]

    X_train, X_test, y_train, y_test = train_test_split(
        X_balanced, y_balanced, test_size=0.2, random_state=42, stratify=y_balanced
    )
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)
    print(f"Train/Test: {X_train_scaled.shape[0]} / {X_test_scaled.shape[0]} | Features: {X_train_scaled.shape[1]}")


## Model Building & Training

In [None]:
# ── VERSION a ─────────────────────────────────────────────────────────────
# Fit logistic regression, XGBoost and a one-hidden-layer ANN; for each model
# keep the test-set probabilities, 0.5-thresholded labels, accuracy and MSE.
if VERSION == "a":
    n_features = X_train_scaled.shape[1]

    # Logistic regression
    lr_model = LogisticRegression(random_state=42)
    lr_model.fit(X_train_scaled, y_train)
    lr_test_proba = lr_model.predict_proba(X_test_scaled)
    y_pred_lr_proba = lr_test_proba[:, 1]  # probability of class 1
    y_pred_lr = (y_pred_lr_proba > 0.5).astype(int)
    lr_accuracy = accuracy_score(y_test, y_pred_lr)
    lr_mse = mean_squared_error(y_test, y_pred_lr_proba)
    print(f"LR  Accuracy: {lr_accuracy:.4f} | MSE: {lr_mse:.6f}")

    # Gradient-boosted trees
    xgb_model = XGBClassifier(random_state=42, eval_metric="logloss")
    xgb_model.fit(X_train_scaled, y_train)
    xgb_test_proba = xgb_model.predict_proba(X_test_scaled)
    y_pred_xgb_proba = xgb_test_proba[:, 1]
    y_pred_xgb = (y_pred_xgb_proba > 0.5).astype(int)
    xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
    xgb_mse = mean_squared_error(y_test, y_pred_xgb_proba)
    print(f"XGB Accuracy: {xgb_accuracy:.4f} | MSE: {xgb_mse:.6f}")

    # Neural network: 7 ReLU units followed by a sigmoid output.
    ann = Sequential()
    ann.add(Dense(7, activation="relu", input_shape=(n_features,)))
    ann.add(Dense(1, activation="sigmoid"))
    ann.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    ann.fit(
        X_train_scaled,
        y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test_scaled, y_test),
        verbose=0,
    )
    y_pred_ann_proba = ann.predict(X_test_scaled, verbose=0).flatten()
    y_pred_ann = (y_pred_ann_proba > 0.5).astype(int)
    ann_accuracy = accuracy_score(y_test, y_pred_ann)
    ann_mse = mean_squared_error(y_test, y_pred_ann_proba)
    print(f"ANN Accuracy: {ann_accuracy:.4f} | MSE: {ann_mse:.6f}")


# ── VERSION b ─────────────────────────────────────────────────────────────
elif VERSION == "b":
    def print_score(true, pred, label=""):
        print(f"\n{label}\n" + "="*50)
        print(f"Accuracy: {accuracy_score(true, pred)*100:.2f}%")
        print(classification_report(true, pred))

    def plot_training(r):
        plt.figure(figsize=(12, 4))
        plt.subplot(1, 2, 1)
        plt.plot(r.history["loss"], label="Loss")
        plt.plot(r.history["val_loss"], label="Val Loss")
        plt.title("Loss"); plt.legend()
        plt.subplot(1, 2, 2)
        plt.plot(r.history["AUC"], label="AUC")
        plt.plot(r.history["val_AUC"], label="Val AUC")
        plt.title("AUC"); plt.legend()
        plt.tight_layout(); plt.show()

    def nn_model_b(num_cols, hidden_units, dropout_rates, lr):
        inp = Input(shape=(num_cols,))
        x = BatchNormalization()(inp)
        x = Dropout(dropout_rates[0])(x)
        for i, units in enumerate(hidden_units):
            x = Dense(units, activation="relu")(x)
            x = BatchNormalization()(x)
            x = Dropout(dropout_rates[i + 1])(x)
        x = Dense(1, activation="sigmoid")(x)
        m = Model(inputs=inp, outputs=x)
        m.compile(optimizer=Adam(lr), loss="binary_crossentropy", metrics=[AUC(name="AUC")])
        return m

    ann_b = nn_model_b(X_train_scaled.shape[1], [150, 150, 150], [0.1, 0, 0.1, 0], 1e-3)
    r = ann_b.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test),
                  epochs=7, batch_size=32, verbose=1)
    plot_training(r)
    print_score(y_train, ann_b.predict(X_train_scaled, verbose=0).round(), label="ANN Train")
    print_score(y_test,  ann_b.predict(X_test_scaled,  verbose=0).round(), label="ANN Test")
    scores_dict["ANNs"] = {
        "Train": roc_auc_score(y_train, ann_b.predict(X_train_scaled, verbose=0)),
        "Test":  roc_auc_score(y_test,  ann_b.predict(X_test_scaled,  verbose=0))
    }

    xgb_clf = XGBClassifier(eval_metric="logloss")
    xgb_clf.fit(X_train_scaled, y_train)
    print_score(y_train, xgb_clf.predict(X_train_scaled), label="XGBoost Train")
    print_score(y_test,  xgb_clf.predict(X_test_scaled),  label="XGBoost Test")
    scores_dict["XGBoost"] = {
        "Train": roc_auc_score(y_train, xgb_clf.predict(X_train_scaled)),
        "Test":  roc_auc_score(y_test,  xgb_clf.predict(X_test_scaled))
    }

    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_clf.fit(X_train_scaled, y_train)
    print_score(y_train, rf_clf.predict(X_train_scaled), label="Random Forest Train")
    print_score(y_test,  rf_clf.predict(X_test_scaled),  label="Random Forest Test")
    scores_dict["Random Forest"] = {
        "Train": roc_auc_score(y_train, rf_clf.predict(X_train_scaled)),
        "Test":  roc_auc_score(y_test,  rf_clf.predict(X_test_scaled))
    }


# ── VERSION c ─────────────────────────────────────────────────────────────
else:  # VERSION == "c"
    def build_model_c(input_dim):
        m = tf.keras.Sequential([
            tf.keras.layers.Dense(14, activation="tanh", input_shape=(input_dim,)),
            tf.keras.layers.Dense(7,  activation="tanh"),
            tf.keras.layers.Dense(1,  activation="sigmoid")
        ])
        m.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
        return m

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = {"accuracy": [], "macro_f1": [], "recall_class_1": [], "roc_auc": []}

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train)):
        X_cv_tr, X_cv_val = X_train_scaled[tr_idx], X_train_scaled[val_idx]
        y_cv_tr, y_cv_val = y_train.iloc[tr_idx],   y_train.iloc[val_idx]
        m = build_model_c(X_train_scaled.shape[1])
        m.fit(X_cv_tr, y_cv_tr, epochs=50, batch_size=32, verbose=0)
        proba = m.predict(X_cv_val, verbose=0).flatten()
        pred  = (proba > 0.5).astype(int)
        cv_scores["accuracy"].append(accuracy_score(y_cv_val, pred))
        cv_scores["macro_f1"].append(f1_score(y_cv_val, pred, average="macro"))
        cv_scores["recall_class_1"].append(recall_score(y_cv_val, pred, pos_label=1))
        cv_scores["roc_auc"].append(roc_auc_score(y_cv_val, proba))
        print(f"Fold {fold+1} ROC-AUC: {cv_scores["roc_auc"][-1]:.4f}")

    final_model = build_model_c(X_train_scaled.shape[1])
    final_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)
    y_pred_proba = final_model.predict(X_test_scaled, verbose=0).flatten()
    y_pred = (y_pred_proba > 0.5).astype(int)
    print("Final model trained.")


## Evaluation

In [None]:
# ── VERSION a ─────────────────────────────────────────────────────────────
# ROC comparison of the three models, an MSE table, and a confusion matrix
# plus classification report per model on the test split.
if VERSION == "a":
    fpr_lr,  tpr_lr,  _ = roc_curve(y_test, y_pred_lr_proba)
    fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb_proba)
    fpr_ann, tpr_ann, _ = roc_curve(y_test, y_pred_ann_proba)

    auc_lr = auc(fpr_lr, tpr_lr)
    auc_xgb = auc(fpr_xgb, tpr_xgb)
    auc_ann = auc(fpr_ann, tpr_ann)

    # All ROC curves share one axes; the dashed diagonal is the chance line.
    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr_lr,  tpr_lr,  label=f"LR  (AUC={auc_lr:.4f})")
    roc_ax.plot(fpr_xgb, tpr_xgb, label=f"XGB (AUC={auc_xgb:.4f})")
    roc_ax.plot(fpr_ann, tpr_ann, label=f"ANN (AUC={auc_ann:.4f})")
    roc_ax.plot([0, 1], [0, 1], "k--")
    roc_ax.set_xlabel("FPR")
    roc_ax.set_ylabel("TPR")
    roc_ax.set_title("ROC Curves")
    roc_ax.legend()
    plt.show()

    mse_table = pd.DataFrame({"Model": ["ANN", "LR"], "MSE": [ann_mse, lr_mse]})
    print(mse_table.to_string(index=False))

    class_labels = ["Fully Paid", "Charged Off"]
    model_outputs = [
        ("LR", y_pred_lr, y_pred_lr_proba),
        ("XGBoost", y_pred_xgb, y_pred_xgb_proba),
        ("ANN", y_pred_ann, y_pred_ann_proba),
    ]
    for name, y_pred, y_proba in model_outputs:
        cm = confusion_matrix(y_test, y_pred)
        cm_fig, cm_ax = plt.subplots(figsize=(4, 3))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=cm_ax,
                    xticklabels=class_labels,
                    yticklabels=class_labels)
        cm_ax.set_title(f"Confusion Matrix — {name}")
        cm_ax.set_xlabel("Predicted")
        cm_ax.set_ylabel("Actual")
        plt.show()
        print(f"\nClassification Report — {name}:")
        print(classification_report(y_test, y_pred, zero_division=0))


# ── VERSION b ─────────────────────────────────────────────────────────────
elif VERSION == "b":
    scores_df = pd.DataFrame(scores_dict)
    print("\nROC AUC Summary (Train / Test):")
    print(scores_df.to_string())

    scores_df.T.plot(kind="bar", figsize=(8, 5))
    plt.title("ROC AUC — Train vs Test")
    plt.xlabel("Model"); plt.ylabel("ROC AUC")
    plt.xticks(rotation=30); plt.legend(title="Dataset")
    plt.tight_layout(); plt.show()

    for name, clf in [("XGBoost", xgb_clf), ("Random Forest", rf_clf)]:
        ConfusionMatrixDisplay.from_estimator(
            clf, X_test_scaled, y_test, cmap="Blues",
            display_labels=["Charged Off", "Fully Paid"]
        )
        plt.title(f"{name} — Confusion Matrix"); plt.show()

    disp = RocCurveDisplay.from_estimator(xgb_clf, X_test_scaled, y_test, name="XGBoost")
    RocCurveDisplay.from_estimator(rf_clf, X_test_scaled, y_test, ax=disp.ax_, name="Random Forest")
    plt.title("ROC Curves"); plt.show()


# ── VERSION c ─────────────────────────────────────────────────────────────
else:  # VERSION == "c"
    print("Cross-Validation Results:")
    for metric, scores in cv_scores.items():
        print(f"  {metric:20s}: {np.mean(scores):.4f} ± {np.std(scores):.4f}")

    print("\nTest Set Results:")
    print(classification_report(y_test, y_pred, target_names=["Fully Paid", "Charged Off"]))

    fig, axes = plt.subplots(2, 2, figsize=(14, 12))

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    axes[0, 0].plot(fpr, tpr, label=f"AUC={auc(fpr, tpr):.4f}", linewidth=2)
    axes[0, 0].plot([0,1],[0,1],"k--", label="Random")
    axes[0, 0].set_xlabel("FPR"); axes[0, 0].set_ylabel("TPR")
    axes[0, 0].set_title("ROC Curve"); axes[0, 0].legend(); axes[0, 0].grid(alpha=0.3)

    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    no_skill = float(y_test.mean())
    axes[0, 1].plot([0,1],[no_skill,no_skill],"--",label="Baseline")
    axes[0, 1].plot(recall, precision, label="Model", linewidth=2)
    axes[0, 1].set_xlabel("Recall"); axes[0, 1].set_ylabel("Precision")
    axes[0, 1].set_title("Precision-Recall Curve"); axes[0, 1].legend(); axes[0, 1].grid(alpha=0.3)

    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes[1, 0],
                xticklabels=["Fully Paid","Charged Off"],
                yticklabels=["Fully Paid","Charged Off"])
    axes[1, 0].set_xlabel("Predicted"); axes[1, 0].set_ylabel("Actual")
    axes[1, 0].set_title("Confusion Matrix")

    thresholds = np.linspace(0, 1, 100)
    p_scores, r_scores, f1_t, mf1_t = [], [], [], []
    for t in thresholds:
        yp = (y_pred_proba > t).astype(int)
        p_scores.append(precision_score(y_test, yp, zero_division=0))
        r_scores.append(recall_score(y_test, yp, zero_division=0))
        f1_t.append(f1_score(y_test, yp, zero_division=0))
        mf1_t.append(f1_score(y_test, yp, average="macro", zero_division=0))
    axes[1, 1].plot(thresholds, p_scores, label="Precision", linewidth=2)
    axes[1, 1].plot(thresholds, r_scores, label="Recall",    linewidth=2)
    axes[1, 1].plot(thresholds, f1_t,     label="F1",        linewidth=2)
    axes[1, 1].plot(thresholds, mf1_t,    label="Macro-F1",  linewidth=2)
    axes[1, 1].axvline(0.5, color="grey", linestyle="--", alpha=0.7)
    axes[1, 1].set_xlabel("Threshold"); axes[1, 1].set_ylabel("Score")
    axes[1, 1].set_title("Threshold Analysis"); axes[1, 1].legend(); axes[1, 1].grid(alpha=0.3)

    plt.tight_layout(); plt.show()
