# Credit Default Modeling (Utility-Focused, Multi-Seed)
Utility-based comparison of Logistic Regression, Decision Tree, SVM (RBF), Gaussian Naive Bayes, and k-NN on the UCI credit card default data.

## 1. Imports
Load analysis, modeling, and plotting libraries. `xlrd` is pulled in for the Excel source file.

In [None]:
%pip install -q xlrd seaborn scikit-learn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
)

sns.set_theme(style="whitegrid")
pd.set_option("display.max_columns", 40)


## 2. Load the data
Read the Excel file, rename the target column to `TARGET`, and preview the dataset.

In [None]:
# Locate the workbook: prefer the current working directory and fall back to
# the project sub-folder when the notebook is launched from the repo root.
DATA_FILE = Path("default of credit card clients.xls")
if not DATA_FILE.exists():
    DATA_FILE = Path("Code", "Final Project", DATA_FILE.name)
    if not DATA_FILE.exists():
        raise FileNotFoundError(f"Could not find data file at {DATA_FILE}")

# header=1 takes the column names from the second row of the sheet; the long
# target column name is shortened to TARGET for the rest of the notebook.
raw_df = pd.read_excel(DATA_FILE, header=1)
raw_df = raw_df.rename(columns={"default payment next month": "TARGET"})

print(f"Loaded {raw_df.shape[0]:,} rows and {raw_df.shape[1]} columns.")
raw_df.head()


## 3. Split features and target
Separate predictors (`X`) from the binary target (`y`).

In [None]:
# Separate predictors from the binary target.
# The UCI workbook is documented as carrying a sequential "ID" column that
# only numbers the rows (NOTE(review): confirm against the loaded columns).
# A row identifier is not a borrower attribute, so it must not be used as a
# predictor; errors="ignore" keeps this cell working if the column is absent.
X = raw_df.drop(columns="TARGET").drop(columns="ID", errors="ignore")
y = raw_df["TARGET"].astype(int)

print(f"Feature matrix shape: {X.shape}")
print(f"Target breakdown: {y.value_counts().to_dict()}")
display(X.head(), y.head())


## 4. Reusable splits and scalers (shared across seeds)
Build stratified train/validation/test splits (60/20/20) for each seed, with a `StandardScaler` fitted on that seed's training subset only. All models reuse these splits so their results stay comparable across seeds.

In [None]:
# Seeds for the outer train/test split; one split bundle is built per seed.
SEED_PLAN = [2025, 0, 1033]
# Default seed for the inner train/validation split, shared by every outer seed.
VAL_SEED = 0
# The first planned seed is the one used for single-seed plots and baselines.
BASELINE_SEED = SEED_PLAN[0]


def build_split_bundle(split_seed, val_seed=VAL_SEED):
    """Create the train/validation/test partition and scaler for one seed.

    The module-level ``X`` and ``y`` are first split 80/20 into a training
    part and a test fold (stratified on ``y``, seeded by ``split_seed``).
    The training part is then split 75/25 into a fitting subset and a
    validation fold (stratified, seeded by ``val_seed``).  A
    ``StandardScaler`` is fitted on the fitting subset only and applied to
    all three parts.

    Returns:
        dict with the seed, the unscaled feature frames, the targets, the
        fitted scaler, and the scaled feature arrays (``*_scaled`` keys).
    """
    outer_parts = train_test_split(
        X, y, test_size=0.2, random_state=split_seed, stratify=y
    )
    X_train, X_test, y_train, y_test = outer_parts

    inner_parts = train_test_split(
        X_train, y_train, test_size=0.25, random_state=val_seed, stratify=y_train
    )
    X_train_sub, X_val, y_train_sub, y_val = inner_parts

    # Fit on the training subset only so no validation/test statistics leak in.
    scaler = StandardScaler().fit(X_train_sub)

    bundle = {
        "seed": split_seed,
        "X_train_sub": X_train_sub,
        "X_val": X_val,
        "X_test": X_test,
        "y_train_sub": y_train_sub,
        "y_val": y_val,
        "y_test": y_test,
        "scaler": scaler,
    }
    bundle["X_train_sub_scaled"] = scaler.transform(X_train_sub)
    bundle["X_val_scaled"] = scaler.transform(X_val)
    bundle["X_test_scaled"] = scaler.transform(X_test)
    return bundle


# One split bundle per planned seed, keyed by the seed itself.
split_store = dict(
    (seed, build_split_bundle(seed, val_seed=VAL_SEED)) for seed in SEED_PLAN
)
baseline_split = split_store[BASELINE_SEED]

# Show the partition sizes of the baseline seed as a quick sanity check.
display(
    {
        "seed": BASELINE_SEED,
        **{
            label: baseline_split[key].shape
            for label, key in (
                ("train_sub", "X_train_sub"),
                ("val", "X_val"),
                ("test", "X_test"),
            )
        },
    }
)


## 5. Business assumptions and cost matrix
Derive monetary gains and losses from the assumptions taken from the dataset's data-dictionary PDF, so that models are evaluated on profit rather than on accuracy alone.

### 5.1 Observation window and APR assumptions
Compute profit from approving a good customer and loss from approving a bad customer over a six-month window.

In [None]:
# Partition customers by observed outcome to get outcome-specific credit limits.
df_default = raw_df.loc[raw_df["TARGET"] == 1]     # will default
df_no_default = raw_df.loc[raw_df["TARGET"] == 0]  # will not default

mean_limit_default, mean_limit_no_default = (
    frame["LIMIT_BAL"].mean() for frame in (df_default, df_no_default)
)

assumption_config = dict(
    annual_apr=0.18,          # 18% annual percentage rate from the data dictionary PDF
    periods_per_year=12,      # monthly compounding
    observation_months=6,     # six billing cycles (Apr-Sep 2005 in the PDF)
    loss_given_default=0.5,   # lose 50% of the limit if they default
)

annual_apr, periods_per_year, observation_months, loss_given_default = (
    assumption_config[key]
    for key in (
        "annual_apr",
        "periods_per_year",
        "observation_months",
        "loss_given_default",
    )
)

# Convert the annual rate and the observation window into compounding periods.
periodic_rate = annual_apr / periods_per_year
period_length_months = 12 / periods_per_year
periods_in_window = observation_months / period_length_months

# Good customer: compound interest earned on the mean limit over the window.
# Bad customer: a fixed fraction of the mean limit is written off.
profit_good = mean_limit_no_default * ((1 + periodic_rate) ** periods_in_window - 1)
loss_bad = mean_limit_default * loss_given_default

print(
    f"Periodic rate: {periodic_rate:.2%}",
    f"Approx profit per good customer: {profit_good:,.2f}",
    f"Approx loss per bad customer:    {loss_bad:,.2f}",
    sep="\n",
)


### 5.2 Cost/benefit matrix
Map actual/predicted outcomes to dollar values for the utility function.

In [None]:
# Class 0 = no default (prediction 0 means "approve"); class 1 = default
# (prediction 1 means "reject").
value_TN = profit_good  # Actual 0, Pred 0: good customer, approved -> earn interest
value_FP = 0.0          # Actual 0, Pred 1: good customer, rejected -> forego profit
value_FN = -loss_bad    # Actual 1, Pred 0: bad customer, approved -> lose money
value_TP = 0.0          # Actual 1, Pred 1: bad customer, rejected -> avoided loss

# Rows are the actual class, columns the predicted class.
value_matrix = pd.DataFrame(
    [
        [value_TN, value_FP],  # Actual 0
        [value_FN, value_TP],  # Actual 1
    ],
    index=pd.Index([0, 1], name="Actual"),
    columns=pd.Index([0, 1], name="Predicted"),
)
value_matrix


## 6. Utility and evaluation helpers
Shared helpers for threshold sweeps, utility calculation, and concise metric summaries.

In [None]:
# Score quantiles (0.05 to 0.95 in 19 evenly spaced steps) that sweep_thresholds
# turns into candidate decision thresholds when no explicit grid is supplied.
THRESH_QUANTILES = np.linspace(0.05, 0.95, 19)

def score_predictions(model, features):
    """Return one score per row of ``features`` from a fitted model.

    The first available source is used, in this order: column 1 of
    ``predict_proba``, the raw ``decision_function`` output, and finally
    the ``predict`` output.
    """
    scorers = (
        ("predict_proba", lambda output: output[:, 1]),
        ("decision_function", lambda output: output),
    )
    for method_name, select in scorers:
        if hasattr(model, method_name):
            return select(getattr(model, method_name)(features))
    return model.predict(features)

def utility_from_predictions(y_true, y_pred, values=value_matrix):
    """Convert binary predictions into total and per-case utility.

    Args:
        y_true: actual labels (0 or 1).
        y_pred: predicted labels (0 or 1).
        values: table indexed as ``values.loc[actual, predicted]`` giving the
            value of a single case in that confusion-matrix cell.

    Returns:
        ``(utility_total, utility_per_case, counts)`` where ``counts`` is a
        dict holding the ``tn``, ``fp``, ``fn`` and ``tp`` cell counts.
    """
    cell_counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cell_counts.ravel()

    # Weight every confusion-matrix cell by its per-case value.
    value_tn = tn * values.loc[0, 0]
    value_fp = fp * values.loc[0, 1]
    value_fn = fn * values.loc[1, 0]
    value_tp = tp * values.loc[1, 1]
    utility_total = value_tn + value_fp + value_fn + value_tp

    utility_per_case = utility_total / len(y_true)
    return utility_total, utility_per_case, dict(tn=tn, fp=fp, fn=fn, tp=tp)

def _utility(y_true, y_pred):
    """Score predictions with the notebook-wide ``value_matrix``."""
    return utility_from_predictions(y_true, y_pred, values=value_matrix)

def sweep_thresholds(y_true, scores, utility_fn, grid=None):
    """Pick the decision threshold that maximizes total utility.

    Each candidate threshold ``t`` yields predictions ``scores >= t``
    (1 when the score reaches the threshold, else 0), which are passed to
    ``utility_fn``.

    Args:
        y_true: actual labels, forwarded unchanged to ``utility_fn``.
        scores: array-like of model scores, one per case.
        utility_fn: callable ``(y_true, y_pred)`` returning a 3-tuple whose
            first two items are total utility and utility per case.
        grid: optional candidate threshold(s). When ``None`` the unique
            ``THRESH_QUANTILES`` quantiles of ``scores`` are used.

    Returns:
        dict with the best ``threshold``, its ``val_utility`` and
        ``val_utility_per_case``, and ``sweep``, a DataFrame with one row
        per candidate. Ties go to the earliest candidate in the grid.

    Raises:
        ValueError: if there is no candidate threshold to evaluate.
    """
    # Coerce to arrays so plain lists (or a single scalar threshold) behave
    # the same as NumPy inputs in the element-wise comparison below.
    scores = np.asarray(scores)
    if grid is None:
        grid = np.unique(np.quantile(scores, THRESH_QUANTILES))
    else:
        grid = np.atleast_1d(grid)
    if grid.size == 0:
        raise ValueError("sweep_thresholds needs at least one candidate threshold")

    records = []
    for t in grid:
        preds = (scores >= t).astype(int)
        utility_total, utility_pc, _ = utility_fn(y_true, preds)
        records.append(
            {
                "threshold": float(t),
                "utility": utility_total,
                "utility_per_case": utility_pc,
            }
        )

    # max() returns the first maximal record, so ties favor the earlier threshold.
    best = max(records, key=lambda r: r["utility"])
    return {
        "threshold": best["threshold"],
        "val_utility": best["utility"],
        "val_utility_per_case": best["utility_per_case"],
        "sweep": pd.DataFrame(records),
    }

def summarize_model(
    model_name,
    dataset_name,
    y_true,
    y_pred,
    y_score,
    threshold,
    seed,
    utility_fn=_utility,
):
    """Build a one-row DataFrame of metrics for a single model evaluation.

    Args:
        model_name: label stored in the ``model_name`` column.
        dataset_name: label stored in the ``dataset_name`` column.
        y_true: actual labels.
        y_pred: predicted labels (0 or 1).
        y_score: continuous scores used for ROC AUC, or ``None`` to record
            NaN for that metric.
        threshold: decision threshold recorded alongside the metrics.
        seed: split seed recorded alongside the metrics.
        utility_fn: callable ``(y_true, y_pred)`` whose first two return
            values are total utility and utility per case.

    Returns:
        A single-row DataFrame with identification fields, classification
        metrics, utility figures, and confusion-matrix counts.
    """
    cell_counts = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cell_counts.ravel()
    utility_total, utility_pc, _ = utility_fn(y_true, y_pred)

    if y_score is None:
        roc_auc = np.nan
    else:
        roc_auc = roc_auc_score(y_true, y_score)

    row = {
        "model_name": model_name,
        "dataset_name": dataset_name,
        "seed": seed,
        "threshold": threshold,
        "roc_auc": roc_auc,
        "accuracy": accuracy_score(y_true, y_pred),
    }
    # zero_division=0 reports 0 when a metric's denominator is empty
    # (e.g. no positive predictions) instead of emitting a warning.
    for metric_name, metric_fn in (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    ):
        row[metric_name] = metric_fn(y_true, y_pred, zero_division=0)
    row.update(
        utility_total=utility_total,
        utility_per_case=utility_pc,
        tp=tp,
        fp=fp,
        tn=tn,
        fn=fn,
    )
    return pd.DataFrame([row])

def run_model_for_seed(
    model,
    model_name,
    split_seed,
    use_scaled=True,
    threshold_grid=None,
):
    """Fit ``model`` on one seed's split, tune its threshold, and test it.

    The model is fitted in place on the training subset, the decision
    threshold is chosen on the validation fold by maximizing utility, and
    that threshold is then applied to the test fold.

    Args:
        model: unfitted estimator; it is fitted by this call.
        model_name: label used in the summary row.
        split_seed: key into ``split_store`` selecting the split bundle.
        use_scaled: use the standardized features when truthy, otherwise
            the unscaled ones.
        threshold_grid: optional explicit thresholds passed to
            ``sweep_thresholds``.

    Returns:
        dict with the seed, model name, fitted model, chosen threshold,
        validation sweep table and utility, one-row test summary, and the
        test scores and predictions.
    """
    split = split_store[split_seed]

    feature_keys = ("X_train_sub", "X_val", "X_test")
    if use_scaled:
        feature_keys = tuple(f"{key}_scaled" for key in feature_keys)
    X_train, X_val, X_test = (split[key] for key in feature_keys)
    y_train, y_val, y_test = (
        split[key] for key in ("y_train_sub", "y_val", "y_test")
    )

    model.fit(X_train, y_train)

    # Tune the threshold on validation data only.
    sweep = sweep_thresholds(
        y_val, score_predictions(model, X_val), _utility, grid=threshold_grid
    )
    chosen_threshold = sweep["threshold"]

    test_scores = score_predictions(model, X_test)
    test_pred = (test_scores >= chosen_threshold).astype(int)

    summary = summarize_model(
        model_name=model_name,
        dataset_name=f"Test (seed={split_seed})",
        y_true=y_test,
        y_pred=test_pred,
        y_score=test_scores,
        threshold=chosen_threshold,
        seed=split_seed,
    )

    return {
        "seed": split_seed,
        "model_name": model_name,
        "model": model,
        "threshold": chosen_threshold,
        "val_sweep": sweep["sweep"],
        "val_utility": sweep["val_utility"],
        "summary": summary,
        "test_scores": test_scores,
        "test_pred": test_pred,
    }

def run_knn_for_seed(split_seed, k_grid=(5, 15, 25, 35), threshold_grid=None):
    """Tune and evaluate a distance-weighted k-NN classifier for one seed.

    For every ``k`` in ``k_grid`` a ``KNeighborsClassifier`` is fitted on the
    scaled training subset and its decision threshold is tuned on the scaled
    validation fold. The ``k`` with the highest validation utility is kept
    (ties keep the earlier ``k``) and evaluated on the scaled test fold.

    Args:
        split_seed: key into ``split_store`` selecting the split bundle.
        k_grid: iterable of candidate neighbor counts.
        threshold_grid: optional explicit thresholds passed to
            ``sweep_thresholds``.

    Returns:
        dict with the seed, model name, best fitted model, best ``k``,
        chosen threshold, validation sweep table and utility, one-row test
        summary, and the test scores and predictions.

    Raises:
        ValueError: if ``k_grid`` is empty.
    """
    # Materialize once so generators work, and fail early with a clear message
    # instead of a "'NoneType' object is not subscriptable" error further down.
    k_values = tuple(k_grid)
    if not k_values:
        raise ValueError("k_grid must contain at least one candidate value of k")

    split = split_store[split_seed]
    X_train = split["X_train_sub_scaled"]
    X_val = split["X_val_scaled"]
    X_test = split["X_test_scaled"]
    y_train = split["y_train_sub"]
    y_val = split["y_val"]
    y_test = split["y_test"]

    best = None
    for k in k_values:
        model = KNeighborsClassifier(n_neighbors=k, weights="distance")
        model.fit(X_train, y_train)
        val_scores = score_predictions(model, X_val)
        sweep = sweep_thresholds(y_val, val_scores, _utility, grid=threshold_grid)

        # Strict ">" keeps the first k that reaches the best validation utility.
        if best is None or sweep["val_utility"] > best["val_utility"]:
            best = {
                "k": k,
                "model": model,
                "sweep": sweep,
                "val_utility": sweep["val_utility"],
            }

    best_model = best["model"]
    test_scores = score_predictions(best_model, X_test)
    test_pred = (test_scores >= best["sweep"]["threshold"]).astype(int)

    summary = summarize_model(
        model_name="k-NN",
        dataset_name=f"Test (seed={split_seed})",
        y_true=y_test,
        y_pred=test_pred,
        y_score=test_scores,
        threshold=best["sweep"]["threshold"],
        seed=split_seed,
    )

    return {
        "seed": split_seed,
        "model_name": "k-NN",
        "model": best_model,
        "k": best["k"],
        "threshold": best["sweep"]["threshold"],
        "val_sweep": best["sweep"]["sweep"],
        "val_utility": best["val_utility"],
        "summary": summary,
        "test_scores": test_scores,
        "test_pred": test_pred,
    }


## 7. Models trained across seeds
Train each model on the shared splits, tune thresholds on the validation fold, and evaluate on the test fold for all seeds.

### 7.1 Logistic Regression

In [None]:
# Hyper-parameters shared by every seed; only random_state varies per run.
logreg_params = dict(penalty="l2", solver="liblinear", max_iter=300)

logreg_runs = [
    run_model_for_seed(
        LogisticRegression(random_state=seed, **logreg_params),
        "Logistic Regression",
        seed,
        use_scaled=True,
    )
    for seed in SEED_PLAN
]

# Stack the one-row test summaries into a per-seed table.
logreg_summary_df = pd.concat(
    [run["summary"] for run in logreg_runs], ignore_index=True
)
logreg_summary_df


### 7.2 Decision Tree

In [None]:
# Hyper-parameters shared by every seed; only random_state varies per run.
tree_params = dict(min_samples_leaf=5, class_weight="balanced")

# The tree is trained on the unscaled features (use_scaled=False).
dt_runs = [
    run_model_for_seed(
        DecisionTreeClassifier(random_state=seed, **tree_params),
        "Decision Tree",
        seed,
        use_scaled=False,
    )
    for seed in SEED_PLAN
]

dt_summary_df = pd.concat([run["summary"] for run in dt_runs], ignore_index=True)
dt_summary_df


### 7.3 SVM (RBF)

In [None]:
# Hyper-parameters shared by every seed; only random_state varies per run.
# probability=False is passed, so no predict_proba is expected here and the
# decision_function branch of score_predictions is the one intended to run.
svm_params = dict(
    kernel="rbf",
    C=1.0,
    gamma="scale",
    probability=False,
    class_weight="balanced",
)

svm_runs = [
    run_model_for_seed(
        SVC(random_state=seed, **svm_params),
        "SVM (RBF)",
        seed,
        use_scaled=True,
    )
    for seed in SEED_PLAN
]

svm_summary_df = pd.concat([run["summary"] for run in svm_runs], ignore_index=True)
svm_summary_df


### 7.4 Naive Bayes (Gaussian)

In [None]:
# GaussianNB is built with its defaults; a fresh estimator is created per seed.
nb_runs = [
    run_model_for_seed(GaussianNB(), "Naive Bayes", seed, use_scaled=True)
    for seed in SEED_PLAN
]

nb_summary_df = pd.concat([run["summary"] for run in nb_runs], ignore_index=True)
nb_summary_df


### 7.5 k-NN

In [None]:
# Tune k per seed over a slightly wider grid than the function default.
knn_runs = [
    run_knn_for_seed(seed, k_grid=(5, 15, 25, 35, 50)) for seed in SEED_PLAN
]

knn_summary_df = pd.concat([run["summary"] for run in knn_runs], ignore_index=True)

# Overview of the tuning outcome: output column -> key in each run dict.
pd.DataFrame(
    {
        column: [run[key] for run in knn_runs]
        for column, key in (
            ("seed", "seed"),
            ("best_k", "k"),
            ("threshold", "threshold"),
            ("val_utility", "val_utility"),
        )
    }
)


In [None]:
# Per-model run lists, keyed by display name (insertion order fixes plot order).
model_runs = dict(
    zip(
        ("Logistic Regression", "Decision Tree", "SVM (RBF)", "Naive Bayes", "k-NN"),
        (logreg_runs, dt_runs, svm_runs, nb_runs, knn_runs),
    )
)

# Matching per-seed summary tables under the same keys and in the same order.
model_summary_frames = dict(
    zip(
        model_runs,
        (
            logreg_summary_df,
            dt_summary_df,
            svm_summary_df,
            nb_summary_df,
            knn_summary_df,
        ),
    )
)

combined_model_summaries = pd.concat(
    list(model_summary_frames.values()), ignore_index=True
)
combined_model_summaries


## 8. Multi-model visualizations
Seed-level utility bars, ROC curves (baseline seed), and a confusion matrix for the top average-utility model.

In [None]:
# Work on a copy with integer seeds so the x-axis shows clean category labels.
plot_data = combined_model_summaries.assign(
    seed=lambda frame: frame["seed"].astype(int)
)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=plot_data, x="seed", y="utility_per_case", hue="model_name", ax=ax)
ax.set(
    title="Utility per case by model and seed",
    xlabel="Seed",
    ylabel="Utility per case",
)
# Park the legend outside the axes so it does not cover the bars.
ax.legend(bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()


In [None]:
baseline_y_true = baseline_split["y_test"].to_numpy()

fig, ax = plt.subplots(figsize=(8, 6))
for model_label, seed_runs in model_runs.items():
    # Only the baseline-seed run of each model is drawn.
    run_at_baseline = next(
        run for run in seed_runs if run["seed"] == BASELINE_SEED
    )
    false_pos_rate, true_pos_rate, _ = roc_curve(
        baseline_y_true, run_at_baseline["test_scores"]
    )
    auc_score = run_at_baseline["summary"]["roc_auc"].iloc[0]
    ax.plot(
        false_pos_rate,
        true_pos_rate,
        label=f"{model_label} (AUC={auc_score:.3f})",
    )

# Diagonal reference line for a no-skill classifier.
ax.plot([0, 1], [0, 1], "k--", linewidth=1)
ax.set(
    title="ROC curves (baseline seed)",
    xlabel="False positive rate",
    ylabel="True positive rate",
)
ax.legend(loc="lower right")
plt.tight_layout()


In [None]:
# Rank models by mean utility per case across seeds; the first entry wins.
avg_util = (
    combined_model_summaries.groupby("model_name")["utility_per_case"]
    .mean()
    .sort_values(ascending=False)
)
top_model_name = avg_util.index[0]
top_run = next(
    run for run in model_runs[top_model_name] if run["seed"] == BASELINE_SEED
)

cm = confusion_matrix(baseline_y_true, top_run["test_pred"], labels=[0, 1])

fig, ax = plt.subplots(figsize=(4.5, 4))
heatmap_style = dict(annot=True, fmt="d", cmap="Blues", cbar=False)
sns.heatmap(
    cm,
    ax=ax,
    # Prediction 0 corresponds to approving, prediction 1 to rejecting.
    xticklabels=["Approve", "Reject"],
    yticklabels=["Actual 0", "Actual 1"],
    **heatmap_style,
)
ax.set_title(f"Confusion matrix (baseline seed, {top_model_name})")
ax.set(xlabel="Prediction", ylabel="Actual")
plt.tight_layout()


## 9. Summary table (all seeds)
Per-seed metrics for every model, plus three naive baselines (random, approve all, reject all) evaluated on the baseline seed's test split only.

In [None]:
baseline_true = baseline_split["y_test"].to_numpy()


def baseline_summary(name, y_pred):
    """Summarize a naive prediction vector against the baseline-seed test labels.

    No scores, threshold, or seed apply to these baselines, so ROC AUC,
    threshold, and seed are recorded as NaN.
    """
    fixed_fields = dict(
        dataset_name="Test (baseline seed)",
        y_true=baseline_true,
        y_score=None,
        threshold=np.nan,
        seed=np.nan,
    )
    return summarize_model(model_name=name, y_pred=y_pred, **fixed_fields)

# Fixed generator seed so the random baseline is reproducible.
rng = np.random.default_rng(0)

baseline_random = baseline_summary(
    "Baseline - Random", rng.integers(0, 2, size=len(baseline_true))
)
# Predicting 0 everywhere approves every applicant; 1 everywhere rejects all.
baseline_approve = baseline_summary(
    "Baseline - Approve all", np.full_like(baseline_true, 0)
)
baseline_reject = baseline_summary(
    "Baseline - Reject all", np.full_like(baseline_true, 1)
)

baseline_df = pd.concat(
    [baseline_random, baseline_approve, baseline_reject], ignore_index=True
)

summary_table = pd.concat([baseline_df, combined_model_summaries], ignore_index=True)
summary_table


## 10. Model comparison summary (baselines + averages)
Average the seed-level rows to compare overall performance alongside the baselines.

In [None]:
def average_rows(df, label):
    """Collapse the per-seed rows of one model into a single averaged row.

    Every numeric column except ``seed`` is averaged. The result keeps the
    column order of ``df`` without ``seed``, with ``model_name`` set to
    ``"<label> (avg seeds)"`` and ``dataset_name`` to ``"Test (avg seeds)"``.
    """
    kept_columns = [col for col in df.columns if col != "seed"]
    averaged_columns = [
        col for col in kept_columns if pd.api.types.is_numeric_dtype(df[col])
    ]

    # mean() gives a Series; to_frame().T turns it into a one-row frame.
    averaged = (
        df[averaged_columns]
        .mean()
        .to_frame()
        .T.assign(
            model_name=f"{label} (avg seeds)",
            dataset_name="Test (avg seeds)",
        )
    )
    return averaged.reindex(columns=kept_columns)

# One seed-averaged row per model, in the same order as the model sections.
avg_rows = [
    average_rows(seed_frame, label)
    for label, seed_frame in (
        ("Logistic Regression", logreg_summary_df),
        ("Decision Tree", dt_summary_df),
        ("SVM (RBF)", svm_summary_df),
        ("Naive Bayes", nb_summary_df),
        ("k-NN", knn_summary_df),
    )
]

# Baselines first, then the averaged model rows.
comparison_df = pd.concat([baseline_df, *avg_rows], ignore_index=True)
comparison_df
