# KNN for Loan Default Predictions

## 1. Import libraries

Install xlrd for Excel support and pull in the analysis, plotting, and k-NN libraries used throughout.

In [1]:
%pip install xlrd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

Note: you may need to restart the kernel to use updated packages.


## 2. Load the data

Load the credit default dataset from Excel, rename the target column to TARGET, and preview the data.

In [2]:
DATA_FILE = "default of credit card clients.xls"

raw_df = (
    pd.read_excel(DATA_FILE, header=1)
    .rename(columns={"default payment next month": "TARGET"})
)

raw_df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,TARGET
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


## 3. Prepare the data

Split the dataframe into feature columns (X) and the TARGET labels (y), then display both.

In [3]:
X = raw_df.drop(columns = 'TARGET')

y = raw_df['TARGET']

display(X,y)

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,1,20000,2,2,1,24,2,2,-1,-1,...,689,0,0,0,0,689,0,0,0,0
1,2,120000,2,2,2,26,-1,2,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,3,90000,2,2,2,34,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,4,50000,2,2,1,37,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,5,50000,1,2,1,57,-1,0,-1,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000,1,3,1,39,0,0,0,0,...,208365,88004,31237,15980,8500,20000,5003,3047,5000,1000
29996,29997,150000,1,3,2,43,-1,-1,-1,-1,...,3502,8979,5190,0,1837,3526,8998,129,0,0
29997,29998,30000,1,2,2,37,4,3,2,-1,...,2758,20878,20582,19357,0,0,22000,4200,2000,3100
29998,29999,80000,1,3,1,41,1,-1,0,0,...,76304,52774,11855,48944,85900,3409,1178,1926,52964,1804


0        1
1        1
2        0
3        0
4        0
        ..
29995    0
29996    0
29997    1
29998    1
29999    1
Name: TARGET, Length: 30000, dtype: int64

## 4. Split the data

Create an 80/20 train-test split with a fixed seed, then carve out a validation fold from the training data for threshold tuning. Even with grid search, we keep a held-out test fold to avoid leakage.

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2025,
    stratify=y,
)  # 20% of data is test here

X_train_sub, X_val, y_train_sub, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=0,
    stratify=y_train,
)  # carve out a validation fold (20% of original data) for threshold tuning

display(X_train_sub.shape, X_val.shape, X_test.shape, y_train_sub.shape, y_val.shape, y_test.shape)


(18000, 24)

(6000, 24)

(6000, 24)

(18000,)

(6000,)

(6000,)

## 5. Scale the features

Standardize features using training-fold statistics and confirm the means and standard deviations of the scaled sets.

In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()                           # define the scaler

scaler.fit(X_train_sub)                             # train the scaler on training fold only
X_train_scaled = scaler.transform(X_train_sub)      # apply the scaler to transform the training data
X_val_scaled = scaler.transform(X_val)              # transform validation data
X_test_scaled = scaler.transform(X_test)            # apply the scaler to transform test data

dataframe_scaled = pd.DataFrame(np.round(X_train_scaled, 2), columns=X.columns)
dataframe_scaled
dataframe_scaled.describe()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
count,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,...,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0,18000.0
mean,1e-05,0.000136,-0.000319,-0.001118,0.000288,-8.4e-05,-0.000151,-0.000464,0.000104,0.001522,...,0.000309,8.3e-05,0.000273,0.000589,0.000641,-0.000301,9.9e-05,-0.000791,0.000955,-2.2e-05
std,1.000009,1.000311,1.002294,0.999471,0.998925,0.999967,0.999705,0.999906,1.001985,1.000076,...,0.999838,0.999994,0.999914,0.999636,0.999823,1.000181,0.999996,1.000205,0.999749,1.000021
min,-1.74,-1.22,-1.24,-2.34,-2.96,-1.57,-1.77,-1.56,-1.54,-1.53,...,-1.55,-3.31,-2.0,-2.24,-0.36,-0.25,-0.32,-0.32,-0.31,-0.3
25%,-0.87,-0.91,-1.24,-1.08,-1.05,-0.82,-0.87,-0.73,-0.7,-0.67,...,-0.63,-0.63,-0.63,-0.63,-0.3,-0.21,-0.3,-0.3,-0.3,-0.29
50%,0.0,-0.21,0.81,0.18,0.86,-0.17,0.02,0.11,0.14,0.19,...,-0.38,-0.38,-0.37,-0.37,-0.23,-0.16,-0.21,-0.22,-0.21,-0.21
75%,0.86,0.57,0.81,0.18,0.86,0.69,0.02,0.11,0.14,0.19,...,0.18,0.17,0.16,0.17,-0.04,-0.04,-0.04,-0.05,-0.05,-0.07
max,1.73,4.91,0.81,5.23,2.77,4.68,7.16,6.8,6.84,7.05,...,23.06,10.28,12.86,11.09,31.11,69.2,25.7,32.52,27.71,30.43


## 6. Reusable splits and scalers (shared across models)

Define the seed plan once, build consistent train/val/test splits for each seed, and fit a scaler per seed on the training fold. All models should reuse these cached splits/scalers to keep comparisons fair.

In [6]:
SEED_PLAN = [2025, 0, 1033]
VAL_SEED = 0
BASELINE_SEED = SEED_PLAN[0]


def build_split_bundle(split_seed, val_seed=VAL_SEED):
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=split_seed,
        stratify=y,
    )

    X_train_sub, X_val, y_train_sub, y_val = train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        random_state=val_seed,
        stratify=y_train,
    )

    scaler = StandardScaler()
    scaler.fit(X_train_sub)

    return {
        "seed": split_seed,
        "X_train_sub": X_train_sub,
        "X_val": X_val,
        "X_test": X_test,
        "y_train_sub": y_train_sub,
        "y_val": y_val,
        "y_test": y_test,
        "scaler": scaler,
        "X_train_sub_scaled": scaler.transform(X_train_sub),
        "X_val_scaled": scaler.transform(X_val),
        "X_test_scaled": scaler.transform(X_test),
    }


split_store = {s: build_split_bundle(s, val_seed=VAL_SEED) for s in SEED_PLAN}
baseline_split = split_store[BASELINE_SEED]

# Quick sanity check on shapes for the baseline seed
display(
    baseline_split["X_train_sub"].shape,
    baseline_split["X_val"].shape,
    baseline_split["X_test"].shape,
    baseline_split["y_train_sub"].shape,
    baseline_split["y_val"].shape,
    baseline_split["y_test"].shape,
)


(18000, 24)

(6000, 24)

(6000, 24)

(18000,)

(6000,)

(6000,)

## 7. Business assumptions (cost/benefit inputs)

Compute average credit limits for defaulters and non-defaulters to ground the cost assumptions.

In [7]:
# 1. Separate defaulters vs non-defaulters
df_default    = raw_df[raw_df["TARGET"] == 1]  # will default
df_no_default = raw_df[raw_df["TARGET"] == 0]  # will NOT default

# 2. Typical credit limit (principal proxy)
mean_limit_default    = df_default["LIMIT_BAL"].mean()
mean_limit_no_default = df_no_default["LIMIT_BAL"].mean()

print("Avg credit limit (default):    ", mean_limit_default)
print("Avg credit limit (no default):", mean_limit_no_default)

Avg credit limit (default):     130109.65641952984
Avg credit limit (no default): 178099.72607430234


### 7.1 Observation window and APR assumptions

Apply the PDF's six-month observation window and APR assumptions to derive per-period profit/loss inputs for the cost matrix.

In [8]:
# Business assumptions grounded in the PDF's six-month observation window
# April–September 2005 appears in the data dictionary PDF, so model six monthly billing cycles.
assumption_config = {
    "annual_apr": 0.18,             # 18% annual percentage rate from the referenced credit card PDF
    "periods_per_year": 12,        # monthly compounding
    "observation_months": 6,       # six billing cycles in the PDF window (Apr–Sep 2005)
    "loss_given_default": 0.5,     # lose 50% of the limit if they default
}

annual_apr = assumption_config["annual_apr"]
periods_per_year = assumption_config["periods_per_year"]
observation_months = assumption_config["observation_months"]
loss_given_default = assumption_config["loss_given_default"]

periodic_rate = annual_apr / periods_per_year
period_length_months = 12 / periods_per_year
periods_in_window = observation_months / period_length_months

# Approx profit from approving a good customer (non-defaulter) over the observation window
profit_good = mean_limit_no_default * ((1 + periodic_rate) ** periods_in_window - 1)

# Approx loss from approving a bad customer (defaulter)
loss_bad = mean_limit_default * loss_given_default

print("Periodic rate: {:.2%}".format(periodic_rate))
print("Approx profit per good customer: {:.2f}".format(profit_good))
print("Approx loss per bad customer:    {:.2f}".format(loss_bad))


Periodic rate: 1.50%
Approx profit per good customer: 16642.22
Approx loss per bad customer:    65054.83


### 7.2 Cost/benefit matrix

Construct a cost/benefit matrix that maps actual/predicted outcomes to monetary values using the assumptions.

In [9]:
# 3. Cost/benefit matrix driven by the data

# Story:
# TARGET=1 = will default
# Prediction=1 = treat as risky (reject / restrict credit)
# Prediction=0 = treat as safe (approve / keep credit)

value_TN =  profit_good   # Actual 0, Pred 0: good customer, approved → earn interest
value_FP =  0.0           # Actual 0, Pred 1: good customer, rejected → lose that profit
value_FN = -loss_bad      # Actual 1, Pred 0: bad customer, approved → lose money
value_TP =  0.0           # Actual 1, Pred 1: bad customer, rejected → avoided loss, no loan

value_matrix = pd.DataFrame(
    {
        0: {0: value_TN, 1: value_FN},  # Predicted 0
        1: {0: value_FP, 1: value_TP},  # Predicted 1
    }
)
value_matrix.index.name = "Actual"
value_matrix.columns.name = "Predicted"

value_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16642.219712,0.0
1,-65054.82821,0.0


## 8. Models trained across seeds (LogReg, Decision Tree, SVM, Naive Bayes, KNN)

Train and threshold-tune each model on the shared splits using the same utility function and validation sweep across the seed plan.

### 8.1 Logistic Regression
- Liblinear solver, max_iter=200, seed-aligned.
- Uses scaled features from the shared splits.
- Validation sweep over thresholds; best threshold applied to test set per seed.

### 8.2 Decision Tree
- CART with min_samples_leaf=5, seed-aligned.
- Fits on scaled features (keeps parity with other models).
- Validation sweep over thresholds for each seed.

### 8.3 SVM (RBF)
- RBF kernel, C=1.0, gamma='scale', probability=True for scores.
- Trained on scaled features; thresholds tuned on validation per seed.

### 8.4 Naive Bayes (Gaussian)
- GaussianNB on scaled features.
- Validation threshold sweep per seed, same utility function.

### 8.5 KNN

### 8.6 Multi-model training & tuning code
The code below loops through all four models across the seed plan using the shared splits, returning per-seed summaries and best thresholds.

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


def _get_positive_scores(model, X):
    """Return positive-class scores using predict_proba or decision_function."""
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba(X)
        if proba.ndim == 1:
            return proba
        return proba[:, -1]
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X)
        if scores.ndim > 1:
            scores = scores[:, -1]
        return 1.0 / (1.0 + np.exp(-scores))
    raise ValueError("Model does not expose predict_proba or decision_function")


def run_generic_model_for_seed(label, model_factory, split_seed):
    bundle = split_store[split_seed]
    model = model_factory(split_seed)
    model.fit(bundle["X_train_sub_scaled"], bundle["y_train_sub"])

    y_score_val = _get_positive_scores(model, bundle["X_val_scaled"])
    threshold_grid = np.linspace(0.0, 1.0, 101)

    threshold_summaries = []
    for _thr in threshold_grid:
        y_pred_thr = (y_score_val >= _thr).astype(int)
        threshold_summaries.append(
            summarize_model(
                model_name=f"{label} (seed={split_seed})",
                dataset_name="Validation",
                y_true=bundle["y_val"],
                y_pred=y_pred_thr,
                y_score=y_score_val,
                threshold=_thr,
                utility_fn=_knn_utility,
            )
        )

    threshold_results = pd.concat(threshold_summaries, ignore_index=True)
    best_idx = threshold_results['utility_per_app'].fillna(-np.inf).idxmax()
    best_threshold = threshold_results.loc[best_idx, 'threshold']

    y_score_test = _get_positive_scores(model, bundle["X_test_scaled"])
    y_pred_best = (y_score_test >= best_threshold).astype(int)

    summary = summarize_model(
        model_name=f"{label} (seed={split_seed})",
        dataset_name="Test",
        y_true=bundle["y_test"],
        y_pred=y_pred_best,
        y_score=y_score_test,
        threshold=best_threshold,
        utility_fn=_knn_utility,
    )

    summary["seed"] = split_seed

    return {
        "seed": split_seed,
        "best_threshold": best_threshold,
        "summary": summary,
        "y_pred_best": y_pred_best,
        "y_score_test": y_score_test,
        "y_test": bundle["y_test"],
    }


model_specs = {
    "Logistic Regression": lambda seed: LogisticRegression(max_iter=200, solver="liblinear", random_state=seed),
    "Decision Tree": lambda seed: DecisionTreeClassifier(random_state=seed, min_samples_leaf=5),
    "SVM (RBF)": lambda seed: SVC(probability=True, kernel="rbf", C=1.0, gamma="scale", random_state=seed),
    "Naive Bayes": lambda seed: GaussianNB(),
}

model_seed_runs = {}
model_seed_summary_dfs = {}
for label, factory in model_specs.items():
    runs = [run_generic_model_for_seed(label, factory, split_seed=s) for s in SEED_PLAN]
    model_seed_runs[label] = runs
    model_seed_summary_dfs[label] = pd.concat([r["summary"] for r in runs], ignore_index=True)

logreg_summary_df = model_seed_summary_dfs["Logistic Regression"]
dt_summary_df = model_seed_summary_dfs["Decision Tree"]
svm_summary_df = model_seed_summary_dfs["SVM (RBF)"]
nb_summary_df = model_seed_summary_dfs["Naive Bayes"]


# 9. Multi Model Visualization

## 10. Summary table (all seeds)

Per-seed results for every model plus the three baselines. This is the detailed view before averaging across seeds.

In [20]:
# Per-seed summary tables for each model + baselines
knn_seed_runs = []
for s in SEED_PLAN:
    if s == BASELINE_SEED:
        knn_seed_runs.append(knn_baseline_run)
    else:
        knn_seed_runs.append(run_knn_for_seed(split_seed=s))

knn_summary_df = pd.concat([run["summary"] for run in knn_seed_runs], ignore_index=True)

baseline_true = baseline_split["y_test"]

def _baseline_summary(name, y_pred):
    return summarize_model(
        model_name=name,
        dataset_name="Test (baseline seed)",
        y_true=baseline_true,
        y_pred=y_pred,
        y_score=None,
        threshold=np.nan,
        utility_fn=_knn_utility,
    )

rng = np.random.default_rng(0)
baseline_random = _baseline_summary("Baseline - Random", rng.integers(0, 2, size=len(baseline_true)))
baseline_approve = _baseline_summary("Baseline - Approve all", np.zeros_like(baseline_true))
baseline_reject = _baseline_summary("Baseline - Reject all", np.ones_like(baseline_true))

baseline_df = pd.concat([baseline_random, baseline_approve, baseline_reject], ignore_index=True)
baseline_df["seed"] = np.nan

summary_table = pd.concat(
    [
        baseline_df,
        knn_summary_df,
        logreg_summary_df,
        dt_summary_df,
        svm_summary_df,
        nb_summary_df,
    ],
    ignore_index=True,
)
summary_table


Unnamed: 0,model_name,dataset_name,threshold,TN,FP,FN,TP,fprate,fnrate,approval_rate,rejection_rate,roc_auc,pr_auc,precision,recall,f1,utility_total,utility_per_app,seed
0,Baseline - Random,Test (baseline seed),,2310,2363,683,644,0.505671,0.514695,0.498833,0.501167,,,0.214167,0.485305,0.297185,-5988920.0,-998.153356,
1,Baseline - Approve all,Test (baseline seed),,4673,0,1327,0,0.0,1.0,1.0,0.0,,,0.0,0.0,0.0,-8558664.0,-1426.444054,
2,Baseline - Reject all,Test (baseline seed),,0,4673,0,1327,1.0,0.0,0.0,1.0,,,0.221167,1.0,0.362222,0.0,0.0,
3,"k-NN (k=29, seed=2025)",Test,0.21,3550,1123,532,795,0.240317,0.400904,0.680333,0.319667,0.735951,0.479544,0.414494,0.599096,0.489985,24470710.0,4078.451895,2025.0
4,"k-NN (k=15, seed=0)",Test,0.21,3529,1144,553,774,0.244811,0.416729,0.680333,0.319667,0.725899,0.467545,0.403545,0.583271,0.477042,22755070.0,3792.512227,0.0
5,"k-NN (k=33, seed=1033)",Test,0.22,3715,958,552,775,0.205007,0.415976,0.711167,0.288833,0.740781,0.49263,0.447201,0.584024,0.506536,25915580.0,4319.26351,1033.0
6,Logistic Regression (seed=2025),Test,0.26,3872,801,606,721,0.17141,0.456669,0.746333,0.253667,0.722908,0.483963,0.473719,0.543331,0.506143,25015450.0,4169.241471,2025.0
7,Logistic Regression (seed=0),Test,0.25,3786,887,577,750,0.189814,0.434815,0.727167,0.272833,0.714144,0.504216,0.458155,0.565185,0.506073,25470810.0,4245.134659,0.0
8,Logistic Regression (seed=1033),Test,0.24,3592,1081,564,763,0.231329,0.425019,0.692667,0.307333,0.71284,0.492759,0.413774,0.574981,0.481236,23087930.0,3847.988349,1033.0
9,Decision Tree (seed=2025),Test,0.12,3077,1596,506,821,0.341536,0.381311,0.597167,0.402833,0.664453,0.367585,0.339677,0.618689,0.438568,18290370.0,3048.394496,2025.0


## 11. Model comparison summary (baselines + averages)

Seed-averaged rows for each model, preceded by the three baselines, to compare overall performance.

In [21]:
# Baselines are already computed above (baseline_random/approve/reject)

def _avg_from(df, label):
    numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
    numeric_cols_no_seed = [c for c in numeric_cols if c != "seed"]
    avg_row = df[numeric_cols_no_seed].mean().to_frame().T
    avg_row["model_name"] = f"{label} (avg seeds)"
    avg_row["dataset_name"] = "Test (avg seeds)"
    cols_order = [c for c in df.columns if c != "seed"]
    return avg_row.reindex(columns=cols_order)

avg_rows = [
    _avg_from(knn_summary_df, "k-NN"),
    _avg_from(logreg_summary_df, "Logistic Regression"),
    _avg_from(dt_summary_df, "Decision Tree"),
    _avg_from(svm_summary_df, "SVM (RBF)"),
    _avg_from(nb_summary_df, "Naive Bayes"),
]

comparison_df = pd.concat(
    [baseline_random, baseline_approve, baseline_reject] + avg_rows,
    ignore_index=True,
)
comparison_df


Unnamed: 0,model_name,dataset_name,threshold,TN,FP,FN,TP,fprate,fnrate,approval_rate,rejection_rate,roc_auc,pr_auc,precision,recall,f1,utility_total,utility_per_app
0,Baseline - Random,Test (baseline seed),,2310.0,2363.0,683.0,644.0,0.505671,0.514695,0.498833,0.501167,,,0.214167,0.485305,0.297185,-5988920.0,-998.153356
1,Baseline - Approve all,Test (baseline seed),,4673.0,0.0,1327.0,0.0,0.0,1.0,1.0,0.0,,,0.0,0.0,0.0,-8558664.0,-1426.444054
2,Baseline - Reject all,Test (baseline seed),,0.0,4673.0,0.0,1327.0,1.0,0.0,0.0,1.0,,,0.221167,1.0,0.362222,0.0,0.0
3,k-NN (avg seeds),Test (avg seeds),0.213333,3598.0,1075.0,545.666667,781.333333,0.230045,0.411203,0.690611,0.309389,0.73421,0.479906,0.421747,0.588797,0.491187,24380460.0,4063.40921
4,Logistic Regression (avg seeds),Test (avg seeds),0.25,3750.0,923.0,582.333333,744.666667,0.197518,0.438834,0.722056,0.277944,0.716631,0.493646,0.448549,0.561166,0.497817,24524730.0,4087.454826
5,Decision Tree (avg seeds),Test (avg seeds),0.123333,3089.333333,1583.666667,522.666667,804.333333,0.338897,0.393871,0.602,0.398,0.662874,0.370299,0.336826,0.606129,0.432998,17411370.0,2901.895647
6,SVM (RBF) (avg seeds),Test (avg seeds),0.166667,3842.666667,830.333333,612.666667,714.333333,0.177687,0.461693,0.742556,0.257444,0.725917,0.507337,0.469255,0.538307,0.497835,24093580.0,4015.596366
7,Naive Bayes (avg seeds),Test (avg seeds),0.636667,3569.0,1104.0,516.0,811.0,0.236251,0.388847,0.680833,0.319167,0.727453,0.484419,0.424447,0.611153,0.500556,25827790.0,4304.631799
