In [19]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo

# Download the "Default of Credit Card Clients" dataset (UCI repository id 350).
default_of_credit_card_clients = fetch_ucirepo(id=350)

# The fetch result exposes predictors and target as two separate frames.
X, y = (
    default_of_credit_card_clients.data.features,
    default_of_credit_card_clients.data.targets,
)

# One modelling frame: feature columns first, target column last.
df = pd.concat([X, y], axis=1)

print("df.shape:", df.shape)
df.head()


df.shape: (30000, 24)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,20000,2,2,1,24,2,2,-1,-1,-2,...,0,0,0,0,689,0,0,0,0,1
1,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,90000,2,2,2,34,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,50000,2,2,1,37,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,50000,1,2,1,57,-1,0,-1,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


The dataset is used to analyze and predict credit card default behavior to support research in credit risk assessment and financial decision-making.

- **Target:** Y
- **Type:** Binary (0 = No default, 1 = Default)
- **Description:** Indicates whether a client defaulted on their payment in the following month.

- **Source:** UCI Machine Learning Repository  
- **Dataset:** Default of Credit Card Clients (Yeh, 2009)  
- **Access:** ucimlrepo.fetch_ucirepo(id=350)
- **License:** Public for research and educational use

### Feature Summary
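The loaded frame uses the generic column names `X1`–`X23`; the descriptive names below are the original dataset's names for those columns.

- `LIMIT_BAL` (`X1`, numerical): Credit limit  
- `AGE` (`X5`, numerical): Client age  
- `SEX`, `EDUCATION`, `MARRIAGE` (`X2`–`X4`, categorical): Demographics  
- `PAY_0`, `PAY_2`–`PAY_6` (`X6`–`X11`, ordinal): Monthly repayment status (the original dataset has no `PAY_1`)  
- `BILL_AMT1`–`BILL_AMT6` (`X12`–`X17`, numerical): Bill amounts  
- `PAY_AMT1`–`PAY_AMT6` (`X18`–`X23`, numerical): Payment amounts  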

### Limitations and Risks
- Data is from 2005 and may be outdated  
- Clients are from Taiwan only (limited generalizability)  
- Demographic features may introduce bias  
- Class imbalance between default and non-default cases


In [20]:
# Per-column NaN counts, largest first, so any problem column surfaces at the top.
null_counts = df.isna().sum().sort_values(ascending=False)
print("missingness", null_counts, "", sep="\n")

# Number of rows that exactly repeat an earlier row (all columns, target included).
print("duplicates", df.duplicated().sum(), "", sep="\n")

# Class balance of the target as proportions; rendered as the cell's output.
print("target distribution")
df['Y'].value_counts(normalize=True)






missingness
X1     0
X2     0
X23    0
X22    0
X21    0
X20    0
X19    0
X18    0
X17    0
X16    0
X15    0
X14    0
X13    0
X12    0
X11    0
X10    0
X9     0
X8     0
X7     0
X6     0
X5     0
X4     0
X3     0
Y      0
dtype: int64

duplicates
35

target distribution


Y
0    0.7788
1    0.2212
Name: proportion, dtype: float64

## Leakage-Risk Note

### Plausible Leakage Vectors

1. Variables such as recent repayment status and payment amounts are closely tied to a client’s payment behavior near the time of default. If these features include information from the same period or after the target outcome, they could leak future information into the model.

2. The dataset contains 35 duplicate records. If duplicates appear across training and test splits, the model may effectively “memorize” outcomes, leading to overly optimistic performance estimates.

### Leakage Prevention Strategies

- Ensure that all features used for training strictly precede the prediction window for Y. Avoid using features that summarize behavior after the default decision point.
- Remove duplicate rows prior to splitting the data into training and test sets to prevent information leakage across splits.



In [21]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# De-duplicate BEFORE splitting so an identical row cannot land in both train and test.
df_dedup = df.drop_duplicates().reset_index(drop=True)
initial_n, dedup_n = len(df), len(df_dedup)
duplicates_removed = initial_n - dedup_n

print(f"Initial rows: {initial_n}, after dedup: {dedup_n}, duplicates removed: {duplicates_removed}")

# Separate the binary target from the predictors.
y = df_dedup['Y']
X = df_dedup.drop(columns=['Y'])

# Hold-out configuration.
test_size, random_state = 0.20, 42

# Stratifying on y keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
    stratify=y,
)

print("Shapes:")
for split_label, split_frame in ((" X_train:", X_train), (" X_test: ", X_test)):
    print(split_label, split_frame.shape)
for target_label, target_split in (("y_train", y_train), ("y_test", y_test)):
    print(f" {target_label} distribution:\n", target_split.value_counts(normalize=True))

Initial rows: 30000, after dedup: 29965, duplicates removed: 35
Shapes:
 X_train: (23972, 23)
 X_test:  (5993, 23)
 y_train distribution:
 Y
0    0.778742
1    0.221258
Name: proportion, dtype: float64
 y_test distribution:
 Y
0    0.778742
1    0.221258
Name: proportion, dtype: float64


In [22]:

# Baseline: standardise the features, then fit a logistic regression.
# Pipeline.fit returns the pipeline itself, so construction and fitting can be chained.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=random_state, max_iter=1000),
).fit(X_train, y_train)

# For a classifier pipeline, score() is mean accuracy on the given data.
test_score = pipeline.score(X_test, y_test)
print("Test accuracy: {:.4f}".format(test_score))

Test accuracy: 0.8129


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# kNN is distance-based, so features are standardised inside the pipeline.
# k is fixed at 20 here (not tuned).
knn_pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=20)),
    ]
).fit(X_train, y_train)

# Hard class predictions on the held-out set and their accuracy.
y_pred = knn_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("kNN Accuracy (k=20): {:.4f}".format(accuracy))


kNN Accuracy (k=20): 0.8118


In [24]:
from sklearn.metrics import accuracy_score

# y_pred still holds the kNN (k=20) test-set predictions from the previous cell.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy: {:.4f}".format(accuracy))


Test Accuracy: 0.8118


In [25]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes (0 = no default, 1 = default).
cm = confusion_matrix(y_test, y_pred)

row_labels = ["Actual: No Default", "Actual: Default"]
col_labels = ["Predicted: No Default", "Predicted: Default"]
pd.DataFrame(cm, index=row_labels, columns=col_labels)



Unnamed: 0,Predicted: No Default,Predicted: Default
Actual: No Default,4451,216
Actual: Default,912,414


# Week 2

In [26]:
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# In-memory record of every model run in this notebook; one dict per run.
experiment_log = []

# Column order of the log table. Passing it to the DataFrame constructor gives
# an empty log a well-formed (zero-row) frame instead of a column-less one.
_LOG_COLUMNS = ["model", "params", "cv_score", "test_accuracy", "seed"]


def log_run(model_name, params, cv_score=None, test_score=None):
    """Append one experiment record to ``experiment_log``.

    Parameters
    ----------
    model_name : str
        Human-readable label for the run.
    params : dict or any
        Parameters describing the run. A dict is shallow-copied so that later
        mutation of the caller's dict does not silently rewrite the log.
    cv_score : float or None
        Cross-validation score, or None when no CV was performed.
    test_score : float or None
        Score on the held-out test set; stored under ``"test_accuracy"``.
    """
    experiment_log.append({
        "model": model_name,
        "params": dict(params) if isinstance(params, dict) else params,
        "cv_score": cv_score,
        "test_accuracy": test_score,
        # Read at call time, so the record reflects the seed currently in effect.
        "seed": RANDOM_STATE,
    })


def show_log():
    """Return the experiment log as a DataFrame, highest scores first.

    Rows are ordered by ``test_accuracy`` and then ``cv_score``, both
    descending. An empty log yields an empty frame with the expected columns
    rather than raising ``KeyError`` on the sort keys.
    """
    return pd.DataFrame(experiment_log, columns=_LOG_COLUMNS).sort_values(
        by=["test_accuracy", "cv_score"], ascending=False
    )


In [27]:
# Re-score both baselines on the held-out set and record them.
# No cross-validation was run for these, so cv_score is left at its default (None).
y_pred_lr = pipeline.predict(X_test)
test_acc_lr = accuracy_score(y_test, y_pred_lr)
log_run("Baseline: LogisticRegression", pipeline.get_params(), test_score=test_acc_lr)
print("Logged LogisticRegression baseline:", test_acc_lr)

y_pred_knn = knn_pipeline.predict(X_test)
test_acc_knn = accuracy_score(y_test, y_pred_knn)
log_run("Baseline: kNN", knn_pipeline.get_params(), test_score=test_acc_knn)
print("Logged kNN baseline:", test_acc_knn)


Logged LogisticRegression baseline: 0.8129484398464876
Logged kNN baseline: 0.811780410478892


In [28]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import LinearSVC

# 3-fold stratified CV, shuffled with a fixed seed; later grid-search cells reuse this splitter.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
pipe_linsvc = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LinearSVC(dual="auto", random_state=RANDOM_STATE, max_iter=5000)),
])

# Only the regularisation strength C is searched.
param_grid_linsvc = {"clf__C": [0.1, 1.0, 10.0]}

# GridSearchCV.fit returns the search object, so it can be chained onto the constructor.
gs_linsvc = GridSearchCV(
    pipe_linsvc,
    param_grid_linsvc,
    scoring="accuracy",
    cv=cv,
    n_jobs=1,
).fit(X_train, y_train)

# With the default refit=True, best_estimator_ is refit on all of X_train.
best_linsvc = gs_linsvc.best_estimator_
cv_acc = gs_linsvc.best_score_
test_pred = best_linsvc.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print("LinearSVC best params:", gs_linsvc.best_params_)
print(f"LinearSVC CV accuracy:   {cv_acc:.4f}")
print(f"LinearSVC TEST accuracy: {test_acc:.4f}")

log_run("SVM: LinearSVC", gs_linsvc.best_params_, cv_score=cv_acc, test_score=test_acc)


LinearSVC best params: {'clf__C': 0.1}
LinearSVC CV accuracy:   0.8015
LinearSVC TEST accuracy: 0.8034


In [None]:
from sklearn.linear_model import SGDClassifier

# Linear SVM trained by stochastic gradient descent: hinge loss with an L2 penalty.
pipe_sgd = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", SGDClassifier(
        loss="hinge",
        penalty="l2",
        max_iter=2000,
        tol=1e-3,
        random_state=RANDOM_STATE,
    )),
])

# alpha (regularisation strength) is the only parameter searched.
param_grid_sgd = {"clf__alpha": [.0001, .001, .01]}

# Same CV splitter and scoring as the other grid searches; fit() returns the search object.
gs_sgd = GridSearchCV(
    pipe_sgd,
    param_grid_sgd,
    scoring="accuracy",
    cv=cv,
    n_jobs=1,
).fit(X_train, y_train)

# With the default refit=True, best_estimator_ is refit on all of X_train.
best_sgd = gs_sgd.best_estimator_
cv_acc = gs_sgd.best_score_
test_pred = best_sgd.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print("SGDClassifier best params:", gs_sgd.best_params_)
print(f"SGDClassifier CV accuracy:   {cv_acc:.4f}")
print(f"SGDClassifier TEST accuracy: {test_acc:.4f}")

log_run("SVM: SGDClassifier(hinge)", gs_sgd.best_params_, cv_score=cv_acc, test_score=test_acc)


SGDClassifier best params: {'clf__alpha': 0.0001}
SGDClassifier CV accuracy:   0.8139
SGDClassifier TEST accuracy: 0.8068


In [30]:
from sklearn.svm import SVC

# Kernel SVM with an RBF kernel; features are standardised inside the pipeline.
pipe_svc = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", SVC(kernel="rbf")),
])

# 3 values of C x 2 values of gamma = 6 candidates, each evaluated with the shared CV splitter.
param_grid_svc = {
    "clf__C": [0.5, 1.0, 2.0],
    "clf__gamma": ["scale", 0.01],
}

# fit() returns the search object, so it can be chained onto the constructor.
gs_svc = GridSearchCV(
    pipe_svc,
    param_grid_svc,
    scoring="accuracy",
    cv=cv,
    n_jobs=1,
).fit(X_train, y_train)

# With the default refit=True, best_estimator_ is refit on all of X_train.
best_svc = gs_svc.best_estimator_
cv_acc = gs_svc.best_score_
test_pred = best_svc.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print("SVC(RBF) best params:", gs_svc.best_params_)
print(f"SVC(RBF) CV accuracy:   {cv_acc:.4f}")
print(f"SVC(RBF) TEST accuracy: {test_acc:.4f}")

log_run("SVM: SVC(RBF)", gs_svc.best_params_, cv_score=cv_acc, test_score=test_acc)


SVC(RBF) best params: {'clf__C': 2.0, 'clf__gamma': 'scale'}
SVC(RBF) CV accuracy:   0.8200
SVC(RBF) TEST accuracy: 0.8180


In [31]:
# show_log() orders runs by test accuracy (then CV score), highest first,
# so the first row is the current leader.
log_df = show_log()
display(log_df)

best_row = log_df.iloc[0]
print("\nBest run so far:", best_row, sep="\n")


Unnamed: 0,model,params,cv_score,test_accuracy,seed
4,SVM: SVC(RBF),"{'clf__C': 2.0, 'clf__gamma': 'scale'}",0.819998,0.817954,42
0,Baseline: LogisticRegression,"{'memory': None, 'steps': [('standardscaler', ...",,0.812948,42
1,Baseline: kNN,"{'memory': None, 'steps': [('scaler', Standard...",,0.81178,42
3,SVM: SGDClassifier(hinge),{'clf__alpha': 0.0001},0.813866,0.806775,42
2,SVM: LinearSVC,{'clf__C': 0.1},0.801518,0.803437,42



Best run so far:
model                                     SVM: SVC(RBF)
params           {'clf__C': 2.0, 'clf__gamma': 'scale'}
cv_score                                       0.819998
test_accuracy                                  0.817954
seed                                                 42
Name: 4, dtype: object


In [32]:
# Reference the tuned RBF SVC pipeline by name. A direct reference (rather than a
# globals() string lookup) keeps the dependency visible to readers and linters and
# fails with a clear NameError if the grid-search cell has not been run.
best_model = best_svc

# Hard predictions of the selected model on the held-out test set.
y_pred = best_model.predict(X_test)

# Rows are the true classes, columns the predicted classes (0 = no default, 1 = default).
cm = confusion_matrix(y_test, y_pred)

pd.DataFrame(
    cm,
    index=["Actual: No Default", "Actual: Default"],
    columns=["Predicted: No Default", "Predicted: Default"]
)


Unnamed: 0,Predicted: No Default,Predicted: Default
Actual: No Default,4459,208
Actual: Default,883,443


In [33]:
from sklearn.model_selection import train_test_split

# RANDOM_STATE is defined once in the experiment-setup cell and is not redefined
# here, so the seed has a single source of truth. The grid-search cell that
# produces best_svc already depends on that cell.

# Direct reference instead of a globals() string lookup: same object, but the
# dependency on the SVC grid-search cell is explicit.
core_model = best_svc

# Carve a validation set out of the TRAINING data only; the test set stays
# untouched for the final evaluation. Stratify to keep the class proportions.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_STATE
)
print("Train/Val sizes:", X_tr.shape, X_val.shape)


Train/Val sizes: (19177, 23) (4795, 23)


In [34]:
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.base import clone

def has_predict_proba(estimator):
    """Return True when ``estimator`` exposes a ``predict_proba`` attribute."""
    return hasattr(estimator, "predict_proba")


# With an integer cv, CalibratedClassifierCV clones the estimator it is given and
# refits that clone on each training fold, so fitting the estimator beforehand
# is discarded work. An unfitted clone is passed instead; core_model itself is
# left untouched.
calibrated_model = CalibratedClassifierCV(
    estimator=clone(core_model),
    method="sigmoid",  # Platt scaling
    cv=3
)
calibrated_model.fit(X_tr, y_tr)
print("CalibratedClassifierCV(method='sigmoid') applied (Platt scaling).")



CalibratedClassifierCV(method='sigmoid') applied (Platt scaling).


In [None]:
from sklearn.metrics import confusion_matrix

# Misclassification costs used for threshold selection; correct predictions are free.
COST_TP = 0
COST_TN = 0
COST_FP = 1  # cost of predicting positive when actually negative
COST_FN = 5  # cost of predicting negative when actually positive (often higher)


def proba_positive(model, X):
    """Return column 1 of ``model.predict_proba(X)``.

    Column 1 is the positive-class probability when the model's classes are
    ordered [0, 1], as with the 0/1 target used in this notebook.
    """
    return model.predict_proba(X)[:, 1]


# Positive-class probabilities on the validation split (data not used to fit the model).
p_val = proba_positive(calibrated_model, X_val)


def expected_cost_at_threshold(y_true, p_pos, thr):
    """Average misclassification cost per sample when predicting 1 iff ``p_pos >= thr``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p_pos : numpy array of positive-class probabilities, aligned with ``y_true``.
    thr : float decision threshold.

    Returns
    -------
    float
        Total cost (from the module-level COST_* constants) divided by ``len(y_true)``.
    """
    y_hat = (p_pos >= thr).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 even when only one class is present
    # in both y_true and y_hat; otherwise the four-way unpack would fail.
    tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
    total_cost = tp*COST_TP + tn*COST_TN + fp*COST_FP + fn*COST_FN
    return total_cost / len(y_true)


# Scan thresholds 0.01, 0.02, ..., 0.99 and keep the cheapest one on validation data.
thresholds = np.linspace(0.01, 0.99, 99)
costs = np.array([expected_cost_at_threshold(y_val, p_val, t) for t in thresholds])

best_idx = int(np.argmin(costs))
best_thr = float(thresholds[best_idx])
best_cost = float(costs[best_idx])

print(f"Chosen threshold (min expected cost): {best_thr:.2f}")
print(f"Validation expected cost at chosen threshold: {best_cost:.4f}")


Chosen threshold (min expected cost): 0.16
Validation expected cost at chosen threshold: 0.6344


In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def metrics_and_cm(y_true, p_pos, thr, label=""):
    """Threshold probabilities and summarise the resulting classification.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    p_pos : numpy array of positive-class probabilities, aligned with ``y_true``.
    thr : float; a sample is predicted positive when ``p_pos >= thr``.
    label : str, free-text tag copied into the summary.

    Returns
    -------
    (dict, ndarray, ndarray)
        Summary dict (label, threshold, accuracy, precision, recall, f1 and the
        four confusion-matrix counts), the 2x2 confusion matrix, and the hard
        predictions.
    """
    y_hat = (p_pos >= thr).astype(int)
    # Computed once and reused for the return value. labels=[0, 1] guarantees a
    # 2x2 matrix even if only one class is present, so the unpack cannot fail.
    matrix = confusion_matrix(y_true, y_hat, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()

    out = {
        "label": label,
        "threshold": thr,
        "accuracy": accuracy_score(y_true, y_hat),
        "precision": precision_score(y_true, y_hat, zero_division=0),
        "recall": recall_score(y_true, y_hat, zero_division=0),
        "f1": f1_score(y_true, y_hat, zero_division=0),
        "tn": tn, "fp": fp, "fn": fn, "tp": tp
    }
    return out, matrix, y_hat


# Unfitted copy of the selected model; it is fitted below only on the path that
# actually uses that fit.
final_core = clone(core_model)

if hasattr(final_core, "predict_proba"):
    # The model already yields probabilities: fit it on the full training set.
    final_model = final_core.fit(X_train, y_train)
    final_note = "Final model supports predict_proba (no calibration)."
else:
    # With an integer cv, CalibratedClassifierCV clones and refits the estimator
    # per fold, so fitting final_core first would be discarded work.
    final_model = CalibratedClassifierCV(
        estimator=final_core,
        method="sigmoid",
        cv=3
    )
    final_model.fit(X_train, y_train)
    final_note = "Final model calibrated with sigmoid on full training set."

print(final_note)

# Test-set probabilities, evaluated at the default cut-off and at the
# threshold chosen earlier on validation data.
p_test = proba_positive(final_model, X_test)

default_stats, default_cm, _ = metrics_and_cm(y_test, p_test, 0.50, label="Default (0.50)")
chosen_stats, chosen_cm, _ = metrics_and_cm(y_test, p_test, best_thr, label=f"Chosen ({best_thr:.2f})")

report_df = pd.DataFrame([default_stats, chosen_stats])
display(report_df[["label","threshold","accuracy","precision","recall","f1","tn","fp","fn","tp"]])

print("\nConfusion matrix at default threshold (0.50):")
display(pd.DataFrame(default_cm, index=["Actual 0","Actual 1"], columns=["Pred 0","Pred 1"]))

print(f"\nConfusion matrix at chosen threshold ({best_thr:.2f}):")
display(pd.DataFrame(chosen_cm, index=["Actual 0","Actual 1"], columns=["Pred 0","Pred 1"]))


Final model calibrated with sigmoid on full training set.


Unnamed: 0,label,threshold,accuracy,precision,recall,f1,tn,fp,fn,tp
0,Default (0.50),0.5,0.816119,0.689831,0.306938,0.424843,4484,183,919,407
1,Chosen (0.16),0.16,0.746871,0.442435,0.553544,0.491792,3742,925,592,734



Confusion matrix at default threshold (0.50):


Unnamed: 0,Pred 0,Pred 1
Actual 0,4484,183
Actual 1,919,407



Confusion matrix at chosen threshold (0.16):


Unnamed: 0,Pred 0,Pred 1
Actual 0,3742,925
Actual 1,592,734


- **Threshold selection:** Chose a threshold of **0.16** using validation data to minimize expected cost (FP cost = 1, FN cost = 5).
- **Default threshold (0.50):** Accuracy = **0.8161**, Recall = **0.3069**, FP = **183**, FN = **919**.
- **Chosen threshold (0.16):** Accuracy = **0.7469**, Recall = **0.5535**, FP = **925**, FN = **592**.
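- **Decision rationale:** Because a false negative is assumed to cost five times as much as a false positive, the lower threshold is preferred: on the test set it cuts missed defaults from 919 to 592 and lowers the expected cost per client from about 0.797 ((183·1 + 919·5) / 5993) to about 0.648 ((925·1 + 592·5) / 5993). The trade-off is more false alarms (183 → 925) and lower accuracy (0.8161 → 0.7469).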
