In [1]:
# Setup & Data Loading

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from tensorflow.keras import layers, regularizers

# -----------------------------------------------------------------------------
# 1) Define paths to all “ready” CSV variants created in EDA
# -----------------------------------------------------------------------------
READY_FILE_RAW          = '../data/loan_data_ready_raw.csv'
READY_FILE_LOG          = '../data/loan_data_ready_log.csv'
READY_FILE_MIN_COLUMNS  = '../data/loan_data_ready_minimal.csv'
READY_FILE_FICO_IR      = '../data/loan_data_ready_fico_ir.csv'
READY_FILE_FICO_IR_INQ  = '../data/loan_data_ready_fico_ir_inq.csv'

# -----------------------------------------------------------------------------
# 2) Select which dataset to use for this experiment
#    – Raw:       raw features scaled
#    – Log:       log-transforms + engineered features
#    – Minimal:   top 3 features + flags
#    – FICO_IR:   FICO + interest rate
#    – FICO_IR_INQ: FICO + interest rate + inquiries
# -----------------------------------------------------------------------------
READY_FILE = READY_FILE_FICO_IR_INQ

# -----------------------------------------------------------------------------
# 3) Load the chosen dataset
# -----------------------------------------------------------------------------
df = pd.read_csv(READY_FILE)
print(f"Loaded {READY_FILE}, shape = {df.shape}")
df.head()

Loaded ../data/loan_data_ready_fico_ir_inq.csv, shape = (9578, 4)


Unnamed: 0,fico,int.rate,inq.last.6mths,not.fully.paid
0,0.688825,-0.139318,-0.716989,0
1,-0.101303,-0.578868,-0.716989,0
2,-0.759742,0.486484,-0.26247,0
3,0.030385,-0.813544,-0.26247,0
4,-1.154806,0.743509,-0.716989,0


In [2]:
X = df[['fico']]
y = df['not.fully.paid']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [3]:
# Fit logistic regression on FICO only
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train, y_train)

In [4]:
# Evaluate AUC
from sklearn.metrics import roc_auc_score
y_test_prob = clf.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_test_prob)
print(f"FICO-Only Model AUC: {auc:.4f}")

FICO-Only Model AUC: 0.6291


In [5]:
# Threshold selection via Youden's J
from sklearn.metrics import roc_curve, confusion_matrix
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)
specificity = 1 - fpr
j_scores = tpr + specificity - 1
best_idx = j_scores.argmax()
best_threshold = thresholds[best_idx]
print(f"Optimal threshold: {best_threshold:.2f}")
print(f"Sensitivity: {tpr[best_idx]:.2f}")
print(f"Specificity: {specificity[best_idx]:.2f}")
# Confusion matrix at optimal threshold
y_pred = (y_test_prob >= best_threshold).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"Confusion matrix: TN={tn}, FP={fp}, FN={fn}, TP={tp}")

Optimal threshold: 0.55
Sensitivity: 0.50
Specificity: 0.69
Confusion matrix: TN=1105, FP=504, FN=153, TP=154
