In [1]:
import sys
print("Running from:", sys.executable)
print("Python version:", sys.version)

Running from: c:\KC\loan-default\venv\Scripts\python.exe
Python version: 3.14.2 (tags/v3.14.2:df79316, Dec  5 2025, 17:18:21) [MSC v.1944 64 bit (AMD64)]


In [2]:
import joblib
import numpy as np
import pandas as pd

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

Load Preprocessed Data

In [4]:
X_train = joblib.load("../models/X_train_processed.pkl")
X_val = joblib.load("../models/X_val_processed.pkl")
y_train = joblib.load("../models/y_train.pkl")
y_val = joblib.load("../models/y_val.pkl")

print("Shapes:")
print(X_train.shape, X_val.shape)


Shapes:
(2120440, 4798810) (530110, 4798810)


Baseline 1: Logistic Regression (Benchmark Model)

In [5]:
logreg = LogisticRegression(
    max_iter=1000,
    class_weight="balanced",   # Important for credit risk
    random_state=42
)

logreg.fit(X_train, y_train)

# Predictions
val_preds = logreg.predict(X_val)
val_probs = logreg.predict_proba(X_val)[:, 1]

# Metrics
print("===== LOGISTIC REGRESSION BASELINE =====")
print(f"Accuracy: {accuracy_score(y_val, val_preds):.4f}")
print(f"Precision: {precision_score(y_val, val_preds):.4f}")
print(f"Recall: {recall_score(y_val, val_preds):.4f}")
print(f"F1-score: {f1_score(y_val, val_preds):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_preds))

print("\nClassification Report:")
print(classification_report(y_val, val_preds))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


===== LOGISTIC REGRESSION BASELINE =====
Accuracy: 0.9589
Precision: 0.7462
Recall: 0.9153
F1-score: 0.8221
ROC-AUC: 0.9862

Confusion Matrix:
[[458043  17111]
 [  4656  50300]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.98    475154
           1       0.75      0.92      0.82     54956

    accuracy                           0.96    530110
   macro avg       0.87      0.94      0.90    530110
weighted avg       0.96      0.96      0.96    530110



Save the Baseline Model

In [6]:
joblib.dump(logreg, "../models/logreg_baseline.pkl")
print("Saved: ../models/logreg_baseline.pkl")


Saved: ../models/logreg_baseline.pkl


Baseline 2: Threshold Tuning

In [7]:
thresholds = np.linspace(0.2, 0.8, 7)

results = []

for t in thresholds:
    preds_t = (val_probs >= t).astype(int)
    acc = accuracy_score(y_val, preds_t)
    f1 = f1_score(y_val, preds_t)
    results.append((t, acc, f1))

results_df = pd.DataFrame(results, columns=["threshold", "accuracy", "f1"])
print(results_df)


   threshold  accuracy        f1
0        0.2  0.930343  0.740732
1        0.3  0.944940  0.780937
2        0.4  0.953640  0.806506
3        0.5  0.958939  0.822117
4        0.6  0.962942  0.833766
5        0.7  0.965475  0.839577
6        0.8  0.966788  0.838841


Pick Best Threshold

In [8]:
best_row = results_df.loc[results_df["f1"].idxmax()]
best_threshold = best_row["threshold"]

print(f"Best threshold (by F1): {best_threshold}")


Best threshold (by F1): 0.7000000000000002


In [9]:
final_preds = (val_probs >= best_threshold).astype(int)

print("Tuned Metrics:")
print(f"Accuracy: {accuracy_score(y_val, final_preds):.4f}")
print(f"F1-score: {f1_score(y_val, final_preds):.4f}")


Tuned Metrics:
Accuracy: 0.9655
F1-score: 0.8396
