In [100]:
# imports

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import resample

from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.metrics import  average_precision_score

from sklearn.model_selection import cross_validate

import joblib


In [2]:
# load data

data = pd.read_csv('../Project_csv/dataset_for_ML.csv')

In [117]:
ratio = data["loan_status_enc"].value_counts().to_frame("count")
ratio["percentage"] = (ratio["count"] / len(data) * 100).round(2)

ratio

Unnamed: 0_level_0,count,percentage
loan_status_enc,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,206202,90.38
1.0,16175,7.09


In [118]:
data.head(2)

Unnamed: 0,installment,loan_amount,interest_rate,annual_income,public_records,last_record_months,last_delinquency_months,last_derog_months,delinquency_2y,inquiries_6m,...,issue_date_month,issue_date_year,postcode_district_te,district_te,term_enc,purpose_enc,home_ownership_enc,employment_length_enc,job_title_category_enc,loan_status_enc
0,1.671441,1.399183,-0.385528,0.198413,-0.331524,,,,-0.338888,-0.785814,...,0.419338,-2.839036,-0.924833,-0.045976,-0.52733,-0.31159,1.080559,1.750889,0.451061,0.0
1,-1.543899,-1.523422,-0.10622,-0.33931,-0.331524,,1.324559,,-0.338888,-0.785814,...,-1.046303,-2.129499,0.545233,1.269928,-0.52733,-0.31159,0.562578,-0.891613,0.451061,0.0


In [119]:
# Split the dataset

data_clean = data.dropna(subset=["loan_status_enc"]).copy()

X = data_clean.drop(columns=["loan_status_enc"])
y = data_clean["loan_status_enc"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [120]:
# Balanced data (train set only)

train = pd.concat([X_train.reset_index(drop=True),
                   y_train.reset_index(drop=True)], axis=1)

majority = train[train["loan_status_enc"] == 0]
minority = train[train["loan_status_enc"] == 1]

majority_down = resample(
    majority,
    replace=False,
    n_samples=3 * len(minority),
    random_state=42
)

train_bal = pd.concat([minority, majority_down]).sample(frac=1, random_state=42).reset_index(drop=True)

X_train_bal = train_bal.drop(columns=["loan_status_enc"])
y_train_bal = train_bal["loan_status_enc"]

print(y_train_bal.value_counts())
print(y_train_bal.value_counts(normalize=True))


loan_status_enc
0.0    38820
1.0    12940
Name: count, dtype: int64
loan_status_enc
0.0    0.75
1.0    0.25
Name: proportion, dtype: float64


### Models initialization
 
#### Random_forest, XGBoost, LightGBM

In [79]:
# Random_forest + features selection

rf_pipeline = Pipeline([
    ("fs", SelectFromModel(
        RandomForestClassifier(n_estimators=200, random_state=42),
        threshold="median"
    )),
    ("clf", RandomForestClassifier(random_state=42))
])

In [19]:
# XGBoost + features selection


xgb_pipeline = Pipeline([
    ("fs", SelectFromModel(
        XGBClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            random_state=42
        )
    )),
    ("clf", XGBClassifier(
        eval_metric="logloss",
        random_state=42
    ))
])


In [55]:
# LightGBM + features selection

lgb_pipeline = Pipeline([
    ("fs", SelectFromModel(
        LGBMClassifier(
            n_estimators=200,
            learning_rate=0.1,
            num_leaves=31,
            random_state=42
        )
    )),
    ("clf", LGBMClassifier(random_state=42))
])


### Hyperparameters turning

In [62]:
# XGBoost

param_grid_xgb = {
    "fs__threshold": ["median", "mean"],
    "clf__n_estimators": [300, 500],
    "clf__max_depth": [3, 5, 7],
    "clf__learning_rate": [0.05, 0.1],
    "clf__subsample": [0.7, 0.9],
    "clf__colsample_bytree": [0.7, 0.9]
}

search_xgb = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_grid_xgb,
    n_iter=25,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

search_xgb.fit(X_train_bal, y_train_bal)

print("Best CV AUC:", search_xgb.best_score_)
print("Best params:", search_xgb.best_params_)


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV AUC: 0.9967892024538412
Best params: {'fs__threshold': 'median', 'clf__subsample': 0.7, 'clf__n_estimators': 500, 'clf__max_depth': 5, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 0.7}


### Model tests

In [58]:
# Run the model on test dataset (orinal data)

y_test_prob = best_model.predict_proba(X_test)[:, 1]
y_test_pred = best_model.predict(X_test)

print("TEST ROC AUC:", roc_auc_score(y_test, y_test_prob))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))


TEST ROC AUC: 0.9960845824747787
[[41090   151]
 [  140  3095]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     41241
         1.0       0.95      0.96      0.96      3235

    accuracy                           0.99     44476
   macro avg       0.98      0.98      0.98     44476
weighted avg       0.99      0.99      0.99     44476



In [80]:
# Run cross-validation

best_model = search_xgb.best_estimator_

cv_results = cross_validate(
    best_model,
    X_train_bal,
    y_train_bal,
    cv=5,
    scoring=["roc_auc", "accuracy", "f1"],
    return_train_score=True,
    n_jobs=-1
)

for k in cv_results:
    if "time" not in k:
        print(k, cv_results[k].mean())


test_roc_auc 0.9967892024538412
train_roc_auc 0.9995837099526845
test_accuracy 0.9863408037094281
train_accuracy 0.9924410741885626
test_f1 0.9722076241749443
train_f1 0.9847050986758296


In [83]:
# Run features correlation - data leackage

corr = data_clean.corr(numeric_only=True)["loan_status_enc"].sort_values(ascending=False)
print(corr.head(20))

loan_status_enc              1.000000
interest_rate                0.162330
term_enc                     0.109962
debt_to_income               0.076238
credit_card_usage            0.064534
loan_amount                  0.042463
inquiries_6m                 0.034769
home_ownership_enc           0.034608
installment                  0.031394
last_record_months           0.025785
issue_date_year              0.024984
year                         0.024984
earliest_credit_line_year    0.021322
employment_length_enc        0.018948
delinquency_2y               0.016175
purpose_enc                  0.014926
open_accounts                0.009299
job_title_category_enc       0.004804
public_records               0.003514
credit_score                 0.001636
Name: loan_status_enc, dtype: float64


In [86]:
# feature importance

r = permutation_importance(
    best_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
    n_jobs=1   # <- change this
)

imp = pd.Series(r.importances_mean, index=X_test.columns).sort_values(ascending=False)
print(imp.head(20))


amount_payed                  0.392821
installment                   0.203973
loan_amount                   0.131264
year                          0.002712
issue_date_month              0.001282
term_enc                      0.001019
annual_income                 0.000162
last_record_months            0.000139
delinquency_2y                0.000016
employment_length_enc         0.000000
home_ownership_enc            0.000000
purpose_enc                   0.000000
district_te                   0.000000
earliest_credit_line_year     0.000000
earliest_credit_line_month    0.000000
credit_score                  0.000000
nr_accounts                   0.000000
credit_card_balance           0.000000
open_accounts                 0.000000
inquiries_6m                  0.000000
dtype: float64


### Dataset without data leakage and top 10 features

In [67]:
# Manually picking features - excluded all that describe loan itself like terms, year of loan

leak_features = [
    "amount_payed",
    "last_record_months"]

top_features = [
    "loan_amount",
    "debt_to_income",
    "earliest_credit_line_year",
    "annual_income",
    "delinquency_2y",
    "employment_length_enc",
    "home_ownership_enc",
    "credit_score",
    "nr_accounts",
    "open_accounts",
    "inquiries_6m",
    "public_records",
    "credit_card_usage",
    "credit_card_balance",
    "total_current_balance",
    "purpose_enc",
    "loan_status_enc"
]

In [68]:
data_new = data[top_features] 

In [96]:
# Added more columns

data_new.loc[:, "loan_to_income"] = data_new["loan_amount"] / (data_new["annual_income"] + 1)
data_new.loc[:, "cc_utilization"] = data_new["credit_card_balance"] / (data_new["credit_card_usage"] + 1)
data_new.loc[:, "balance_per_account"] = data_new["total_current_balance"] / (data_new["nr_accounts"] + 1)


In [74]:
# Split the new dataset

data_clean_new = data_new.dropna(subset=["loan_status_enc"]).copy()

X_new = data_clean_new.drop(columns=["loan_status_enc"])
y_new = data_clean_new["loan_status_enc"]

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(
    X_new, y_new, test_size=0.2, random_state=42, stratify=y_new
)

In [75]:
# Balance the new dataset (train only)

train_new = pd.concat([X_train_new.reset_index(drop=True),
                   y_train_new.reset_index(drop=True)], axis=1)

majority_new = train_new[train_new["loan_status_enc"] == 0]
minority_new = train_new[train_new["loan_status_enc"] == 1]

majority_down_new = resample(
    majority_new,
    replace=False,
    n_samples=3 * len(minority_new),
    random_state=42
)

train_bal_new = pd.concat([minority_new, majority_down_new]).sample(frac=1, random_state=42).reset_index(drop=True)

X_train_bal_new = train_bal_new.drop(columns=["loan_status_enc"])
y_train_bal_new = train_bal_new["loan_status_enc"]

print(y_train_bal_new.value_counts())
print(y_train_bal_new.value_counts(normalize=True))

loan_status_enc
0.0    38820
1.0    12940
Name: count, dtype: int64
loan_status_enc
0.0    0.75
1.0    0.25
Name: proportion, dtype: float64


In [76]:
# XGBoost: hyperparameters selection + model runing + quick train model evaluation

param_grid_xgb = {
    "fs__threshold": ["median", "mean"],
    "clf__n_estimators": [300, 500],
    "clf__max_depth": [3, 5, 7],
    "clf__learning_rate": [0.05, 0.1],
    "clf__subsample": [0.7, 0.9],
    "clf__colsample_bytree": [0.7, 0.9]
}

search_xgb = RandomizedSearchCV(
    xgb_pipeline,
    param_distributions=param_grid_xgb,
    n_iter=25,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

search_xgb.fit(X_train_bal_new, y_train_bal_new)

print("Best CV AUC:", search_xgb.best_score_)
print("Best params:", search_xgb.best_params_)



Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best CV AUC: 0.6678002025756733
Best params: {'fs__threshold': 'median', 'clf__subsample': 0.7, 'clf__n_estimators': 300, 'clf__max_depth': 3, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 0.7}


In [77]:
# Test evaluation

best_xgb = search_xgb.best_estimator_

y_test_prob_new = best_xgb.predict_proba(X_test_new)[:,1]
y_test_pred_new = best_xgb.predict(X_test_new)

print("TEST AUC:", roc_auc_score(y_test_new, y_test_prob_new))
print(confusion_matrix(y_test_new, y_test_pred_new))
print(classification_report(y_test_new, y_test_pred_new))


TEST AUC: 0.6659943641115534
[[40520   721]
 [ 3043   192]]
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.96     41241
         1.0       0.21      0.06      0.09      3235

    accuracy                           0.92     44476
   macro avg       0.57      0.52      0.52     44476
weighted avg       0.88      0.92      0.89     44476



### Train with class weights (no resampling)

In [78]:

neg = (y_train_new == 0).sum()
pos = (y_train_new == 1).sum()
scale_pos_weight = neg / pos

print(f"Train negatives: {neg}, positives: {pos}")
print(f"scale_pos_weight (neg/pos): {scale_pos_weight:.2f}")
print("Train pos rate:", pos / (pos + neg))
print("Test  pos rate:", (y_test_new == 1).mean())


Train negatives: 164961, positives: 12940
scale_pos_weight (neg/pos): 12.75
Train pos rate: 0.07273708410857724
Test  pos rate: 0.07273585754114578


In [99]:
# Run the model 

xgb = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    min_child_weight=1,
    scale_pos_weight=scale_pos_weight,
    eval_metric="aucpr",     # PR-AUC is often better than ROC-AUC for imbalanced
    random_state=42
)

xgb.fit(X_train_new, y_train_new)

probs = xgb.predict_proba(X_test_new)[:, 1]  
print("ROC AUC:", roc_auc_score(y_test_new, probs))
print("PR  AUC:", average_precision_score(y_test_new, probs))


ROC AUC: 0.6675632099881696
PR  AUC: 0.13819751976171923


In [84]:
# Feature importance

imp = pd.Series(xgb.feature_importances_, index=X_train_new.columns)
print(imp.sort_values(ascending=False).head(15))

inquiries_6m             0.091783
debt_to_income           0.083872
annual_income            0.079015
loan_to_income           0.075744
credit_card_usage        0.074850
loan_amount              0.058878
home_ownership_enc       0.057174
purpose_enc              0.056239
delinquency_2y           0.054561
total_current_balance    0.054536
credit_card_balance      0.038548
nr_accounts              0.037744
cc_utilization           0.036913
open_accounts            0.036286
employment_length_enc    0.036188
dtype: float32


In [97]:
# Risky groups

ks = np.array([0.01, 0.02, 0.05, 0.10])

df_sorted = df.sort_values("p", ascending=False)
y_sorted = df_sorted["y"].to_numpy()

n = np.maximum(1, (len(df_sorted) * ks).astype(int))   # avoid n=0 on small datasets
precisions = [y_sorted[:ni].mean() for ni in n]

for k, ni, prec in zip(ks, n, precisions):
    print(f"Top {k:.0%} risk (n={ni:,}) → default rate: {prec:.3f}")



Top 1% risk (n=444) → default rate: 0.248
Top 2% risk (n=889) → default rate: 0.232
Top 5% risk (n=2,223) → default rate: 0.197
Top 10% risk (n=4,447) → default rate: 0.169


In [101]:
# Save the model 

joblib.dump(xgb, "../Model/xgb_credit_model.pkl")


['../Model/xgb_credit_model.pkl']