In [1]:
# Core Libraries for Data Manipulation and Analysis
import pandas as pd
import numpy as np

# Machine Learning Models and Utilities
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Balancing Datasets
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.combine import SMOTETomek

# Evaluation Metrics
from sklearn.metrics import classification_report, roc_auc_score

# Creating Pipeline
from imblearn.pipeline import Pipeline

In [2]:
df_cleaned = pd.read_csv("credit_default_cleaned.csv")

In [3]:
numeric_features = [
    'LIMIT_BAL', 'AGE',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'
]

categorical_features = [
    'SEX', 'EDUCATION', 'MARRIAGE',
    'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'
]

In [4]:
X = df_cleaned.drop("DEFAULT", axis=1)
y = df_cleaned["DEFAULT"]

In [5]:
for col in categorical_features:
    X[col] = X[col].astype('category')

In [6]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Display the shapes of resulting subsets
print(f"Training set X shape: {X_train.shape}")
print(f"Training set y shape: {y_train.shape}")
print(f"Test set X shape: {X_test.shape}")
print(f"Test set y shape: {y_test.shape}")
print("-"*50)
# Verify target variable balance in both sets
print("'DEFAULT' distribution in training set:")
print(y_train.value_counts(normalize=True))
print("-"*50)
print("'DEFAULT' distribution in test set:")
print(y_test.value_counts(normalize=True))

Training set X shape: (24000, 23)
Training set y shape: (24000,)
Test set X shape: (6000, 23)
Test set y shape: (6000,)
--------------------------------------------------
'DEFAULT' distribution in training set:
DEFAULT
0    0.778792
1    0.221208
Name: proportion, dtype: float64
--------------------------------------------------
'DEFAULT' distribution in test set:
DEFAULT
0    0.778833
1    0.221167
Name: proportion, dtype: float64


In [7]:
# Scaling Numerical Features
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

In [8]:
# Rechecking the data
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
22788,-0.056866,2,2,2,-0.264558,2,2,3,2,0,...,1.778869,1.891679,2.020839,2.096346,0.580657,-0.290332,-0.29782,0.086961,0.500397,0.048745
29006,-0.134081,2,1,2,-0.155804,1,-1,-1,-2,-2,...,-0.68509,-0.673845,-0.662488,-0.652126,-0.344969,-0.290332,-0.29782,-0.318075,-0.316978,-0.292992
16950,-1.21509,1,2,1,1.58426,1,2,0,0,0,...,-0.561615,-0.547887,-0.526789,-0.510492,-0.348128,-0.227081,-0.233069,-0.298554,-0.29671,-0.23697
22280,0.406423,2,1,2,-0.699574,0,0,0,0,0,...,2.460102,2.738001,1.509166,1.613118,0.215287,0.154056,0.152526,0.000642,0.008643,0.043143
11346,1.101358,2,1,2,-0.373312,1,-2,-1,0,0,...,-0.680324,-0.668755,-0.666365,-0.656089,-0.348251,-0.274236,-0.29782,-0.318075,-0.316978,-0.225765


In [9]:
X_test.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
6907,-0.906231,1,2,2,1.149244,-1,-1,-1,-1,0,...,-0.68509,-0.632468,-0.607727,-0.605522,1.265573,-0.290332,-0.099569,-0.164029,-0.199755,-0.133832
24575,-0.134081,1,1,1,-0.482066,-1,-1,-2,-2,-2,...,-0.68509,-0.673845,-0.662488,-0.454951,-0.348251,-0.290332,-0.29782,-0.318075,0.460127,1.387682
26766,-0.906231,1,2,2,-1.13459,0,0,0,0,0,...,0.031389,0.106055,0.170133,0.202774,-0.236782,-0.199282,-0.1722,-0.185276,-0.197362,-0.178818
2156,0.946928,2,1,2,-1.13459,0,0,0,0,0,...,3.64785,3.210915,3.328879,3.241563,0.580657,0.228121,0.273178,0.677916,0.204281,1.01418
3179,2.568442,2,2,1,-0.917082,-2,-2,-2,-2,-2,...,-0.486706,-0.517706,-0.497544,-0.483514,0.269968,0.380546,0.273178,0.345919,0.347555,1.124601


In [10]:
# Save input features
X_train.to_csv("X_train.csv", index=False)
X_test.to_csv("X_test.csv", index=False)

# Save target variable
y_train.to_csv("y_train.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

In [11]:
base_models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

In [12]:
balancing_methods = {
    "No Balancing": None,
    "Random Oversampling": RandomOverSampler(random_state=42),
    "SMOTE": SMOTE(random_state=42),
    "Random Undersampling": RandomUnderSampler(random_state=42),
    "Tomek Links": TomekLinks(),
    "SMOTE + Tomek Links": SMOTETomek(random_state=42)
}

In [22]:
def evaluate_model(model, X_test, y_test):
    """Evaluates the model using AUC, Recall, and F1-score."""
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_pred_proba)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return auc, recall, f1

In [23]:
results = []

for balance_name, balancer in balancing_methods.items():
    for model_name, model in base_models.items():
        print(f"Balancing: {balance_name} | Model: {model_name}")

        steps = []
        if balancer is not None:
            steps.append(('balancer', balancer))
        steps.append(('classifier', model))

        pipe = Pipeline(steps)

        # Train
        pipe.fit(X_train, y_train)

        # Evaluate
        auc, recall, f1 = evaluate_model(pipe, X_test, y_test)

        results.append((balance_name, model_name, auc, recall, f1))
        print(f"AUC: {auc:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")


⚙️  Balancing: No Balancing | Model: Random Forest
AUC: 0.7601 | Recall: 0.3580 | F1: 0.4600

⚙️  Balancing: No Balancing | Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.7599 | Recall: 0.3519 | F1: 0.4525

⚙️  Balancing: No Balancing | Model: LightGBM
[LightGBM] [Info] Number of positive: 5309, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002913 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3283
[LightGBM] [Info] Number of data points in the train set: 24000, number of used features: 62
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221208 -> initscore=-1.258639
[LightGBM] [Info] Start training from score -1.258639
AUC: 0.7765 | Recall: 0.3632 | F1: 0.4675

⚙️  Balancing: Random Oversampling | Model: Random Forest
AUC: 0.7563 | Recall: 0.4273 | F1: 0.4956

⚙️  Balancing: Random Oversampling | Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.7511 | Recall: 0.5599 | F1: 0.5024

⚙️  Balancing: Random Oversampling | Model: LightGBM
[LightGBM] [Info] Number of positive: 18691, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3305
[LightGBM] [Info] Number of data points in the train set: 37382, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC: 0.7737 | Recall: 0.6127 | F1: 0.5322

⚙️  Balancing: SMOTE | Model: Random Forest
AUC: 0.7538 | Recall: 0.4906 | F1: 0.5122

⚙️  Balancing: SMOTE | Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.7454 | Recall: 0.4265 | F1: 0.4784

⚙️  Balancing: SMOTE | Model: LightGBM
[LightGBM] [Info] Number of positive: 18691, number of negative: 18691
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017863 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3688
[LightGBM] [Info] Number of data points in the train set: 37382, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC: 0.7678 | Recall: 0.4348 | F1: 0.4976

⚙️  Balancing: Random Undersampling | Model: Random Forest
AUC: 0.7590 | Recall: 0.6179 | F1: 0.5138

⚙️  Balancing: Random Undersampling | Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.7489 | Recall: 0.6420 | F1: 0.4952

⚙️  Balancing: Random Undersampling | Model: LightGBM
[LightGBM] [Info] Number of positive: 5309, number of negative: 5309
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000884 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 10618, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC: 0.7661 | Recall: 0.6436 | F1: 0.5225

⚙️  Balancing: Tomek Links | Model: Random Forest
AUC: 0.7547 | Recall: 0.4084 | F1: 0.4896

⚙️  Balancing: Tomek Links | Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.7590 | Recall: 0.4122 | F1: 0.4882

⚙️  Balancing: Tomek Links | Model: LightGBM
[LightGBM] [Info] Number of positive: 5309, number of negative: 17281
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3279
[LightGBM] [Info] Number of data points in the train set: 22590, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.235015 -> initscore=-1.180204
[LightGBM] [Info] Start training from score -1.180204
AUC: 0.7741 | Recall: 0.4099 | F1: 0.4964

⚙️  Balancing: SMOTE + Tomek Links | Model: Random Forest
AUC: 0.7513 | Recall: 0.4868 | F1: 0.5061

⚙️  Balancing: SMOTE + Tomek Links | Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


AUC: 0.7480 | Recall: 0.4213 | F1: 0.4825

⚙️  Balancing: SMOTE + Tomek Links | Model: LightGBM
[LightGBM] [Info] Number of positive: 18293, number of negative: 18293
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015885 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3688
[LightGBM] [Info] Number of data points in the train set: 36586, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
AUC: 0.7674 | Recall: 0.4484 | F1: 0.5057


In [25]:
results_df = pd.DataFrame(results, columns=['Balancing', 'Model', 'AUC', 'Recall', 'F1'])
results_df = results_df.sort_values(by='Recall', ascending=False).reset_index(drop=True)

print(results_df)

               Balancing          Model       AUC    Recall        F1
0   Random Undersampling       LightGBM  0.766123  0.643557  0.522484
1   Random Undersampling        XGBoost  0.748861  0.642050  0.495205
2   Random Undersampling  Random Forest  0.759007  0.617935  0.513784
3    Random Oversampling       LightGBM  0.773711  0.612660  0.532242
4    Random Oversampling        XGBoost  0.751120  0.559910  0.502366
5                  SMOTE  Random Forest  0.753837  0.490580  0.512195
6    SMOTE + Tomek Links  Random Forest  0.751338  0.486812  0.506071
7    SMOTE + Tomek Links       LightGBM  0.767403  0.448380  0.505737
8                  SMOTE       LightGBM  0.767833  0.434815  0.497628
9    Random Oversampling  Random Forest  0.756333  0.427280  0.495629
10                 SMOTE        XGBoost  0.745359  0.426526  0.478445
11   SMOTE + Tomek Links        XGBoost  0.747975  0.421251  0.482521
12           Tomek Links        XGBoost  0.759024  0.412208  0.488175
13           Tomek L

In [26]:
results_df.to_csv("model_balancing_results.csv", index=False)

### Performing hyperparameter tuning for LightGBM model with Random Undersampling as it has highest Recall value.

In [58]:
# Create the pipeline
pipe = Pipeline([
    ('balancer', RandomUnderSampler(random_state=42)),
    ('classifier', LGBMClassifier(random_state=42))
])

In [71]:
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [5, 7, 9],
    'classifier__num_leaves': [15, 31],
    'classifier__subsample': [0.8, 1.0]
}

In [72]:
scoring = {'AUC': 'roc_auc', 'Recall':'recall'}

grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit='Recall',
    cv=5,
    n_jobs=-1,
    verbose=2
)

In [73]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[LightGBM] [Info] Number of positive: 5309, number of negative: 5309
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001225 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3262
[LightGBM] [Info] Number of data points in the train set: 10618, number of used features: 57
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [78]:
print("Best Parameters Found:")
print(grid_search.best_params_)

print("\nBest AUC Score During CV:")
print(grid_search.best_score_)

# Evaluate on test data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Core Metrics
auc = roc_auc_score(y_test, y_pred_proba)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print key metrics
print(f"\nTest Set Performance:")
print(f"\nAUC: {auc:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

# Detailed evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Pred 0", "Pred 1"])
print("Confusion Matrix:")
print(cm_df)

Best Parameters Found:
{'classifier__learning_rate': 0.1, 'classifier__max_depth': 9, 'classifier__n_estimators': 300, 'classifier__num_leaves': 31, 'classifier__subsample': 0.8}

Best AUC Score During CV:
0.6628377095125766

Test Set Performance:

AUC: 0.7591 | Recall: 0.6443 | F1: 0.5095

Classification Report:
              precision    recall  f1-score   support

           0     0.8811    0.7488    0.8096      4673
           1     0.4214    0.6443    0.5095      1327

    accuracy                         0.7257      6000
   macro avg     0.6513    0.6965    0.6596      6000
weighted avg     0.7795    0.7257    0.7432      6000

Confusion Matrix:
          Pred 0  Pred 1
Actual 0    3499    1174
Actual 1     472     855


In [75]:
print("Cross-validation results summary:")
cv_results = pd.DataFrame(grid_search.cv_results_)
print(cv_results[['mean_test_AUC', 'mean_test_Recall', 'params']].sort_values(by='mean_test_Recall', ascending=False).head(10))

Cross-validation results summary:
     mean_test_AUC  mean_test_Recall  \
106       0.766727          0.662838   
107       0.766727          0.662838   
103       0.771726          0.660767   
102       0.771726          0.660767   
105       0.773741          0.657756   
104       0.773741          0.657756   
82        0.769775          0.657753   
83        0.769775          0.657753   
80        0.772391          0.657376   
81        0.772391          0.657376   

                                                params  
106  {'classifier__learning_rate': 0.1, 'classifier...  
107  {'classifier__learning_rate': 0.1, 'classifier...  
103  {'classifier__learning_rate': 0.1, 'classifier...  
102  {'classifier__learning_rate': 0.1, 'classifier...  
105  {'classifier__learning_rate': 0.1, 'classifier...  
104  {'classifier__learning_rate': 0.1, 'classifier...  
82   {'classifier__learning_rate': 0.1, 'classifier...  
83   {'classifier__learning_rate': 0.1, 'classifier...  
80   {'class

In [76]:
import joblib
joblib.dump(best_model, "best_lightgbm_randomundersample.pkl")

['best_lightgbm_randomundersample.pkl']