# Classifier for Credit Default Prediction

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_curve, auc, recall_score, precision_score, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import dill as pickle
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
import warnings
warnings.filterwarnings('ignore')
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from tqdm import tqdm


In [43]:
data = pd.read_excel("../data/raw/default of credit card clients.xls", sheet_name="Data")
data.shape

(30001, 25)

In [44]:
data.head()

Unnamed: 0.1,Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X15,X16,X17,X18,X19,X20,X21,X22,X23,Y
0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
1,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
2,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
3,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
4,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0


In [45]:
pd.get_dummies(data.iloc[1:,1:], columns=['X2','X3','X4'], drop_first=False).astype(float).head(6)

Unnamed: 0,X1,X5,X6,X7,X8,X9,X10,X11,X12,X13,...,X3_1,X3_2,X3_3,X3_4,X3_5,X3_6,X4_0,X4_1,X4_2,X4_3
1,20000.0,24.0,2.0,2.0,-1.0,-1.0,-2.0,-2.0,3913.0,3102.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,120000.0,26.0,-1.0,2.0,0.0,0.0,0.0,2.0,2682.0,1725.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,90000.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,29239.0,14027.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,50000.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,46990.0,48233.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,50000.0,57.0,-1.0,0.0,-1.0,0.0,0.0,0.0,8617.0,5670.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,50000.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,64400.0,57069.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [46]:
def data_prep(df):
    # remove unnecessary 1st column and columne names in 1st row
    data = df.iloc[1:, 1:].copy()
    # one-hot encoding
    categorical_cols = ['X2','X3','X4']
    ohe = OneHotEncoder(sparse_output=False, drop=None)
    ohe_array = ohe.fit_transform(data[categorical_cols])
    ohe_array = ohe_array.astype(int) # converting booleans to 0/1
    ohe_cols = ohe.get_feature_names_out(categorical_cols)
    # merge ohe with original data
    data_ohe = pd.DataFrame(ohe_array, columns=ohe_cols, index=data.index)
    # remove original categorical cols
    data_final = pd.concat([data.drop(columns=categorical_cols), data_ohe], axis=1)
    # convert target col to 0/1
    data_final['Y'] = data_final['Y'].astype(int)

    return data_final

In [47]:
# prepare data
prep_data = data_prep(data)
prep_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 1 to 30000
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   X1      30000 non-null  object
 1   X5      30000 non-null  object
 2   X6      30000 non-null  object
 3   X7      30000 non-null  object
 4   X8      30000 non-null  object
 5   X9      30000 non-null  object
 6   X10     30000 non-null  object
 7   X11     30000 non-null  object
 8   X12     30000 non-null  object
 9   X13     30000 non-null  object
 10  X14     30000 non-null  object
 11  X15     30000 non-null  object
 12  X16     30000 non-null  object
 13  X17     30000 non-null  object
 14  X18     30000 non-null  object
 15  X19     30000 non-null  object
 16  X20     30000 non-null  object
 17  X21     30000 non-null  object
 18  X22     30000 non-null  object
 19  X23     30000 non-null  object
 20  Y       30000 non-null  int64 
 21  X2_1    30000 non-null  int64 
 22  X2_2    30000 non-null

#### for handling imbalanced class

In [48]:
# pre-computing sample weights once and for all
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight='balanced', y=prep_data['Y'])
X = prep_data.drop('Y', axis=1)
y = prep_data['Y'].astype(int)

#### Feature Selections

In [49]:
# -----------------------------------------------------------
# 1. Pearson (Correlation-based selection)
# -----------------------------------------------------------
corr_df = pd.DataFrame(X.join(y).corr()['Y'].sort_values())
corr_df['abs'] = corr_df['Y'].abs()

corr_features = list(
    corr_df.loc[(corr_df['abs'] >= 0.08) & (corr_df.index != 'Y')].index
)

print("-----------------------------------")
print("Correlation-based Important Features")
print("-----------------------------------")
print(corr_features)

# -----------------------------------------------------------
# 2. LASSO (L1 Regularization)
# -----------------------------------------------------------
lr_l1 = LogisticRegression(
    penalty='l1',
    C=1,
    solver='liblinear',
    max_iter=500
)

lr_l1.fit(X, y, sample_weight=sample_weights)

sfm_l1 = SelectFromModel(lr_l1, prefit=True)
lasso_features = list(X.columns[sfm_l1.get_support()])

print("\n-----------------------------------")
print("LASSO-based Important Features")
print("-----------------------------------")
print(lasso_features)

# -----------------------------------------------------------
# 3. Tree-Based (ExtraTrees)
# -----------------------------------------------------------
tree_model = ExtraTreesClassifier(
    n_estimators=200,
    random_state=42
)

tree_model.fit(X, y, sample_weight=sample_weights)

sfm_tree = SelectFromModel(tree_model, prefit=True)
tree_features = list(X.columns[sfm_tree.get_support()])

print("\n-----------------------------------")
print("Tree-based Important Features")
print("-----------------------------------")
print(tree_features)

# -----------------------------------------------------------
# 4. PCA → RFE
# -----------------------------------------------------------
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=0.99)
X_reduced = pca.fit_transform(X_scaled)

N = pca.n_components_
print("\nNumber of PCA components for 99% variance:", N)

rfe = RFE(
    estimator=DecisionTreeClassifier(),
    n_features_to_select=N
)

rfe.fit(X, y, sample_weight=sample_weights)

rfe_features = list(X.columns[rfe.support_])

print("\n-----------------------------------")
print("RFE-based Important Features")
print("-----------------------------------")
print(rfe_features)


-----------------------------------
Correlation-based Important Features
-----------------------------------
['X1', 'X11', 'X10', 'X9', 'X8', 'X7', 'X6']

-----------------------------------
LASSO-based Important Features
-----------------------------------
['X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X18', 'X2_2', 'X3_0', 'X3_1', 'X3_2', 'X3_3', 'X3_4', 'X3_5', 'X4_0', 'X4_1', 'X4_2']

-----------------------------------
Tree-based Important Features
-----------------------------------
['X1', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23']

Number of PCA components for 99% variance: 14

-----------------------------------
RFE-based Important Features
-----------------------------------
['X1', 'X5', 'X6', 'X12', 'X13', 'X14', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23']


#### Creting subsets from original dataset basis each feature set

In [50]:
# append target variable to form complete set cols to 
# for creating multiple datasets for training & benchmarking
corr_features.append('Y')
lasso_features.append('Y')
tree_features.append('Y')
rfe_features.append('Y')
all_features = X.columns.tolist()
all_features.append('Y')

In [51]:
print(corr_features)
print(lasso_features)
print(tree_features)
print(rfe_features)
print(all_features)

['X1', 'X11', 'X10', 'X9', 'X8', 'X7', 'X6', 'Y']
['X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X18', 'X2_2', 'X3_0', 'X3_1', 'X3_2', 'X3_3', 'X3_4', 'X3_5', 'X4_0', 'X4_1', 'X4_2', 'Y']
['X1', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'Y']
['X1', 'X5', 'X6', 'X12', 'X13', 'X14', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'Y']
['X1', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X2_1', 'X2_2', 'X3_0', 'X3_1', 'X3_2', 'X3_3', 'X3_4', 'X3_5', 'X3_6', 'X4_0', 'X4_1', 'X4_2', 'X4_3', 'Y']


#### Benchmarking multiple classifiers (algos) across multiple feature sets

In [57]:
# ============================================================
# Feature subsets 
# ============================================================

feature_sets = [
    corr_features,
    lasso_features,
    tree_features,
    rfe_features,
    all_features      
]

dataset_names = [
    'corr_feat_dataset',
    'lasso_feat_dataset',
    'tree_feat_dataset',
    'rfe_feat_dataset',
    'all_feat_dataset'
]

# ============================================================
# Creating datasets for each feature set
# ============================================================

datasets = []
for fs in feature_sets:
    ds = prep_data.loc[:, fs].reset_index(drop=True)
    datasets.append(ds)

# ============================================================
# Queueing multiple Classifiers for running against mutiple FS
# ============================================================

names = [
    "Nearest Neighbors",
    "Logistic Regression",
    # "RBF SVM",
    "Decision Tree",
    "Random Forest",
    "MLP Neural Net",
    "AdaBoost",
    "Gaussian NB",
    "Quadratic DA",
    "XGBoost",
    "LightGBM"
]

classifiers = [
    KNeighborsClassifier(n_neighbors=10, p=1, weights='distance'),
    LogisticRegression(max_iter=1000, class_weight='balanced'),
    # SVC(kernel="rbf", gamma=2, C=1, probability=True, class_weight='balanced'),       # <--- takes lot of time; uncomment only if you've patience
    DecisionTreeClassifier(max_depth=5, class_weight='balanced'),
    RandomForestClassifier(max_depth=5, n_estimators=100, class_weight='balanced'),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(reg_param=0.1),
    XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        scale_pos_weight=(y.value_counts()[0] / y.value_counts()[1])  # for class balancing
    ),
    LGBMClassifier()
]

In [58]:
# ================================================================
# Full Benchmarking across Feature Sets and various Classifiers
# ================================================================

modelling_df = pd.DataFrame(
    columns=[
        'feats_subset', 'algo',
        'train-precision', 'train-recall', 'train-f1', 'train-acc',
        'val-precision', 'val-recall', 'val-f1', 'val-acc'
    ]
)

i = 0

for ds_cnt in range(len(datasets)):
    print(f"\n Dataset: {dataset_names[ds_cnt]}")
    ds = datasets[ds_cnt]

    X_sub = ds.drop(columns=['Y'])
    y_sub = ds['Y']

    # Standardize numeric features only
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_sub)

    # Stratified train - test dataset split
    sss = StratifiedShuffleSplit(n_splits=4, test_size=0.3, random_state=29)
    for train_idx, val_idx in sss.split(X_scaled, y_sub):
        X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
        y_train, y_val = y_sub.iloc[train_idx], y_sub.iloc[val_idx]
        
        y_train = y_train.squeeze().astype(int)
        y_val = y_val.squeeze().astype(int)

    # Benchmark models
    for clf_cnt in tqdm(range(len(classifiers)), desc="Algorithms", position=0):
        
        clf = classifiers[clf_cnt]
        name = names[clf_cnt]

        modelling_df.at[i, 'feats_subset'] = dataset_names[ds_cnt]
        modelling_df.at[i, 'algo'] = name

        clf.fit(X_train, y_train)

        # Train CV
        y_pred_train = cross_val_predict(clf, X_train, y_train, cv=5, method="predict")

        modelling_df.at[i, 'train-precision'] = precision_score(y_train, y_pred_train)
        modelling_df.at[i, 'train-recall'] = recall_score(y_train, y_pred_train)
        modelling_df.at[i, 'train-f1'] = f1_score(y_train, y_pred_train)
        modelling_df.at[i, 'train-acc'] = accuracy_score(y_train, y_pred_train)

        # Validation CV
        y_pred_val = cross_val_predict(clf, X_val, y_val, cv=5, method="predict")

        modelling_df.at[i, 'val-precision'] = precision_score(y_val, y_pred_val)
        modelling_df.at[i, 'val-recall'] = recall_score(y_val, y_pred_val)
        modelling_df.at[i, 'val-f1'] = f1_score(y_val, y_pred_val)
        modelling_df.at[i, 'val-acc'] = accuracy_score(y_val, y_pred_val)

        i += 1

print("\n Benchmarking complete!")



 Dataset: corr_feat_dataset


Algorithms:  90%|█████████ | 9/10 [00:45<00:04,  4.00s/it]

[LightGBM] [Info] Number of positive: 4645, number of negative: 16355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000630 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 141
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [Info] Number of positive: 3716, number of negative: 13084
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 137
[LightGBM] [Info] Number of data points in the train set: 16800, number of used features: 7
[LightGBM] [Info] [binar

Algorithms: 100%|██████████| 10/10 [00:48<00:00,  4.87s/it]



 Dataset: lasso_feat_dataset


Algorithms:  90%|█████████ | 9/10 [01:03<00:05,  5.06s/it]

[LightGBM] [Info] Number of positive: 4645, number of negative: 16355
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 404
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [Info] Number of positive: 3716, number of negative: 13084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 401
[LightGBM] [Info] Number of data points in the train set: 16800, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[Light

Algorithms: 100%|██████████| 10/10 [01:05<00:00,  6.59s/it]



 Dataset: tree_feat_dataset


Algorithms:  90%|█████████ | 9/10 [01:27<00:07,  7.63s/it]

[LightGBM] [Info] Number of positive: 4645, number of negative: 16355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004789 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3255
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [Info] Number of positive: 3716, number of negative: 13084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003635 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3250
[LightGBM] [Info] Number of data points in the train set: 16800, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [In

Algorithms: 100%|██████████| 10/10 [01:32<00:00,  9.21s/it]



 Dataset: rfe_feat_dataset


Algorithms:  90%|█████████ | 9/10 [01:10<00:05,  5.42s/it]

[LightGBM] [Info] Number of positive: 4645, number of negative: 16355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002208 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2944
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [Info] Number of positive: 3716, number of negative: 13084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2941
[LightGBM] [Info] Number of data points in the train set: 16800, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [In

Algorithms: 100%|██████████| 10/10 [01:13<00:00,  7.33s/it]



 Dataset: all_feat_dataset


Algorithms:  90%|█████████ | 9/10 [01:43<00:08,  8.74s/it]

[LightGBM] [Info] Number of positive: 4645, number of negative: 16355
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3291
[LightGBM] [Info] Number of data points in the train set: 21000, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [Info] Number of positive: 3716, number of negative: 13084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3286
[LightGBM] [Info] Number of data points in the train set: 16800, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.221190 -> initscore=-1.258742
[LightGBM] [Info] Start training from score -1.258742
[LightGBM] [In

Algorithms: 100%|██████████| 10/10 [01:48<00:00, 10.81s/it]


 Benchmarking complete!





In [59]:
modelling_df.sort_values(['val-f1','val-precision','val-recall'], ascending=False)

Unnamed: 0,feats_subset,algo,train-precision,train-recall,train-f1,train-acc,val-precision,val-recall,val-f1,val-acc
43,all_feat_dataset,Random Forest,0.485351,0.599139,0.536275,0.77081,0.514706,0.59769,0.553102,0.786333
23,tree_feat_dataset,Random Forest,0.4913,0.589666,0.536008,0.77419,0.512364,0.593169,0.549814,0.785111
3,corr_feat_dataset,Random Forest,0.50257,0.568353,0.533441,0.780095,0.523481,0.57107,0.546241,0.790111
2,corr_feat_dataset,Decision Tree,0.500469,0.573735,0.534604,0.779048,0.501485,0.593672,0.543698,0.779556
13,lasso_feat_dataset,Random Forest,0.498963,0.569645,0.531966,0.778286,0.520441,0.569061,0.543666,0.788667
33,rfe_feat_dataset,Random Forest,0.459343,0.59591,0.518789,0.755476,0.492687,0.592165,0.537865,0.774889
12,lasso_feat_dataset,Decision Tree,0.498018,0.567922,0.530678,0.77781,0.492575,0.583124,0.534039,0.774889
22,tree_feat_dataset,Decision Tree,0.458018,0.586006,0.514167,0.755048,0.459376,0.599196,0.520052,0.755333
42,all_feat_dataset,Decision Tree,0.457835,0.585576,0.513886,0.754952,0.459168,0.598694,0.51973,0.755222
8,corr_feat_dataset,XGBoost,0.468478,0.567922,0.513429,0.761905,0.460025,0.563536,0.506546,0.757111


In [60]:
modelling_df.sort_values(['val-f1','val-precision','val-recall'], ascending=False).to_csv("../outputs/model_benchmarks.csv", index=False)

### Fine-tuning best performers

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# ------------------------------
# Prepare Data
# ------------------------------
X_tuned = StandardScaler().fit_transform(X)   # Optional for trees, but consistent
y_tuned = y

# 5-fold CV repeated 3 times
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=29)

# ==========================================================
# 1. RANDOM FOREST TUNING
# ==========================================================
rf_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

rf = RandomForestClassifier(random_state=29, n_jobs=-1)

rf_search = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='roc_auc',
    cv=cv,
    verbose=3,
    n_jobs=-1
)

rf_search.fit(X_tuned, y_tuned)
print("Best RF Params:", rf_search.best_params_)
print("Best RF ROC-AUC:", rf_search.best_score_)

# ==========================================================
# 2. XGBOOST TUNING
# ==========================================================
xgb_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

xgb = XGBClassifier(
    random_state=29,
    eval_metric='logloss',
    tree_method='hist',
    n_jobs=-1
)

xgb_search = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_param_grid,
    scoring='roc_auc',
    cv=cv,
    verbose=3,
    n_jobs=-1
)

xgb_search.fit(X_tuned, y_tuned)
print("Best XGB Params:", xgb_search.best_params_)
print("Best XGB ROC-AUC:", xgb_search.best_score_)

# ==========================================================
# 3. LIGHTGBM TUNING
# ==========================================================
lgb_param_grid = {
    'n_estimators': [200, 400],
    'num_leaves': [31, 63, 127],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [-1, 10, 20],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0]
}

lgb = LGBMClassifier(
    random_state=29,
    boosting_type='gbdt',
    n_jobs=-1
)

lgb_search = GridSearchCV(
    estimator=lgb,
    param_grid=lgb_param_grid,
    scoring='roc_auc',
    cv=cv,
    verbose=3,
    n_jobs=-1
)

lgb_search.fit(X_tuned, y_tuned)
print("Best LGBM Params:", lgb_search.best_params_)
print("Best LGBM ROC-AUC:", lgb_search.best_score_)


Fitting 15 folds for each of 48 candidates, totalling 720 fits
Best RF Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 400}
Best RF ROC-AUC: 0.7810074217845973
Fitting 15 folds for each of 72 candidates, totalling 1080 fits
Best XGB Params: {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 400, 'subsample': 0.7}
Best XGB ROC-AUC: 0.7845218191958593
Fitting 15 folds for each of 216 candidates, totalling 3240 fits


In [None]:
# from sklearn.model_selection import RandomizedSearchCV, RepeatedStratifiedKFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from scipy.stats import randint, uniform

# # ------------------------------
# # Prepare Data
# # ------------------------------
# X_tuned = StandardScaler().fit_transform(X)   # Optional for trees
# y_tuned = y

# # CV setup (5 folds × 3 repeats)
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=29)

# # ==========================================================
# # 1. RANDOM FOREST (FAST RANDOM SEARCH)
# # ==========================================================
# rf_param_dist = {
#     'n_estimators': randint(200, 800),
#     'max_depth': randint(4, 30),
#     'min_samples_split': randint(2, 10),
#     'min_samples_leaf': randint(1, 5),
#     'max_features': ['sqrt', 'log2', None]
# }

# rf = RandomForestClassifier(random_state=29, n_jobs=-1)

# rf_search = RandomizedSearchCV(
#     estimator=rf,
#     param_distributions=rf_param_dist,
#     n_iter=30,                   # Faster
#     scoring='roc_auc',
#     cv=cv,
#     verbose=3,
#     random_state=29,
#     n_jobs=-1
# )

# rf_search.fit(X_tuned, y_tuned)
# print("Best RF Params:", rf_search.best_params_)
# print("Best RF ROC-AUC:", rf_search.best_score_)


# # ==========================================================
# # 2. XGBOOST RANDOM SEARCH
# # ==========================================================
# xgb_param_dist = {
#     'n_estimators': randint(200, 700),
#     'max_depth': randint(3, 10),
#     'learning_rate': uniform(0.01, 0.2),
#     'subsample': uniform(0.6, 0.4),
#     'colsample_bytree': uniform(0.6, 0.4),
#     'gamma': uniform(0, 5),
# }

# xgb = XGBClassifier(
#     random_state=29,
#     eval_metric='logloss',
#     tree_method='hist',
#     n_jobs=-1
# )

# xgb_search = RandomizedSearchCV(
#     estimator=xgb,
#     param_distributions=xgb_param_dist,
#     n_iter=40,                  # More iterations for XGB
#     scoring='roc_auc',
#     cv=cv,
#     verbose=3,
#     random_state=29,
#     n_jobs=-1
# )

# xgb_search.fit(X_tuned, y_tuned)
# print("Best XGB Params:", xgb_search.best_params_)
# print("Best XGB ROC-AUC:", xgb_search.best_score_)


# # ==========================================================
# # 3. LIGHTGBM RANDOM SEARCH
# # ==========================================================
# lgb_param_dist = {
#     'n_estimators': randint(200, 700),
#     'num_leaves': randint(31, 255),
#     'learning_rate': uniform(0.01, 0.2),
#     'max_depth': randint(-1, 20),
#     'subsample': uniform(0.6, 0.4),
#     'colsample_bytree': uniform(0.6, 0.4)
# }

# lgb = LGBMClassifier(
#     random_state=29,
#     boosting_type='gbdt',
#     n_jobs=-1
# )

# lgb_search = RandomizedSearchCV(
#     estimator=lgb,
#     param_distributions=lgb_param_dist,
#     n_iter=40,                 # More iterations → better
#     scoring='roc_auc',
#     cv=cv,
#     verbose=3,
#     random_state=29,
#     n_jobs=-1
# )

# lgb_search.fit(X_tuned, y_tuned)
# print("Best LGBM Params:", lgb_search.best_params_)
# print("Best LGBM ROC-AUC:", lgb_search.best_score_)
