In [1]:
import pickle
from typing import Dict

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from xgboost import XGBClassifier

from Data_Processing import preprocessing, modify_target_binary

In [2]:
# Read the accepted-loans CSV into memory and report when it is done.
loan_df = pd.read_csv("data/accepted_2007_to_2018Q4.csv")
print("DataFrame read!")

# Run the project's modify_target_binary helper on the "loan_status" column
# (presumably it maps statuses to a binary label -- confirm in Data_Processing).
loan_df = modify_target_binary(loan_df, "loan_status")

# Drop rows whose status is "Current" or "Issued"; .copy() detaches the result
# from the frame it was sliced from.
# NOTE(review): this filter runs AFTER modify_target_binary. If that helper
# already replaces every status string, no row matches and nothing is removed
# here -- confirm the helper leaves "Current"/"Issued" untouched.
excluded_status_mask = loan_df["loan_status"].isin(["Current", "Issued"])
loan_df = loan_df.loc[~excluded_status_mask].copy()

  loan_df = pd.read_csv("data/accepted_2007_to_2018Q4.csv")


DataFrame read!


Hyperparameter tuning with randomized search and stratified 5-fold cross-validation (balanced class weights are applied to the Random Forest only)

In [None]:
# 5-fold stratified splitter, shuffled with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Candidate values per Random Forest hyperparameter; single-element entries
# are effectively fixed settings.
RF_grid_dist = dict(
    n_estimators=[300],
    max_depth=np.arange(5, 61, 5),
    min_samples_split=np.arange(2, 21),
    max_features=["sqrt", "log2"],
    min_samples_leaf=np.arange(2, 21),
    class_weight=["balanced"],
    n_jobs=[-1],
)

# Candidate values per XGBoost hyperparameter.
XGBoost_grid_dist = dict(
    n_estimators=np.array([300]),
    max_depth=np.arange(5, 61, 5),
    learning_rate=np.array([0.001, 0.01, 0.1, 0.5]),
    objective=["binary:logistic"],
    subsample=np.arange(0.1, 1.0, 0.1),
    colsample_bytree=np.arange(0.1, 1.0, 0.1),
    reg_alpha=np.array([0.01, 0.05, 0.1, 0.5, 0.8]),
    reg_lambda=np.array([0.01, 0.05, 0.1, 0.5, 0.8]),
)

# Function to perform random search with stratified k-fold validation
def RANDOM_SEARCH_CV_SS(param_dist: Dict, n_iterations: int, X_train: pd.DataFrame, y_train: pd.Series, model_classifier, cv: "StratifiedKFold"):
    """Randomly sample hyperparameter sets and keep the one with the best mean F1.

    Each iteration draws one value per key of ``param_dist``, then for every
    (train, validation) split produced by ``cv.split`` builds
    ``model_classifier(**params)``, fits it on the training part and scores
    its predictions on the validation part with ``f1_score``. The draw whose
    mean F1 across splits is strictly the highest so far is kept.

    Args:
        param_dist: Maps each constructor keyword to an indexable sequence
            (list or array) of candidate values.
        n_iterations: Number of random draws to evaluate.
        X_train: Feature frame; rows are selected positionally with ``.iloc``.
        y_train: Target series aligned positionally with ``X_train``.
        model_classifier: Callable/class that accepts the sampled parameters
            as keyword arguments and returns an object with ``fit`` and
            ``predict``.
        cv: Splitter exposing ``split(X, y)`` that yields positional
            ``(train_idx, val_idx)`` pairs.

    Returns:
        ``(best_params, best_score, best_model)``. ``best_model`` is the
        estimator fitted on the LAST split of the winning draw, so it has only
        seen that split's training rows, not all of ``X_train``. When no draw
        is accepted (``n_iterations == 0`` or every mean score is NaN) the
        result is ``(None, -inf, None)``.
    """
    best_score = -np.inf
    best_params = None
    # Bound up front: previously the return statement raised UnboundLocalError
    # whenever no draw was ever accepted.
    best_model = None

    for _ in tqdm(range(n_iterations)):
        # Pick by index instead of np.random.choice(values): choice() converts
        # the candidates to an array first, which turns mixed lists such as
        # ["sqrt", 0.5] into strings and wraps plain str/int values in numpy
        # scalar types.
        params = {
            name: candidates[np.random.randint(len(candidates))]
            for name, candidates in param_dist.items()
        }

        fold_f1_scores = []
        for train_idx, val_idx in cv.split(X_train, y_train):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

            # A fresh estimator per split so no state carries over.
            model = model_classifier(**params)
            model.fit(X_tr, y_tr)
            fold_f1_scores.append(f1_score(y_val, model.predict(X_val)))

        mean_f1 = np.mean(fold_f1_scores)

        # Strict comparison: ties keep the earlier draw, NaN never wins.
        if mean_f1 > best_score:
            best_score = mean_f1
            best_model = model
            best_params = params

    return best_params, best_score, best_model
    
# Function to tune a classifier's hyperparameters on a sample of the data
def hyperparameter_tuning(rows_sample:int, df:pd.DataFrame, cv:StratifiedKFold, param_dist:Dict, model_classifier, n_iterations:int = 50, random_state = None):
    """Run the random search on a row sample of ``df`` and print the winner.

    Args:
        rows_sample: Number of rows to draw from ``df``; capped at ``len(df)``.
        df: Frame that contains the features plus a ``"loan_status"`` target
            column.
        cv: Splitter forwarded to ``RANDOM_SEARCH_CV_SS``.
        param_dist: Candidate values per hyperparameter, forwarded unchanged.
        model_classifier: Estimator class/callable, forwarded unchanged.
        n_iterations: Number of random draws to evaluate (default 50, the
            value that used to be hard-coded).
        random_state: Optional seed passed to ``DataFrame.sample`` so the row
            sample is repeatable. It does not seed the hyperparameter draws
            themselves. ``None`` keeps the previous unseeded behaviour.

    Returns:
        ``(best_params, best_score, best_model)`` exactly as returned by
        ``RANDOM_SEARCH_CV_SS``.
    """
    # DataFrame.sample raises when asked for more rows than exist (without
    # replacement), so never request more than the frame holds.
    n_rows = min(rows_sample, len(df))
    df_sample = df.sample(n = n_rows, random_state = random_state)
    X_sample =  df_sample.drop(columns="loan_status")
    y_sample =  df_sample["loan_status"]
    # NOTE(review): assumes preprocessing() returns one row per input row, in
    # the same order; if it drops or reorders rows, X_sample and y_sample fall
    # out of alignment -- confirm in Data_Processing.
    # NOTE(review): preprocessing runs once on the whole sample before the CV
    # split; if it learns statistics from the data (imputation, scaling,
    # encoding) the validation folds are not fully held out -- confirm.
    X_sample = preprocessing(X_sample)
    best_params, best_score, best_model = RANDOM_SEARCH_CV_SS(param_dist, n_iterations, X_sample, y_sample, model_classifier, cv)
    print(f"Best CV F1-score: {best_score}")
    print(best_params)
    return best_params, best_score, best_model

# Tune both model families on a 10,000-row sample of loan_df.
print("Random Forest: ")
best_RF_params, best_RF_f1_score, best_RF_model = hyperparameter_tuning(10000, loan_df, skf, RF_grid_dist, RandomForestClassifier)
print("XGBoost :")
best_XGB_params, best_XGB_f1_score, best_XGB_model = hyperparameter_tuning(10000, loan_df, skf, XGBoost_grid_dist, XGBClassifier)
# Both calls used to unpack into the same name, so the Random Forest score was
# overwritten by the XGBoost one. Each score now has its own variable;
# best_f1_score keeps the value it ended up with before (the XGBoost score)
# for any cell that still reads it.
best_f1_score = best_XGB_f1_score

Random Forest: 


100%|██████████| 50/50 [32:59<00:00, 39.59s/it]


Best CV F1-score: 0.7172723166024167
{'n_estimators': np.int64(300), 'max_depth': np.int64(30), 'min_samples_split': np.int64(17), 'max_features': np.str_('sqrt'), 'min_samples_leaf': np.int64(2), 'class_weight': np.str_('balanced')}
XGBoost :


100%|██████████| 50/50 [1:25:42<00:00, 102.85s/it]   

Best CV F1-score: 0.7417546970210589
{'n_estimators': np.int64(300), 'max_depth': np.int64(5), 'learning_rate': np.float64(0.1), 'objective': np.str_('binary:logistic'), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.1), 'reg_alpha': np.float64(0.5), 'reg_lambda': np.float64(0.1)}





In [None]:
# Training using best hyperparameters on the whole dataset
X = loan_df.drop(columns=["loan_status"])
y = loan_df["loan_status"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2,
                                                    random_state = 42,
                                                    shuffle = True,
                                                    stratify = y)

# NOTE(review): train and test go through preprocessing() independently. If
# that helper derives anything from the data it receives (category levels,
# imputation values, scaling), the two frames may end up with different
# columns or encodings -- confirm in Data_Processing.
X_train, X_test = preprocessing(X_train), preprocessing(X_test)
print("Processing finished!")

# XGBoost best params
XGB_model = XGBClassifier(**best_XGB_params)
XGB_model.fit(X_train,y_train)
print("XBG model trained!")
# Save right away: fitting the Random Forest below is slow, and interrupting
# it previously meant the already-trained XGBoost model was never written.
XGB_model.save_model("xgb_model_1.json")

# Random Forest best params
RF_model = RandomForestClassifier(**best_RF_params)
RF_model.fit(X_train,y_train)
print("Random Forest trained!")
# RandomForestClassifier has no save_model() method (that is an XGBoost API),
# so the estimator is pickled instead. Only unpickle this file from a trusted
# location: loading a pickle can execute arbitrary code.
with open("rf_model_1.pkl", "wb") as rf_file:
    pickle.dump(RF_model, rf_file)
print("Models saved!")

Processing finished!


KeyboardInterrupt: 