## Model preambles

In this notebook, Most of the models will utilize a cross-validated grid search. In the case of Adaboost and Gradient Boosting methods, we employ Randomized search to find the best fit. This approach allows the models to explore a range of potential hyperparameters and identify the optimal configuration for each model. The following section set up the specific tuning parameters for each model that will be used later in this notebook. 

Please ensure that the necessary functions, including `credit_approval_data_cleaner` and `n_mo_delinquency`, are imported from the `functions.py` file located in the current directory before running the models.

In [254]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, RocCurveDisplay
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.metrics import Recall
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

In [17]:
credit = pd.read_csv('../data/train_cleaned.csv')
credit.head()

Unnamed: 0,id,is_delinquent,length_of_credit,number_of_delinquent_months,average_delinquency_rate,3mo_delinquency,6mo_delinquency,12mo_delinquency,flag_own_car,flag_own_realty,...,name_family_status,name_housing_type,flag_mobil,flag_work_phone,flag_phone,flag_email,occupation_type,cnt_fam_members,age,years_employed
0,5008804,0,16,2,0.125,0,0,0,Y,Y,...,Civil marriage,Rented apartment,1,1,0,0,missing,2.0,32,12
1,5008805,0,15,2,0.133333,0,0,0,Y,Y,...,Civil marriage,Rented apartment,1,1,0,0,missing,2.0,32,12
2,5008806,0,30,7,0.233333,0,0,1,Y,Y,...,Married,House / apartment,1,0,0,0,Security staff,2.0,58,3
3,5008808,1,5,2,0.4,1,1,1,N,Y,...,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8
4,5008809,0,5,0,0.0,0,0,0,N,Y,...,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52,8


In [18]:
credit.columns

Index(['id', 'is_delinquent', 'length_of_credit',
       'number_of_delinquent_months', 'average_delinquency_rate',
       '3mo_delinquency', '6mo_delinquency', '12mo_delinquency',
       'flag_own_car', 'flag_own_realty', 'cnt_children', 'amt_income_total',
       'name_income_type', 'name_education_type', 'name_family_status',
       'name_housing_type', 'flag_mobil', 'flag_work_phone', 'flag_phone',
       'flag_email', 'occupation_type', 'cnt_fam_members', 'age',
       'years_employed'],
      dtype='object')

In [259]:
X = credit.drop(columns=[
    'id', 'is_delinquent', 'number_of_delinquent_months', 'average_delinquency_rate', 
    '3mo_delinquency', '6mo_delinquency', '12mo_delinquency'])

y_1 = credit['is_delinquent']
y_2 = credit['3mo_delinquency']
y_3 = credit['6mo_delinquency']
y_4 = credit['12mo_delinquency']

In [260]:
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

## Train-test(Validation) split

In [261]:
X_train, X_val, y_train_1, y_val_1 = train_test_split(X, y_1, test_size=0.2, random_state=42)

X_train, X_val, y_train_2, y_val_2 = train_test_split(X, y_2, test_size=0.2, random_state=42)
X_train, X_val, y_train_3, y_val_3 = train_test_split(X, y_3, test_size=0.2, random_state=42)

X_train, X_val, y_train_4, y_val_4 = train_test_split(X, y_4, test_size=0.2, random_state=42)

## One-Hot-Encoding and Standard Scaling variables with Column Transform

In [245]:
cat_cols = X_train.select_dtypes(include='object').columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

sc = StandardScaler()

oh = OneHotEncoder(
    categories='auto', 
    drop='first', 
    sparse_output=False, 
    dtype='int',
    handle_unknown = 'ignore'
)

ct = ColumnTransformer(
    transformers=[
        ('oh', oh, cat_cols),
        ('sc', sc, num_cols)
    ]
)

X_train_ct = ct.fit_transform(X_train)
X_val_ct = ct.transform(X_val)

In [266]:
with open('../data/ct.pkl', 'wb') as file:
    pickle.dump(ct, file)

## Fit and scoring function

In [40]:
data_sets = [("is_delinquent", y_train_1, y_val_1), 
             ("3mo_delinquency", y_train_2, y_val_2), 
             ("6mo_delinquency", y_train_3, y_val_3)]

def fit_and_score(gs, X_train, y_train, X_val, y_val, name):
    print(f"\nPerforming grid search and scoring on dataset: {name}\n{'-'*50}")
    gs.fit(X_train, y_train)
    print(f'Training score on {name} : {gs.score(X_train, y_train)}')
    print(f'Testing score on {name} : {gs.score(X_val, y_val)}')
    best_params = gs.best_params_
    print(f"Best parameters for {name}:", best_params)
    return best_params


def fit_and_score_no_param(gs, X_train, y_train, X_val, y_val, name):
    print(f"\nPerforming grid search and scoring on dataset: {name}\n{'-'*50}")
    gs.fit(X_train, y_train)
    print(f'Training score on {name} : {gs.score(X_train, y_train)}')
    print(f'Testing score on {name} : {gs.score(X_val, y_val)}')


## Target Column Distributions

In [41]:
print(credit['is_delinquent'].value_counts(normalize=True))
print(credit['3mo_delinquency'].value_counts(normalize=True))
print(credit['6mo_delinquency'].value_counts(normalize=True))

0    0.757308
1    0.242692
Name: is_delinquent, dtype: float64
0    0.589633
1    0.410367
Name: 3mo_delinquency, dtype: float64
0    0.526401
1    0.473599
Name: 6mo_delinquency, dtype: float64


## Gradient Boosting with default parameters
Predicting the classes with 3 different targets

In [48]:
gb = GradientBoostingClassifier()

for name, y_train, y_val in data_sets:
    fit_and_score_no_param(gb, X_train_ct, y_train, X_val_ct, y_val, name)



Performing grid search and scoring on dataset: is_delinquent
--------------------------------------------------
Training score on is_delinquent : 0.778815196394076
Testing score on is_delinquent : 0.7633612363168062

Performing grid search and scoring on dataset: 3mo_delinquency
--------------------------------------------------
Training score on 3mo_delinquency : 0.758813586606568
Testing score on 3mo_delinquency : 0.7546683837733419

Performing grid search and scoring on dataset: 6mo_delinquency
--------------------------------------------------
Training score on 6mo_delinquency : 0.7706455247907277
Testing score on 6mo_delinquency : 0.766580811332904


## Gradient Boosting with GridSearchCV

In [242]:
pgrids_gb = {
    'learning_rate': [.15, .175, 0.2],
    'n_estimators': [285, 290, 295],
    'max_depth': [8, 9, 10]
}
    
gb = GradientBoostingClassifier()
rs_gb = RandomizedSearchCV(gb, param_distributions=pgrids_gb, cv=5)
rs_gb.fit(X_train_ct, y_train_3)


print(rs_gb.score(X_train_ct, y_train_3))
print(rs_gb.score(X_val_ct, y_val_3)) 


# for name, y_train, y_val in data_sets:
#     fit_and_score_no_param(gs, X_train_ct, y_train, X_val_ct, y_val, name)

0.9772215067611075
0.7905666452028333


## Ada Boost with GridSearchCV

In [238]:
%%time
tree = DecisionTreeClassifier(random_state=123)
ada = AdaBoostClassifier(estimator=tree, random_state=42)

pgrids_ada = {
    'learning_rate': [2.25, 2.5, 2.7],
    'n_estimators': [250, 300, 350],
    'estimator__max_depth': [71, 81, 91, 101, None],
    'estimator__min_samples_leaf': [7, 8, 9],
    'estimator__max_features': ['auto', 'sqrt', 'log2']
}


rs_ada = RandomizedSearchCV(ada, param_distributions=pgrids_ada, cv=5)
rs_ada.fit(X_train_ct, y_train_3)

print(rs_ada.score(X_train_ct, y_train_3))
print(rs_ada.score(X_val_ct, y_val_3))  

0.9087652929813265
0.7933032839665164
CPU times: user 4min 33s, sys: 5.55 s, total: 4min 39s
Wall time: 4min 43s


In [243]:
rs_ada.best_params_

{'n_estimators': 300,
 'learning_rate': 2.25,
 'estimator__min_samples_leaf': 7,
 'estimator__max_features': 'auto',
 'estimator__max_depth': None}


## Support Vector Classifier

In [42]:
svc = SVC(C=1, kernel='rbf', degree = 1)

for name, y_train, y_val in data_sets:
    fit_and_score_no_param(svc, X_train_ct, y_train, X_val_ct, y_val, name)


Performing grid search and scoring on dataset: is_delinquent
--------------------------------------------------
Training score on is_delinquent : 0.7627978106889891
Testing score on is_delinquent : 0.7548293625241468

Performing grid search and scoring on dataset: 3mo_delinquency
--------------------------------------------------
Training score on 3mo_delinquency : 0.7787749517063748
Testing score on 3mo_delinquency : 0.7598197037990985

Performing grid search and scoring on dataset: 6mo_delinquency
--------------------------------------------------
Training score on 6mo_delinquency : 0.788554410817772
Testing score on 6mo_delinquency : 0.7670637475853187


## Logistic Regression

In [45]:
param_grid = {
    'C':  np.linspace(1, 10,5),
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [500,1000]
    
}

gs = GridSearchCV(
estimator= LogisticRegression(),
param_grid=param_grid)


for name, y_train, y_val in data_sets:
    fit_and_score(gs, X_train_ct, y_train, X_val_ct, y_val, name)


Performing grid search and scoring on dataset: is_delinquent
--------------------------------------------------
Training score on is_delinquent : 0.7627575660012879
Testing score on is_delinquent : 0.754507405022537
Best parameters for is_delinquent: {'C': 10.0, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}

Performing grid search and scoring on dataset: 3mo_delinquency
--------------------------------------------------
Training score on 3mo_delinquency : 0.749637797810689
Testing score on 3mo_delinquency : 0.7435608499678042
Best parameters for 3mo_delinquency: {'C': 10.0, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}

Performing grid search and scoring on dataset: 6mo_delinquency
--------------------------------------------------
Training score on 6mo_delinquency : 0.7563184159690921
Testing score on 6mo_delinquency : 0.7549903412749517
Best parameters for 6mo_delinquency: {'C': 1.0, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}


## Random Forest

In [47]:
param_grid = {
    'n_estimators': [300,400],
    'max_depth': [None],
    'min_samples_split': [5,8],
    'min_samples_leaf': [3, 4],
    'max_features': ['sqrt']
}

rf = RandomForestClassifier()

gs = GridSearchCV(estimator=rf, param_grid=param_grid)

for name, y_train, y_val in data_sets:
    fit_and_score(gs, X_train_ct, y_train, X_val_ct, y_val, name)


Performing grid search and scoring on dataset: is_delinquent
--------------------------------------------------
Training score on is_delinquent : 0.8654217643271088
Testing score on is_delinquent : 0.7867031551835157
Best parameters for is_delinquent: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 400}

Performing grid search and scoring on dataset: 3mo_delinquency
--------------------------------------------------
Training score on 3mo_delinquency : 0.8577350289761752
Testing score on 3mo_delinquency : 0.7916934964584674
Best parameters for 3mo_delinquency: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 300}

Performing grid search and scoring on dataset: 6mo_delinquency
--------------------------------------------------
Training score on 6mo_delinquency : 0.8578557630392788
Testing score on 6mo_delinquency : 0.7915325177076626
Best parameters for 6mo_delinquency: {

## Dense Neural Network

In [234]:
def dnn(X_train, y_train, X_val, y_val):

    model = Sequential()
    model.add(Dense(256, input_dim=X_train_ct.shape[1], activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', 
                  optimizer='adam', 
                  metrics=['accuracy'])

    es = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit(
    X_train_ct, 
    y_train_1, 
    epochs=50, 
    batch_size=32, 
    validation_data=(X_val, y_val),
    callbacks = es
    )

    return model

In [5]:
# Model did worse with dropouts, l2 regularizer
# Performed similar with extra hidden layers

In [235]:
is_delinq = dnn(X_train_ct, y_train_1, X_val_ct, y_val_1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


In [236]:
three_month_delinq = dnn(X_train_ct, y_train_2, X_val_ct, y_val_2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


In [237]:
six_month_delinq = dnn(X_train_ct, y_train_3, X_val_ct, y_val_3)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


### Fitting AdaBoost on the full training sample

In [263]:
X_ct = ct.transform(X)

tree = DecisionTreeClassifier(random_state=123, 
                              max_depth=None, 
                              min_samples_split=7, 
                              max_features='auto')

ada = AdaBoostClassifier(estimator=tree, 
                         random_state=42, 
                         learning_rate=2.25, 
                         n_estimators=300)

ada.fit(X_ct, y_3)

print(ada.score(X_ct, y_3))

0.9714745653573729


In [264]:
with open('../data/adaboost_model.pkl', 'wb') as file:
    pickle.dump(ada, file)