In [26]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_auc_score, classification_report

# libraries to save pickle
import joblib

from time import time
# For full compatibility of Category Encoders with Pipelines and ColumnTransformers 
# import sklearn
# sklearn.set_config(transform_output="pandas")

# import category_encoders as ce

# from sklearn.preprocessing import OneHotEncoder 

In [27]:
df_raw = pd.read_csv("data/final_data/UCI_Credit_Card_Defaults_raw.csv")
df_capped = pd.read_csv("data/final_data/UCI_Credit_Card_Defaults_capped.csv")
df_transfomred = pd.read_csv("data/final_data/UCI_Credit_Card_Defaults_transformed.csv")

In [38]:
# final_columns = ['SEX_male', 'EDUCATION_high_school', 'EDUCATION_others', 'EDUCATION_university', 'MARRIAGE_others', 'MARRIAGE_single', 'PAY_SEPT_-1', 'PAY_SEPT_0', 'PAY_SEPT_1', 'PAY_SEPT_2',
#            'PAY_SEPT_3', 'PAY_SEPT_4', 'PAY_SEPT_5', 'PAY_SEPT_6', 'PAY_SEPT_7', 'PAY_SEPT_8', 'PAY_AUG_-1', 'PAY_AUG_0', 'PAY_AUG_1', 'PAY_AUG_2', 'PAY_AUG_3', 'PAY_AUG_4', 'PAY_AUG_5',
#            'PAY_AUG_6', 'PAY_AUG_7', 'PAY_AUG_8', 'PAY_JUL_-1', 'PAY_JUL_0', 'PAY_JUL_1', 'PAY_JUL_2', 'PAY_JUL_3', 'PAY_JUL_4', 'PAY_JUL_5', 'PAY_JUL_6', 'PAY_JUL_7', 'PAY_JUL_8',
#             'PAY_JUN_-1', 'PAY_JUN_0', 'PAY_JUN_1', 'PAY_JUN_2', 'PAY_JUN_3', 'PAY_JUN_4', 'PAY_JUN_5', 'PAY_JUN_6', 'PAY_JUN_7', 'PAY_JUN_8', 'PAY_MAY_-1', 'PAY_MAY_0', 'PAY_MAY_1',
#             'PAY_MAY_2', 'PAY_MAY_3', 'PAY_MAY_4', 'PAY_MAY_5', 'PAY_MAY_6', 'PAY_MAY_7', 'PAY_MAY_8', 'PAY_APR_-1', 'PAY_APR_0', 'PAY_APR_1', 'PAY_APR_2', 'PAY_APR_3', 'PAY_APR_4',
#             'PAY_APR_5', 'PAY_APR_6', 'PAY_APR_7', 'PAY_APR_8', 'AGE', 'LIMIT_BAL_tr', 'BILL_AMT_SEPT_tr', 'BILL_AMT_AUG_tr', 'BILL_AMT_JUL_tr', 'BILL_AMT_JUN_tr', 'BILL_AMT_MAY_tr',
#             'BILL_AMT_APR_tr', 'PAY_AMT_SEPT_tr', 'PAY_AMT_AUG_tr', 'PAY_AMT_JUL_tr', 'PAY_AMT_JUN_tr', 'PAY_AMT_MAY_tr', 'PAY_AMT_APR_tr', 'DEFAULT_PAYMENT']

In [39]:
# df.columns

In [8]:
df = df_transfomred
df.sample(5)

Unnamed: 0,LIMIT_BAL_tr,SEX,EDUCATION,MARRIAGE,AGE,PAY_SEPT,PAY_AUG,PAY_JUL,PAY_JUN,PAY_MAY,...,BILL_AMT_JUN_tr,BILL_AMT_MAY_tr,BILL_AMT_APR_tr,PAY_AMT_SEPT_tr,PAY_AMT_AUG_tr,PAY_AMT_JUL_tr,PAY_AMT_JUN_tr,PAY_AMT_MAY_tr,PAY_AMT_APR_tr,DEFAULT_PAYMENT
10935,49.324241,0,2,2,23,0,0,2,0,0,...,16.708762,16.75631,9.48796,5.165657,1.3459,3.138568,3.138568,3.208451,3.208451,0
30470,36.840315,1,1,1,27,0,0,1,0,0,...,9.083168,9.179902,5.795525,4.257535,0.408838,3.206313,2.292271,2.266616,2.189015,1
24583,69.79532,1,2,1,34,-1,0,0,0,0,...,22.733613,22.661801,12.174773,4.818105,3.830542,3.729985,3.728126,3.727594,3.716851,1
14026,65.421326,0,2,2,38,0,0,0,0,0,...,16.527974,16.558289,9.501004,3.98422,3.274083,3.138568,3.181595,3.208451,3.486245,0
31069,21.544347,1,2,2,42,0,0,0,0,0,...,9.311938,9.143411,0.0,3.293525,2.782058,2.91129,1.668321,0.0,0.0,1


#### Seperate out Dependant and Target Features

In [28]:
features = df.drop(columns=['DEFAULT_PAYMENT'])
labels = df['DEFAULT_PAYMENT']

####  Create Training, Validation and Test datasets

In [29]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.40, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.50, random_state=42)

In [11]:
print("Train dataset size: {} %".format(round(len(X_train)/len(features), 2)))
print("Validation dataset size: {} %".format(round(len(X_val)/len(features), 2)))
print("Test dataset size: {} %".format(round(len(X_test)/len(features), 2)))

Train dataset size: 0.6 %
Validation dataset size: 0.2 %
Test dataset size: 0.2 %


In [30]:
# Write out all the data to csv files

X_train.to_csv("data/split_data/train_features.csv", index=False)
X_val.to_csv("data/split_data/val_features.csv", index=False)
X_test.to_csv("data/split_data/test_features.csv", index=False)

y_train.to_csv("data/split_data/train_labels.csv", index=False)
y_val.to_csv("data/split_data/val_labels.csv", index=False)
y_test.to_csv("data/split_data/test_labels.csv", index=False)

#### Standardize all features

In [31]:
train_features = pd.read_csv("data/split_data/train_features.csv")
val_features = pd.read_csv("data/split_data/val_features.csv")
test_features = pd.read_csv("data/split_data/test_features.csv")

train_labels = pd.read_csv("data/split_data/train_labels.csv")
val_labels = pd.read_csv("data/split_data/val_labels.csv")
test_labels = pd.read_csv("data/split_data/test_labels.csv")

In [46]:
train_features.sample(5)

Unnamed: 0,LIMIT_BAL_clean,SEX,EDUCATION,MARRIAGE,AGE,PAY_SEPT,PAY_AUG,PAY_JUL,PAY_JUN,PAY_MAY,...,BILL_AMT_JUL_clean,BILL_AMT_JUN_clean,BILL_AMT_MAY_clean,BILL_AMT_APR_clean,PAY_AMT_SEPT_clean,PAY_AMT_AUG_clean,PAY_AMT_JUL_clean,PAY_AMT_JUN_clean,PAY_AMT_MAY_clean,PAY_AMT_APR_clean
25003,10000,1,2,1,27,1,2,2,2,1,...,5916.463302,5673.012193,6536.020401,6484.901774,75.814856,1853.802663,0,955.975613,197.82705,3378.79159
16299,200000,1,1,1,36,-1,-1,-1,-1,-1,...,4078.0,3000.0,3849.0,4479.0,1186.0,4092.0,3000,3849.0,4479.0,8120.0
16610,50000,1,2,1,40,0,0,0,0,0,...,3430.292906,4151.537516,4903.344463,5034.578501,1092.218939,1000.75496,768,798.490079,180.603968,101.09081
4268,30000,0,2,2,22,0,0,-1,2,-1,...,1900.0,150.0,2558.0,2829.0,1013.0,1900.0,0,2558.0,2829.0,3000.0
5261,232276,1,2,2,31,0,0,0,0,0,...,213170.566471,206556.937166,57597.030718,24147.668774,5217.652611,8370.206515,5697,2113.608959,748.26883,0.0


In [32]:
# Fit the scaler on training data
scaler = StandardScaler()  # Standard Scaler rescales the dataset to have Mean = 0 and Std = 1
scaler.fit(train_features)

In [33]:
data_feature = train_features.columns

In [34]:

train_features[data_feature] = scaler.transform(train_features[data_feature])
val_features[data_feature] = scaler.transform(val_features[data_feature])
test_features[data_feature] = scaler.transform(test_features[data_feature])

In [19]:
# Function to print the best params
def print_results(results):
    print("BEST PARAMS: {}\n".format(results.best_params_))
    print("BEST SCORE: {}\n".format(results.best_score_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print("{} (+/-{}) for {}".format(round(mean, 3), round(std*2, 3), params))

#### Classification Algorithms
* Logistic regression
* Support vector machine
* Naive Bayes
* Decision tree
* K-nearest neighbor
* Random forest
* AdaBoost
* Neural networks

#### Logistic Regression

In [35]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

lr = LogisticRegression()
parameters = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['lbfgs', 'liblinear', 'newton-cholesky']
}


cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=5, n_jobs=-1, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}

BEST SCORE: 0.742901755422392

nan (+/-nan) for {'C': 0.01, 'penalty': 'l1', 'solver': 'lbfgs'}
0.743 (+/-0.012) for {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'l1', 'solver': 'newton-cholesky'}
0.743 (+/-0.013) for {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.743 (+/-0.014) for {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.743 (+/-0.013) for {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cholesky'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'elasticnet', 'solver': 'lbfgs'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'elasticnet', 'solver': 'liblinear'}
nan (+/-nan) for {'C': 0.01, 'penalty': 'elasticnet', 'solver': 'newton-cholesky'}
nan (+/-nan) for {'C': 0.1, 'penalty': 'l1', 'solver': 'lbfgs'}
0.743 (+/-0.012) for {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
nan (+/-nan) for {'C': 0.1, 'penalty': 'l1', 'solver': 'newton-cholesky'}
0.743 (+/-0.015) for {'C':

In [36]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_logistic_regression.pkl")

['data/models/model_logistic_regression.pkl']

#### Support Vector Classifier

In [53]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

svc = SVC()
# parameters = {
#     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#     'C': [0.01, 0.1, 1, 10, 100]
# }
parameters = {
    'kernel': ['poly', 'rbf'],
    'C': [0.01, 0.1, 1, 10, 100]
}


cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5, n_jobs=5, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'C': 10, 'kernel': 'rbf'}

BEST SCORE: 0.7741114377865247

0.713 (+/-0.013) for {'C': 0.01, 'kernel': 'poly'}
0.723 (+/-0.012) for {'C': 0.01, 'kernel': 'rbf'}
0.736 (+/-0.011) for {'C': 0.1, 'kernel': 'poly'}
0.746 (+/-0.012) for {'C': 0.1, 'kernel': 'rbf'}
0.743 (+/-0.015) for {'C': 1, 'kernel': 'poly'}
0.76 (+/-0.01) for {'C': 1, 'kernel': 'rbf'}
0.748 (+/-0.012) for {'C': 10, 'kernel': 'poly'}
0.774 (+/-0.012) for {'C': 10, 'kernel': 'rbf'}
0.747 (+/-0.016) for {'C': 100, 'kernel': 'poly'}
0.773 (+/-0.014) for {'C': 100, 'kernel': 'rbf'}


In [54]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_support_vector_classifier.pkl")

['data/models/model_support_vector_classifier.pkl']

#### Naive Bayes

In [55]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

nb = GaussianNB()
parameters = {
    'var_smoothing': np.logspace(0,-9, num=100)
}


cv = GridSearchCV(estimator=nb, param_grid=parameters, cv=5, n_jobs=5, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'var_smoothing': 0.3511191734215131}

BEST SCORE: 0.6325084651929127

0.628 (+/-0.005) for {'var_smoothing': 1.0}
0.629 (+/-0.005) for {'var_smoothing': 0.8111308307896871}
0.631 (+/-0.006) for {'var_smoothing': 0.657933224657568}
0.632 (+/-0.007) for {'var_smoothing': 0.533669923120631}
0.632 (+/-0.005) for {'var_smoothing': 0.43287612810830584}
0.633 (+/-0.008) for {'var_smoothing': 0.3511191734215131}
0.631 (+/-0.008) for {'var_smoothing': 0.2848035868435802}
0.631 (+/-0.009) for {'var_smoothing': 0.23101297000831597}
0.63 (+/-0.008) for {'var_smoothing': 0.1873817422860384}
0.63 (+/-0.009) for {'var_smoothing': 0.15199110829529336}
0.63 (+/-0.008) for {'var_smoothing': 0.12328467394420659}
0.629 (+/-0.009) for {'var_smoothing': 0.1}
0.629 (+/-0.009) for {'var_smoothing': 0.08111308307896872}
0.628 (+/-0.009) for {'var_smoothing': 0.0657933224657568}
0.628 (+/-0.009) for {'var_smoothing': 0.0533669923120631}
0.628 (+/-0.01) for {'var_smoothing': 0.04328761281083057}
0.

In [56]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_naive_bayes_classifier.pkl")

['data/models/model_naive_bayes_classifier.pkl']

### Decision Tree Classifier

In [57]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

dtc = DecisionTreeClassifier()
parameters = {
    'criterion': ["gini", "entropy", "log_loss"],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [5, 10, 20, 50, 100],
    "max_depth": [2, 3, 5, 10, 20]    
}


cv = GridSearchCV(estimator=dtc, param_grid=parameters, cv=5, n_jobs=5, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 20}

BEST SCORE: 0.7388355560745725

nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 5}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 10}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 20}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 50}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 100}
0.654 (+/-0.056) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 5}
0.632 (+/-0.041) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 10}
0.653 (+/-0.045) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 20}
0.646 (+/-0.025) for {'criter

In [58]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_decision_tree_classifier.pkl")

['data/models/model_decision_tree_classifier.pkl']

#### KNN Classifer

In [59]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

knc = KNeighborsClassifier()
parameters = { 
    'n_neighbors' : [2, 5, 7, 9, 11, 13, 15, 30, 60],
     'weights' : ['uniform','distance'],
     'metric' : ['minkowski','euclidean','manhattan'],
     "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"]
     }


cv = GridSearchCV(estimator=knc, param_grid=parameters, cv=5, n_jobs=-1, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'algorithm': 'ball_tree', 'metric': 'manhattan', 'n_neighbors': 13, 'weights': 'distance'}

BEST SCORE: 0.7807814933289776

0.733 (+/-0.012) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'uniform'}
0.744 (+/-0.017) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 2, 'weights': 'distance'}
0.741 (+/-0.014) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'uniform'}
0.755 (+/-0.013) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}
0.743 (+/-0.016) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 7, 'weights': 'uniform'}
0.76 (+/-0.013) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 7, 'weights': 'distance'}
0.743 (+/-0.017) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'uniform'}
0.761 (+/-0.015) for {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'distance'}
0.742 (+/-0

In [60]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_kneighbors_classifier.pkl")

['data/models/model_kneighbors_classifier.pkl']

### Random Forest Classifier

In [61]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

rfc = RandomForestClassifier()
parameters = { 
    'n_estimators' : [50, 100, 150, 200],
    'criterion': ["gini", "entropy", "log_loss"],
    "max_features": ["auto", "sqrt", "log2"],
    "min_samples_leaf": [5, 10, 20, 50, 100],
    "max_depth": range(2, 20, 3)   
     }


cv = GridSearchCV(estimator=rfc, param_grid=parameters, cv=5, n_jobs=-1, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'criterion': 'log_loss', 'max_depth': 17, 'max_features': 'log2', 'min_samples_leaf': 5, 'n_estimators': 150}

BEST SCORE: 0.8045365775967325

nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 50}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 100}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 150}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 5, 'n_estimators': 200}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 10, 'n_estimators': 50}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 10, 'n_estimators': 100}
nan (+/-nan) for {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto', 'min_samples_leaf': 10, 'n

In [62]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_random_forest_classifier.pkl")

['data/models/model_random_forest_classifier.pkl']

### AdaBoost Classifier

In [63]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

abc = AdaBoostClassifier()
parameters = { 
    'n_estimators' : [100, 500, 1000, 5000]
     }


cv = GridSearchCV(estimator=abc, param_grid=parameters, cv=5, n_jobs=-1, scoring='accuracy')
cv.fit(train_features, train_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'n_estimators': 5000}

BEST SCORE: 0.7825295184565249

0.763 (+/-0.015) for {'n_estimators': 100}
0.775 (+/-0.012) for {'n_estimators': 500}
0.779 (+/-0.012) for {'n_estimators': 1000}
0.783 (+/-0.003) for {'n_estimators': 5000}


In [64]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_adaboost_classifier.pkl")

['data/models/model_adaboost_classifier.pkl']

### Gradient Boosting Classifier

In [65]:
# Conduct search for best params while running Cross-Validation (GridSearchCV)

gbc = GradientBoostingClassifier()
parameters = { 
    'n_estimators' : [100, 500, 1000, 5000],
    'max_depth':range(5,16,2), 
    'min_samples_split':range(200,2100,200),
    'min_samples_leaf':range(30,71,10),
    'max_features':range(7,20,2),
    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]
     }

# cv = GridSearchCV(estimator=gbc, param_grid=parameters, cv=5, n_jobs=-1, scoring='accuracy')
# cv.fit(train_features, train_labels.values.ravel())

# print_results(cv)

In [66]:
# Write out the model fit on the whole training set
joblib.dump(cv.best_estimator_, "data/models/model_gradient_boosting_classifier.pkl")

['data/models/model_gradient_boosting_classifier.pkl']

In [37]:
# Read models

models = {}

for mdl in ['logistic_regression', 'support_vector_classifier', 'naive_bayes_classifier', 'decision_tree_classifier', 
            'kneighbors_classifier', 'random_forest_classifier', 'adaboost_classifier', 'gradient_boosting_classifier']:
    models[mdl] = joblib.load("data/models/model_{}.pkl".format(mdl))

print("Models: {}".format(models))

Models: {'logistic_regression': LogisticRegression(C=0.01, penalty='l1', solver='liblinear'), 'support_vector_classifier': SVC(C=10), 'naive_bayes_classifier': GaussianNB(var_smoothing=0.3511191734215131), 'decision_tree_classifier': DecisionTreeClassifier(criterion='entropy', max_depth=20, max_features='sqrt',
                       min_samples_leaf=20), 'kneighbors_classifier': KNeighborsClassifier(algorithm='ball_tree', metric='manhattan', n_neighbors=13,
                     weights='distance'), 'random_forest_classifier': RandomForestClassifier(criterion='log_loss', max_depth=17, max_features='log2',
                       min_samples_leaf=5, n_estimators=150), 'adaboost_classifier': AdaBoostClassifier(n_estimators=5000), 'gradient_boosting_classifier': AdaBoostClassifier(n_estimators=5000)}


#### Model Evaluation

In [40]:
# Function to evaluate model scores
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()

    accuracy = round(accuracy_score(y_pred=pred, y_true=labels), 2)
    precision = round(precision_score(y_pred=pred, y_true=labels), 2)
    recall = round(recall_score(y_pred=pred, y_true=labels), 2)
    f1 = round(f1_score(y_pred=pred, y_true=labels), 2)
    roc_auc = round(roc_auc_score(y_score=pred, y_true=labels), 2)

    print("{} -- \tAccuracy: {} | Precision: {} | Recall: {} | F1: {} | ROC-AUD: {} | Latency: {}ms".format(
        name, accuracy, precision, recall, f1, roc_auc, round((end-start), 2)
    ))

    print(classification_report(pred, labels))

In [41]:
evaluate_model(name="LogisticRegression", model=models['logistic_regression'], features=val_features, labels=val_labels)
evaluate_model(name="SupportVectorClassifier", model=models['support_vector_classifier'], features=val_features, labels=val_labels)
evaluate_model(name="NaiveBayesClassifier", model=models['naive_bayes_classifier'], features=val_features, labels=val_labels)
evaluate_model(name="DecisionTreeClassifier", model=models['decision_tree_classifier'], features=val_features, labels=val_labels)
evaluate_model(name="KNeighborsClassifier", model=models['kneighbors_classifier'], features=val_features, labels=val_labels)
evaluate_model(name="RandomForestClassifier", model=models['random_forest_classifier'], features=val_features, labels=val_labels)
evaluate_model(name="AdaBoostClassifier", model=models['adaboost_classifier'], features=val_features, labels=val_labels)
# evaluate_model(name="GradientBoostingClassifier", model=models['gradient_boosting_classifier'], features=val_features, labels=val_labels)



LogisticRegression -- 	Accuracy: 0.73 | Precision: 0.75 | Recall: 0.71 | F1: 0.73 | ROC-AUD: 0.73 | Latency: 0.01ms
              precision    recall  f1-score   support

           0       0.75      0.72      0.73      4845
           1       0.71      0.75      0.73      4501

    accuracy                           0.73      9346
   macro avg       0.73      0.73      0.73      9346
weighted avg       0.73      0.73      0.73      9346



ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- BILL_AMT_APR_tr
- BILL_AMT_AUG_tr
- BILL_AMT_JUL_tr
- BILL_AMT_JUN_tr
- BILL_AMT_MAY_tr
- ...
Feature names seen at fit time, yet now missing:
- BILL_AMT_APR_clean
- BILL_AMT_AUG_clean
- BILL_AMT_JUL_clean
- BILL_AMT_JUN_clean
- BILL_AMT_MAY_clean
- ...


### Evaluate our final model on test set

Out of 5 models:
* **Accuracy** : KNeighborsClassifier > SupportVectorClassifier > LogisticRegression > DecisionTreeClassifier > NaiveBayesClassifier
* **Precision** : NaiveBayesClassifier > SupportVectorClassifier> KNeighborsClassifier > DecisionTreeClassifier > LogisticRegression
* **Recall** : KNeighborsClassifier > SupportVectorClassifier > LogisticRegression > DecisionTreeClassifier > NaiveBayesClassifier
* **F1** : KNeighborsClassifier > SupportVectorClassifier > LogisticRegression > DecisionTreeClassifier > NaiveBayesClassifier
* **ROC-AUC** : KNeighborsClassifier > SupportVectorClassifier > LogisticRegression > DecisionTreeClassifier > NaiveBayesClassifier
* **Latency** : DecisionTreeClassifier > LogisticRegression == NaiveBayesClassifier > KNeighborsClassifier > SupportVectorClassifier

Since Precision measures the accuracy of positive predictions, while recall measures the completeness of positive predictions.

**NOTE**: In the related research field of credit default risk prediction, ``improving the recall rate (i.e reducing FN) is crucial for banks and other lending institutions``. The recall rate refers to the proportion of all true positive examples that are correctly identified as positive examples.

Ideally, we do not want to allow any defaults to fall through the cracks, so our optimal model will minimize False Negatives (So RecallScore is as high as possible).

Since `KNeighborsClassifier` outperforms all others models in terms of `Accuracy`, `Recall`, `F1` & `ROC-AUC` except `Precision` and `Latency`, we will go ahead with it as final model

In [71]:
# Evaluate the final model

evaluate_model(name="RandomForestClassifier", model=models['random_forest_classifier'], features=test_features, labels=test_labels)

RandomForestClassifier -- 	Accuracy: 0.81 | Precision: 0.82 | Recall: 0.78 | F1: 0.8 | ROC-AUD: 0.81 | Latency: 0.17ms


In [48]:
# Function to print the best params
from time import time
def evaluate_models(models, features, labels):
    best_models = {}
    for model_name, (model, param) in models.items():
        model_name = str(model).split("()")[0]
        print("\n\n========================={}=======================".format(model_name))
        start = time()
        random_search = RandomizedSearchCV(estimator=model, param_distributions=param, n_iter=10, cv=5, n_jobs=-1, scoring='recall', verbose=3)
        random_search.fit(features, labels)
        end = time()
        best_model = random_search.best_estimator_
        best_params = random_search.best_params_
        best_score = random_search.best_score_
        # print("BEST ESTIMATOR: {}\n".format(best_model))
        # print("BEST PARAMS: {}\n".format(best_params))        
        print("Model took: {} secs\n".format(end-start))
        
        best_models[model_name] = {
        'model': best_model,
        'best_params': best_params,
        'best_score': best_score}

        # joblib.dump(cv.best_estimator_, "data/practice1/model_{}_classifier.pkl".format(model_name))
    print("\nModel Report: {}".format(best_models))

In [50]:
models = {
    'RandomForest': (RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'criterion': ["gini", "entropy", "log_loss"],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }),
    'GradientBoosting': (GradientBoostingClassifier(), {
        'n_estimators': [50, 100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }),
    'SVM': (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto'],
    }),
    'LogisticRegression': (LogisticRegression(), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga'],
        'max_iter': [100, 200, 300, 400, 500],
    }),
    'NearestNeighbors': (KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2],
    }),
    'DecisionTree': (DecisionTreeClassifier(), {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    }),
}

# models = (
#     {
#         LogisticRegression(): {
#             'penalty': ['l1', 'l2', 'elasticnet'],
#             'C': [0.01, 0.1, 1, 10, 100],
#             'solver': ['lbfgs', 'liblinear', 'newton-cholesky']
#             }},
#     {
#         GaussianNB(): {
#             'var_smoothing': np.logspace(0,-9, num=100)
#             }},
    # {
    #     AdaBoostClassifier(): { 
    #         'n_estimators' : [100, 500, 1000, 5000]
    #         }},
    # {
    #     SVC(): {
    #         'kernel': ['poly', 'rbf', 'sigmoid'],
    #         'C': [0.01, 0.1, 1, 10, 100],
    #         'degree': [2, 3, 4, 5]
    #         }},
    # {
    #     RandomForestClassifier(): { 
    #         'n_estimators' : [50, 100, 150, 200],
    #         'criterion': ["gini", "entropy", "log_loss"],
    #         "max_features": ["auto", "sqrt", "log2"],
    #         "min_samples_leaf": [5, 10, 20, 50, 100],
    #         "max_depth": range(2, 20, 3)   
    #         }},
    # {
    #     GradientBoostingClassifier(): { 
    #         'n_estimators' : [100, 500, 1000, 5000],
    #         'max_depth':range(5,16,2), 
    #         'min_samples_split':range(200,2100,200),
    #         'min_samples_leaf':range(30,71,10),
    #         'max_features':range(7,20,2),
    #         'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]
    #         }},
    # {
    #     KNeighborsClassifier(): { 
    #         'n_neighbors' : [2, 5, 7, 9, 11, 13, 15, 30, 60],
    #         'weights' : ['uniform','distance'],
    #         'metric' : ['minkowski','euclidean','manhattan'],
    #         "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"]
    #         }},
    # {
    #     DecisionTreeClassifier(): {
    #         'criterion': ["gini", "entropy", "log_loss"],
    #         "max_features": ["auto", "sqrt", "log2"],
    #         "min_samples_leaf": [5, 10, 20, 50, 100],
    #         "max_depth": [2, 3, 5, 10, 20]    
    #         }}
            # )

evaluate_models(models, train_features, train_labels.values.ravel())



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model took: 92.84499859809875 secs



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model took: 264.45066833496094 secs



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model took: 314.42439460754395 secs



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model took: 1.0536093711853027 secs



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model took: 13.51507043838501 secs



Fitting 5 folds for each of 10 candidates, totalling 50 fits
Model took: 2.7622010707855225 secs


Model Report: {'RandomForestClassifier': {'model': RandomForestClassifier(criterion='log_loss', min_samples_split=5,
                       n_estimators=200), 'best_params': {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': None, 'criterion': 'log_loss'}, 'best_score': 0.8357465461834822}, 'GradientBoostingClassifier': {'model': GradientBoostingClassifier(max_depth=7, n_estimators=200), 'b

In [53]:
best_models = {'RandomForestClassifier': {'model': RandomForestClassifier(criterion='log_loss', min_samples_split=5,
                       n_estimators=200), 'best_params': {'n_estimators': 200, 'min_samples_split': 5, 'max_depth': None, 'criterion': 'log_loss'}, 'best_score': 0.8357465461834822}, 'GradientBoostingClassifier': {'model': GradientBoostingClassifier(max_depth=7, n_estimators=200), 'best_params': {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 7, 'learning_rate': 0.1}, 'best_score': 0.8320727834856928}, 'SVC': {'model': SVC(C=1, gamma='auto'), 'best_params': {'kernel': 'rbf', 'gamma': 'auto', 'C': 1}, 'best_score': 0.7828501768346933}, 'LogisticRegression': {'model': LogisticRegression(C=0.001, max_iter=200, solver='liblinear'), 'best_params': {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 200, 'C': 0.001}, 'best_score': 0.7425808171821091}, 'KNeighborsClassifier': {'model': KNeighborsClassifier(algorithm='kd_tree', n_neighbors=7, p=1), 'best_params': {'weights': 'uniform', 'p': 1, 'n_neighbors': 7, 'algorithm': 'kd_tree'}, 'best_score': 0.781780244889527}, 'DecisionTreeClassifier': {'model': DecisionTreeClassifier(max_depth=10, min_samples_leaf=2, min_samples_split=10), 'best_params': {'splitter': 'best', 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 10, 'criterion': 'gini'}, 'best_score': 0.762947445456782}}

In [58]:
best_model_name = max(best_models, key=lambda x: best_models[x]['best_score'])
best_score = best_models[best_model_name]['best_score']
best_model = best_models[best_model_name]['model']

In [59]:
best_score

0.8357465461834822

In [57]:
best_model_name

'RandomForestClassifier'