In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pickle
import shap
import warnings
warnings.filterwarnings("ignore")

In [None]:
def clean_column_spaces(df):
    """Remove every space character from the column labels of ``df``.

    The labels are rewritten on the frame that was passed in (no copy is
    made), and that same frame is returned so the call can be chained or
    re-assigned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, with spaces stripped from its column labels.
    """
    df.columns = [label.replace(' ', '') for label in df.columns.tolist()]
    return df

def separate_train_test(df, testsize=0.2):
    """Split ``df`` into train/test feature and target sets.

    The first column is dropped, the last column is used as the target and
    everything in between is used as the feature matrix. The split itself is
    delegated to ``train_test_split`` with a fixed ``random_state`` of 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; sliced positionally with ``iloc``.
    testsize : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    features = df.iloc[:, 1:-1]
    target = df.iloc[:, -1]
    splits = train_test_split(features, target, test_size=testsize, random_state=0)
    features_train, features_test, target_train, target_test = splits
    return features_train, features_test, target_train, target_test

In [None]:
# Load the raw data, strip spaces from the column labels and split it.
df = pd.read_csv('loan_approval_dataset.csv')
df = clean_column_spaces(df)
X_train, X_test, y_train, y_test = separate_train_test(df)

# Work on explicit copies: the loop below assigns encoded columns back into
# these frames, and they were produced by slicing `df`.
X_train = X_train.copy()
X_test = X_test.copy()

# Target encoder: fitted on the training labels only, then persisted.
# The `with` block guarantees the pickle file is flushed and closed.
le_y = LabelEncoder()
le_y.fit(y_train)
with open('models/encoding_y.pkl', 'wb') as fh:
    pickle.dump(le_y, fh)

# One encoder per categorical feature, fitted on the training values only.
# Each encoder is written to models/encoding_<column>.pkl and then applied to
# both splits.
# NOTE(review): a category present only in X_test would make `transform`
# fail, because the encoder never saw it during `fit`.
for col in ['education', 'self_employed']:
    le = LabelEncoder()
    le.fit(X_train[col].unique())
    with open('models/encoding_' + col + '.pkl', 'wb') as fh:
        pickle.dump(le, fh)
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# Keep the encoded training frame (with column labels) under its own name;
# `X_train` itself becomes the bare NumPy array of the same data.
X_train_df = X_train
X_train = X_train.values

# Encode both target vectors with the encoder fitted above.
y_train = le_y.transform(y_train)
y_test = le_y.transform(y_test)

In [None]:
# Model imports
import random
random.seed(0)  # seeds Python's `random` module only; estimators are seeded below

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Baseline models with default hyper-parameters. Each estimator gets an
# explicit seed so that re-running the notebook reproduces the same scores.
rf_model = RandomForestClassifier(random_state=0, verbose=False)
cat_model = CatBoostClassifier(random_seed=0, verbose=False)
xgb_model = XGBClassifier(random_state=0, verbosity=0)

# Train on the labelled DataFrame rather than the bare array.
X_train = X_train_df

# Model fitting
rf_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

models = [rf_model, cat_model, xgb_model]

# 3-fold cross-validated accuracy (in percent) on the training split.
# `type(m).__name__` yields the class name for every estimator, so no
# per-model string parsing is needed.
result = pd.DataFrame(
    [
        {
            'model_name': 'model_' + type(m).__name__,
            'accuracy': cross_val_score(estimator=m, X=X_train, y=y_train, cv=3).mean() * 100,
        }
        for m in models
    ],
    columns=['model_name', 'accuracy'],
)

print('CROSS-VAL RESULTS')
print(result)

# Hold-out evaluation: accuracy on the predicted labels, ROC AUC on the
# predicted probability of the class encoded as 1.
print('=============================')
print('TEST RESULTS')
for m in models:
    y_pred = m.predict(X_test)
    ac = accuracy_score(y_test, y_pred)
    rocauc = roc_auc_score(y_test, m.predict_proba(X_test)[:, 1])
    print('model_' + type(m).__name__ + ' -> Accuracy : ' + str(ac) + ' ROCAUC :' + str(rocauc))
    
    

In [None]:
#RF PARAMS
from sklearn.model_selection import GridSearchCV

# Exhaustive search over 4 * 2 * 5 * 2 = 80 candidates, 3 folds each.
# 'auto' is intentionally not listed for `max_features`: it was an alias of
# 'sqrt' for classifiers and is rejected by current scikit-learn releases,
# which would make every candidate using it fail to fit.
param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 4, 6, 8, 10],
    'criterion': ['gini', 'entropy']
}

# Seeded so that the ranking of candidates is reproducible between runs.
rf_model = RandomForestClassifier(random_state=0, verbose=False)

grid_search = GridSearchCV(estimator=rf_model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_accuracy_rf = grid_search.best_score_
best_parameters_rf = grid_search.best_params_
print(best_parameters_rf)
# Output pasted from an earlier run (presumably produced by the print above):
#{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 500}

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

#XGB PARAMS
from sklearn.model_selection import GridSearchCV

# Exhaustive search: 8 * 6 * 6 * 5 * 5 * 5 = 36,000 candidates, i.e. 108,000
# fits with cv=3. Expect this cell to dominate the notebook's run time.
param_grid = {'gamma': [0, 0.1, 0.2, 0.4, 1.6, 10, 50, 100],
              'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5, 0.6],
              'max_depth': [5, 8, 10, 50, 80, 100],
              'n_estimators': [50, 80, 100, 150, 200],
              'reg_alpha': [0, 0.1, 5, 20, 50],
              'reg_lambda': [0, 0.1, 5, 20, 50]}

# Seeded so that the ranking of candidates is reproducible between runs.
xgb_model = XGBClassifier(random_state=0, verbosity=0)

grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
# Stored under XGBoost-specific names so the RandomForest search results
# (`best_accuracy_rf` / `best_parameters_rf`) are not overwritten.
best_accuracy_xgb = grid_search.best_score_
best_parameters_xgb = grid_search.best_params_
print(best_parameters_xgb)
# NOTE(review): the recorded output below lists keys (max_features,
# min_samples_leaf, min_samples_split) that are not part of this grid, so it
# looks stale - re-run the search and replace it.
#{'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 200}

In [None]:
# CatBoost hyper-parameter search: 4 * 4 * 5 = 80 candidates, 3 folds each.
# `depth` stops at 10: CatBoost caps tree depth at 16, so a value such as 20
# can never be fitted and would only produce failed candidates.
param_grid = {'learning_rate': [0.03, 0.1, 0.2, 0.5],
              'depth': [2, 4, 6, 10],
              'l2_leaf_reg': [1, 3, 5, 7, 9]}

# Seeded so that the ranking of candidates is reproducible between runs.
model = CatBoostClassifier(random_seed=0, verbose=False)
grid_search = GridSearchCV(estimator=model,
                           param_grid=param_grid,
                           scoring='accuracy',
                           cv=3,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
# Stored under CatBoost-specific names so the RandomForest search results
# (`best_accuracy_rf` / `best_parameters_rf`) are not overwritten.
best_accuracy_cat = grid_search.best_score_
best_parameters_cat = grid_search.best_params_
print(best_parameters_cat)
#{'depth': 10, 'l2_leaf_reg': 7, 'learning_rate': 0.03}

In [None]:
import shap

# SHAP summary for models[0], the RandomForestClassifier placed first in
# `models`. The explainer wraps the model's `predict` callable and uses
# X_test both as the masker (background data) and as the rows to explain.
explainer = shap.Explainer(model=models[0].predict, masker=X_test)
shap_values = explainer(X_test)
shap.summary_plot(shap_values=shap_values)

In [None]:
import shap

# SHAP summary for models[1], the CatBoostClassifier placed second in
# `models`. The explainer wraps the model's `predict` callable and uses
# X_test both as the masker (background data) and as the rows to explain.
explainer = shap.Explainer(model=models[1].predict, masker=X_test)
shap_values = explainer(X_test)
shap.summary_plot(shap_values=shap_values)

In [None]:
import shap

# SHAP summary for models[2], the XGBClassifier placed third in `models`.
# The explainer wraps the model's `predict` callable and uses X_test both as
# the masker (background data) and as the rows to explain.
explainer = shap.Explainer(model=models[2].predict, masker=X_test)
shap_values = explainer(X_test)
shap.summary_plot(shap_values=shap_values)

In [None]:
#FINAL MODELS
# Model imports
import random
random.seed(0)  # seeds Python's `random` module only; estimators are seeded below

from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# Tuned estimators. Every model carries an explicit seed so that the scores
# printed below are reproducible.
# NOTE(review): the RandomForest grid-search output recorded in this notebook
# shows max_depth=10, while 8 is used here - confirm which is intended.
rf_model = RandomForestClassifier(criterion='entropy',
                                  max_depth=8,
                                  max_features='sqrt',
                                  n_estimators=500,
                                  random_state=0,
                                  verbose=False)
cat_model = CatBoostClassifier(depth=10,
                               l2_leaf_reg=7,
                               learning_rate=0.03,
                               random_seed=0,
                               verbose=False)
xgb_model = XGBClassifier(gamma=0,
                          learning_rate=0.5,
                          max_depth=5,
                          n_estimators=80,
                          reg_alpha=0.1,
                          reg_lambda=0.1,
                          random_state=0,
                          verbosity=0)

# Model fitting
rf_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)

# Evaluate the estimators fitted in THIS cell. The earlier `models` list still
# holds the baseline estimators, so iterating over it would report the
# untuned models instead of the tuned ones.
tuned_models = [rf_model, cat_model, xgb_model]

# Hold-out evaluation: accuracy on the predicted labels, ROC AUC on the
# predicted probability of the class encoded as 1.
print('TEST RESULTS')
for m in tuned_models:
    y_pred = m.predict(X_test)
    ac = accuracy_score(y_test, y_pred)
    rocauc = roc_auc_score(y_test, m.predict_proba(X_test)[:, 1])
    print('model_' + type(m).__name__ + ' -> Accuracy : ' + str(ac) + ' ROCAUC :' + str(rocauc))
    
    

In [None]:
# The string below is a hand-pasted record of an earlier run's test scores; it
# is a bare expression and has no effect at run time.
# NOTE(review): re-check these figures against a fresh run before relying on
# the "BEST" label.
'''
TEST RESULTS
model_RandomForestClassifier -> Accuracy : 0.9836065573770492 ROCAUC :0.9968922033584999
model_CatBoostClassifier -> Accuracy : 0.9859484777517564 ROCAUC :0.9977413596899134
BEST -> model_XGBClassifier -> Accuracy : 0.9882903981264637 ROCAUC :0.9985327502844962
'''

# Persist the XGBoost model. The `with` block guarantees the file is flushed
# and closed once the pickle has been written.
with open('classification_model.pkl', 'wb') as model_file:
    pickle.dump(xgb_model, model_file)
