In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

import numpy as np
from sklearn.model_selection import train_test_split

#Transforming
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

#cluster and Eval
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from sklearn.manifold import TSNE
from sklearn import decomposition

#Visualization
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.svm import LinearSVC

In [2]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_excel("loan_data_final.xlsx")

In [4]:
X = df.drop(['loan_status'],axis = 1)
y = df['loan_status']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

X_train.shape, X_test.shape
cols = X_train.columns

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])

# 1. SVM

In [6]:
from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
svc=SVC() 
svc.fit(X_train,y_train)
y_pred_svc=svc.predict(X_test)

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_svc)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_svc,labels = [1,0]))
print(metrics.classification_report(y_test,y_pred_svc))

Model accuracy score : 0.9205
Confusion metrics :
 [[6382  119]
 [ 487  637]]
              precision    recall  f1-score   support

           0       0.84      0.57      0.68      1124
           1       0.93      0.98      0.95      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.77      0.82      7625
weighted avg       0.92      0.92      0.91      7625



## Parameters tuning

In [22]:
from sklearn.model_selection import GridSearchCV

svc = SVC()

scorer = make_scorer(f1_score)
param_grid = {'C': [0.1,1, 10, 100]}
svc_turn_cv = GridSearchCV(estimator=svc,
                           param_grid=param_grid,
                           scoring=scorer,
                           cv=10)
svc_turn_cv.fit(X_train, y_train)



# Print hyperparameter
print("Tuned hyperparameter parameters: {}".format(svc_turn_cv.best_params_)) 
print("Best cross-validation score: {}".format(svc_turn_cv.best_score_))

Tuned hyperparameter parameters: {'C': 1}
Best cross-validation score: 0.9580234657023574


In [23]:
from sklearn.model_selection import GridSearchCV

svc = SVC()

scorer = make_scorer(f1_score)
param_grid = {'gamma': [1,0.1,0.01,0.001,0.0001]}
svc_turn_cv = GridSearchCV(estimator=svc,
                           param_grid=param_grid,
                           scoring=scorer,
                           cv=10)
svc_turn_cv.fit(X_train, y_train)



# Print hyperparameter
print("Tuned hyperparameter parameters: {}".format(svc_turn_cv.best_params_)) 
print("Best cross-validation score: {}".format(svc_turn_cv.best_score_))

Tuned hyperparameter parameters: {'gamma': 0.0001}
Best cross-validation score: 0.9588306609768507


In [24]:
from sklearn.model_selection import GridSearchCV

svc = SVC()

scorer = make_scorer(f1_score)
param_grid = {'kernel': ['rbf', 'poly', 'sigmoid']}
svc_turn_cv = GridSearchCV(estimator=svc,
                           param_grid=param_grid,
                           scoring=scorer,
                           cv=10)
svc_turn_cv.fit(X_train, y_train)



# Print hyperparameter
print("Tuned hyperparameter parameters: {}".format(svc_turn_cv.best_params_)) 
print("Best cross-validation score: {}".format(svc_turn_cv.best_score_))

Tuned hyperparameter parameters: {'kernel': 'rbf'}
Best cross-validation score: 0.9580234657023574


In [25]:
from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
svc=SVC(C = 1, gamma=0.0001,kernel='rbf') 
svc.fit(X_train,y_train)
y_pred_svc=svc.predict(X_test)

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_svc)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_svc,labels = [1,0]))
print(metrics.classification_report(y_test,y_pred_svc))

Model accuracy score : 0.9216
Confusion metrics :
 [[6376  125]
 [ 473  651]]
              precision    recall  f1-score   support

           0       0.84      0.58      0.69      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.88      0.78      0.82      7625
weighted avg       0.92      0.92      0.92      7625



## Final SVM model

# 2. Logistics Regression

In [8]:
lg = LogisticRegression(random_state=0)
lg.fit(X_train,y_train)
y_pred_lg = lg.predict(X_test)

print(metrics.classification_report(y_test,y_pred_lg))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_lg,labels = [1,0]))
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_test, y_pred_lg)))

              precision    recall  f1-score   support

           0       0.84      0.58      0.68      1124
           1       0.93      0.98      0.95      6501

    accuracy                           0.92      7625
   macro avg       0.88      0.78      0.82      7625
weighted avg       0.92      0.92      0.91      7625

Confusion metrics :
 [[6376  125]
 [ 476  648]]
Model accuracy score with default hyperparameters: 0.9212


In [9]:
print(lg)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [None]:
# import sklearn

# sklearn.metrics.SCORERS.keys()

## Parameters tuning

In [10]:
from sklearn.model_selection import GridSearchCV
lg = LogisticRegression()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10,100],
              'solver': ['liblinear', 'lbfgs', 'sag', 'saga','newton-cg'],
              'penalty':['l1','l2','elasticnet','none']}

from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score

# scorer = make_scorer(f1_score, pos_label = 0)
scorer = make_scorer(f1_score)
lg_tuning = GridSearchCV(estimator=lg,
                         param_grid=param_grid,
                         scoring=scorer,
                         cv=10)
lg_tuning.fit(X_train, y_train)



# Print hyperparameter
print("Tuned hyperparameter parameters: {}".format(lg_tuning.best_params_)) 
print("Best cross-validation score: {}".format(lg_tuning.best_score_))

Tuned hyperparameter parameters: {'C': 0.001, 'penalty': 'none', 'solver': 'sag'}
Best cross-validation score: 0.958897282291819


### Final LG mdoel

In [11]:
from sklearn import metrics
lg = LogisticRegression(C = 0.001, solver = 'sag', penalty = 'none')
lg.fit(X_train,y_train)
y_pred_lg = lg.predict(X_test)

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_lg)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_lg,labels = [1,0]))
print(metrics.classification_report(y_test,y_pred_lg))

Model accuracy score : 0.9212
Confusion metrics :
 [[6376  125]
 [ 476  648]]
              precision    recall  f1-score   support

           0       0.84      0.58      0.68      1124
           1       0.93      0.98      0.95      6501

    accuracy                           0.92      7625
   macro avg       0.88      0.78      0.82      7625
weighted avg       0.92      0.92      0.91      7625



# 3. Decision tree

In [12]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred_dt = dt.predict(X_test)

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_dt)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_dt))
print(metrics.classification_report(y_test,y_pred_dt))

Model accuracy score : 0.8620
Confusion metrics :
 [[ 638  486]
 [ 566 5935]]
              precision    recall  f1-score   support

           0       0.53      0.57      0.55      1124
           1       0.92      0.91      0.92      6501

    accuracy                           0.86      7625
   macro avg       0.73      0.74      0.73      7625
weighted avg       0.87      0.86      0.86      7625



In [13]:
print(dt)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


## Parameter tuning

In [19]:
from sklearn.model_selection import GridSearchCV
dt = DecisionTreeClassifier()
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth' : [2,4,6,8,10,12,14]}

from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score


scorer = make_scorer(f1_score)
dt_tuning = GridSearchCV(estimator=dt,
                         param_grid=param_grid,
                         scoring=scorer,
                         cv=10)
dt_tuning.fit(X_train, y_train)



# Print hyperparameter
print("Tuned hyperparameter parameters: {}".format(dt_tuning.best_params_)) 
print("Best cross-validation score: {}".format(dt_tuning.best_score_))



Tuned hyperparameter parameters: {'criterion': 'gini', 'max_depth': 2}
Best cross-validation score: 0.9588306609768507


## Final DT model

In [20]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion='gini',max_depth=2)
dt.fit(X_train,y_train)
y_pred_dt = dt.predict(X_test)

print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_dt)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_dt))
print(metrics.classification_report(y_test,y_pred_dt))

Model accuracy score : 0.9217
Confusion metrics :
 [[ 650  474]
 [ 123 6378]]
              precision    recall  f1-score   support

           0       0.84      0.58      0.69      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.78      0.82      7625
weighted avg       0.92      0.92      0.92      7625

