In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)

In [2]:
# !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [3]:
loan = pd.read_csv('/content/drive/My Drive/Loan Defaulters/cleaned_loan.csv')

In [4]:
loan.dtypes

loan_amnt               float64
term                      int64
int_rate                float64
installment             float64
grade                    object
emp_length              float64
home_ownership           object
verification_status      object
loan_status               int64
purpose                  object
dti                     float64
earliest_cr_line          int64
open_acc                float64
pub_rec                 float64
revol_util              float64
total_acc               float64
initial_list_status      object
application_type         object
pub_rec_bankruptcies    float64
fico_average            float64
log_annual_inc          float64
log_revol_bal           float64
dtype: object

In [5]:
loan['home_ownership'] = loan['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')

In [6]:
loan['dti'] = loan['dti'] + 1

In [7]:
loan['log_dti'] = loan['dti'].apply(lambda x : np.log10(x+1))
loan.drop('dti',axis = 1,inplace = True)
loan.drop('pub_rec_bankruptcies',axis = 1,inplace = True)

In [8]:
loan['emp_length'].fillna(10,inplace=True)

In [9]:
# Conveting certain columns to str to be encoded ordinally
def str_con(s):
  if type(s) != str:
    return str(s)
  else:
    return s

loan['term'] = loan['term'].apply(lambda s : str_con(s))
loan['emp_length'] = loan['emp_length'].apply(lambda s : str_con(s))
#loan['pub_rec_bankruptcies'] = loan['pub_rec_bankruptcies'].apply(lambda s : str_con(s))
loan['pub_rec'] = loan['pub_rec'].apply(lambda s : str_con(s))


In [10]:
loan['earliest_cr_year'] = 2015  - loan['earliest_cr_line']
loan.drop('earliest_cr_line',axis = 1,inplace = True)

In [11]:

# categorical encoding and getting dummy variables
dummies = pd.get_dummies(loan[['verification_status', 'application_type','initial_list_status','purpose','grade','home_ownership']], drop_first=True)

loan = loan.drop(['verification_status', 'application_type','initial_list_status','purpose','grade','home_ownership'],axis=1)

loan = pd.concat([loan,dummies],axis=1)

In [12]:
col = ['revol_util','log_dti']
value_revol = loan['revol_util'].median()
value_dti = loan['log_dti'].median()
loan['revol_util'].fillna(value_revol,inplace = True)
loan['log_dti'].fillna(value_dti,inplace = True)

In [13]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder,LabelEncoder
ord_col = ['term','emp_length','pub_rec']
ord_encode = OrdinalEncoder()
ord_encode = ord_encode.fit(loan[ord_col])
loan[ord_col]=ord_encode.transform(loan[ord_col])

label_encode = LabelEncoder()
label_encode = label_encode.fit(loan['loan_status'])
loan['loan_status']=label_encode.transform(loan['loan_status'])

In [14]:
num_col = ['loan_amnt', 'int_rate', 'installment', 
        'open_acc', 'revol_util', 'total_acc',
        'fico_average', 'log_annual_inc',
       'log_revol_bal', 'log_dti', 'earliest_cr_year']

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder,StandardScaler,OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from numpy import absolute
from numpy import mean
from numpy import std

In [16]:
trans = StandardScaler()
loan[num_col] = trans.fit_transform(loan[num_col])

In [17]:
X =  loan.drop('loan_status',axis = 1)
Y = loan['loan_status']
X_train, X_test, y_train, y_test =train_test_split(X,Y,
                                                   test_size=0.30,
                                                   )

In [18]:
lg = LogisticRegression()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
gb = GradientBoostingClassifier()


def base_model(name,model):
  print(name)
  model.fit(X_train,y_train)
  prediction = model.predict(X_test)
  print(classification_report(y_test,prediction))
  print(accuracy_score(y_test,prediction))
  print(confusion_matrix(y_test,prediction))

In [23]:
base_model('logistic reg',lg)
print('.'*1000)
base_model('Decision tree',dt)
print('.'*1000)
base_model('Random forest',rf)
print('.'*1000)
base_model('Extra tree',et)
print('.'*1000)
base_model('Gradient boost',gb)

logistic reg
              precision    recall  f1-score   support

           0       0.56      0.03      0.05      2622
           1       0.85      1.00      0.92     14272

    accuracy                           0.85     16894
   macro avg       0.70      0.51      0.48     16894
weighted avg       0.80      0.85      0.78     16894

0.8456848585296555
[[   69  2553]
 [   54 14218]]
..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [24]:
!pip install Imbalanced-Learn



In [25]:
def base_model_sampler(name,model):
  print(name)
  model.fit(X_res,y_res)
  prediction = model.predict(X_test)
  print(classification_report(y_test,prediction))
  print(accuracy_score(y_test,prediction))
  print(confusion_matrix(y_test,prediction))
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler(0.7)
X_res,y_res = os.fit_sample(X_train,y_train)
base_model('Decision tree',dt)
print('.'*1000)
base_model('Random forest',rf)
print('.'*1000)
base_model('Extra tree',et)
print('.'*1000)
base_model('Gradient boost',gb)
print('.'*1000)
base_model('logistic reg',lg)

Decision tree
              precision    recall  f1-score   support

           0       0.23      0.26      0.24      2622
           1       0.86      0.84      0.85     14272

    accuracy                           0.75     16894
   macro avg       0.54      0.55      0.55     16894
weighted avg       0.76      0.75      0.76     16894

0.7486089736000947
[[  684  1938]
 [ 2309 11963]]
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

In [27]:
from imblearn.combine import SMOTETomek

In [None]:
os = SMOTETomek(0.75)
X_res,y_res = os.fit_sample(X_train,y_train)
def base_model_sampler(name,model):
  print(name)
  model.fit(X_res,y_res)
  prediction = model.predict(X_test)
  print(classification_report(y_test,prediction))
  print(accuracy_score(y_test,prediction))
  print(confusion_matrix(y_test,prediction))
base_model('Decision tree',dt)
base_model('Random forest',rf)
base_model('Extra tree',et)
base_model('Gradient boost',gb)
base_model('logistic reg',lg)


Decision tree
              precision    recall  f1-score   support

           0       0.22      0.24      0.23      2633
           1       0.86      0.84      0.85     14261

    accuracy                           0.75     16894
   macro avg       0.54      0.54      0.54     16894
weighted avg       0.76      0.75      0.75     16894

0.7457677281875222
[[  639  1994]
 [ 2301 11960]]
Random forest
              precision    recall  f1-score   support

           0       0.55      0.04      0.08      2633
           1       0.85      0.99      0.92     14261

    accuracy                           0.85     16894
   macro avg       0.70      0.52      0.50     16894
weighted avg       0.80      0.85      0.79     16894

0.845329702853084
[[  111  2522]
 [   91 14170]]
Extra tree
              precision    recall  f1-score   support

           0       0.43      0.06      0.10      2633
           1       0.85      0.99      0.91     14261

    accuracy                           0.84 

In [None]:
os = SMOTETomek(0.75)
X_res,y_res = os.fit_sample(X_train,y_train)
from sklearn.model_selection import RandomizedSearchCV
criterion = ['gini','entropy']
max_features = ['auto','sqrt','log2']
max_depth = [int(x) for x in np.linspace(10,1000,10)]
min_samples_split = [2,5,8,10,14]
min_samples_leaf = [1,2,4,6,8]
random_grid = {'criterion': criterion,'max_features':max_features,
               'max_depth': max_depth,'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf}
print(random_grid)

dt = DecisionTreeClassifier()
dt_randomcv = RandomizedSearchCV(estimator = dt ,param_distributions = random_grid,
                                 n_iter = 100,cv= 5,verbose = 2,random_state = 100,n_jobs = -1)
dt_randomcv.fit(X_res,y_res)
# model = DecisionTreeClassifier(criterion= 'gini',
#                                max_features = 'sqrt',min_samples_leaf =10,
#                                random_state = 100).fit(X_train,y_train)
prediction = dt_randomcv.predict(X_test)
print(classification_report(y_test,prediction))
confusion_matrix(y_test,prediction)

{'criterion': ['gini', 'entropy'], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 120, 230, 340, 450, 560, 670, 780, 890, 1000], 'min_samples_split': [2, 5, 8, 10, 14], 'min_samples_leaf': [1, 2, 4, 6, 8]}
Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  1.4min finished


              precision    recall  f1-score   support

           0       0.23      0.30      0.26      2633
           1       0.86      0.81      0.84     14261

    accuracy                           0.73     16894
   macro avg       0.55      0.56      0.55     16894
weighted avg       0.76      0.73      0.75     16894



array([[  790,  1843],
       [ 2682, 11579]])

In [29]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
prediction = log_reg.predict(X_test)
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.56      0.03      0.05      2622
           1       0.85      1.00      0.92     14272

    accuracy                           0.85     16894
   macro avg       0.70      0.51      0.48     16894
weighted avg       0.80      0.85      0.78     16894

0.8456848585296555
[[   69  2553]
 [   54 14218]]


In [34]:
os = SMOTETomek(0.85)
X_res,y_res = os.fit_sample(X_train,y_train)

In [35]:
log_reg = LogisticRegression()
log_reg.fit(X_res,y_res)
prediction = log_reg.predict(X_test)
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.27      0.60      0.37      2622
           1       0.91      0.70      0.79     14272

    accuracy                           0.69     16894
   macro avg       0.59      0.65      0.58     16894
weighted avg       0.81      0.69      0.73     16894

0.6857464188469279
[[ 1583  1039]
 [ 4270 10002]]


In [46]:
log_reg = LogisticRegression(penalty='l1', C=0.001, 
                             solver='liblinear', max_iter=100,  n_jobs=-1)
log_reg.fit(X_res,y_res)
prediction = log_reg.predict(X_test)
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.27      0.56      0.36      2622
           1       0.90      0.72      0.80     14272

    accuracy                           0.69     16894
   macro avg       0.58      0.64      0.58     16894
weighted avg       0.80      0.69      0.73     16894

0.6928495323783592
[[ 1458  1164]
 [ 4025 10247]]


In [49]:
param_grid = [
    {
     'penalty' : ['l1', 'l2'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['liblinear']}
]

# Create grid search object
log_reg = GridSearchCV(LogisticRegression(), param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)
# Fit on data
best_log = log_reg.fit(X_res, y_res)
prediction = best_log.predict(X_test)
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(confusion_matrix(y_test,prediction))

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.8min finished


              precision    recall  f1-score   support

           0       0.27      0.60      0.37      2622
           1       0.90      0.70      0.79     14272

    accuracy                           0.68     16894
   macro avg       0.59      0.65      0.58     16894
weighted avg       0.81      0.68      0.72     16894

0.6835562921747366
[[1568 1054]
 [4292 9980]]


In [50]:
log_reg = GridSearchCV(cv=None,
             estimator=LogisticRegression(C=1.0, intercept_scaling=1,   
               dual=False, fit_intercept=True, penalty='l2', tol=0.0001),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]})
best_log = log_reg.fit(X_res, y_res)
prediction = best_log.predict(X_test)
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.27      0.60      0.37      2622
           1       0.91      0.70      0.79     14272

    accuracy                           0.69     16894
   macro avg       0.59      0.65      0.58     16894
weighted avg       0.81      0.69      0.73     16894

0.6857464188469279
[[ 1583  1039]
 [ 4270 10002]]


In [51]:
print('The best parameters after hyperparameter tuning')
best_log.best_params_

The best parameters after hyperparameter tuning


{'C': 1}

In [52]:
print('Best estimators')
best_log.best_estimator_

Best estimators


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [53]:
# We maintain this model with accuracy 68% because it provides the highest recall and 
# also does well in giving small False negatives
new_log = best_log.best_estimator_
prediction = new_log.predict(X_test)
print(classification_report(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(confusion_matrix(y_test,prediction))

              precision    recall  f1-score   support

           0       0.27      0.60      0.37      2622
           1       0.91      0.70      0.79     14272

    accuracy                           0.69     16894
   macro avg       0.59      0.65      0.58     16894
weighted avg       0.81      0.69      0.73     16894

0.6857464188469279
[[ 1583  1039]
 [ 4270 10002]]
