In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, KFold

from sklearn.metrics import recall_score, roc_auc_score, f1_score
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix


In [9]:
df_selected = pd.read_csv("CleanData.csv")
df_selected.drop("Unnamed: 0", axis=1)


Unnamed: 0,Patron_Salary,Automobile_Possession,Two-Wheeler_Ownership,Ongoing_Borrowing,Residence_Proprietorship,Offspring_Number,Loan_Capital,Borrowing_Periodic_Payment,Customer_Revenue_Category,Patron_Academic_Qualification,...,Customer_Vocation,Patron_Kin_Count,Customer_Urban_Area_Ranking,Sort_of_Institution,Rating_Origin_1,Rating_Origin_2,Rating_Origin_3,Community_Non-Payment_Incidence,Solvency_Information_Agency,Default
0,6750.0,0.0,0.0,1.0,0.0,0.0,61190.55,3416.85,0,4,...,14,2.0,2.0,42,0.568066,0.478787,0.746300,0.0186,0.0,0
1,18000.0,0.0,0.0,1.0,0.0,1.0,59527.35,2788.20,4,1,...,13,2.0,2.0,42,0.270675,0.552795,0.329655,0.0742,0.0,0
2,15750.0,0.0,0.0,1.0,1.0,0.0,53870.40,2295.45,3,4,...,8,2.0,3.0,57,0.270675,0.135182,0.631355,0.0856,3.0,0
3,11250.0,0.0,1.0,1.0,1.0,1.0,13752.00,653.85,4,4,...,8,2.0,2.0,33,0.270675,0.697928,0.420611,0.0639,0.0,0
4,13500.0,0.0,0.0,1.0,1.0,0.0,60415.20,3097.80,3,4,...,8,2.0,2.0,57,0.711468,0.657508,0.549597,0.0856,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79060,10350.0,0.0,1.0,0.0,0.0,0.0,18792.90,1736.55,4,1,...,14,1.0,2.0,42,0.162760,0.621042,0.746300,0.3340,0.0,0
79061,12150.0,0.0,0.0,1.0,0.0,0.0,78192.00,2383.65,3,4,...,8,1.0,2.0,57,0.270675,0.678249,0.283712,0.0515,2.0,0
79062,8100.0,0.0,1.0,0.0,1.0,1.0,55107.90,2989.35,1,4,...,6,3.0,3.0,50,0.169049,0.048079,0.746300,0.0856,0.0,0
79063,38250.0,1.0,1.0,0.0,1.0,0.0,45000.00,2719.35,4,0,...,14,2.0,2.0,5,0.182737,0.103538,0.077499,0.0979,2.0,0


#### Class Balance

In [40]:
df_selected.Default.value_counts(normalize=True)


0    0.914728
1    0.085272
Name: Default, dtype: float64

Default Class is highly imbalanced. Upsample minority class or downsample the majority class.

#### Unsampling

In [44]:
#separate classes into two data frames: 1. df_major and 2. df_minor
df_major = df_selected[df_selected.Default == 0]
df_minor = df_selected[df_selected.Default == 1]


In [45]:
df_minor_upsmapled = resample(df_minor, replace = True, n_samples = 358436, random_state = 2018)

In [46]:
df_minor_upsmapled = pd.concat([df_minor_upsmapled, df_major])


In [48]:
df_minor_upsmapled.Default.value_counts()


1    358436
0     72323
Name: Default, dtype: int64

#### Evaluate the model

In [None]:
def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model:   {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

#### Standard Scaling

In [49]:
X = df_minor_upsmapled.drop('Default', axis = 1)
Y = df_minor_upsmapled.Default


#Splitting into test and train set
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=0)
mms = StandardScaler()
mms.fit(xtrain)
xtrain_scaled = mms.transform(xtrain)

### Logistic regression (LR) model

In [50]:
logisticRegr = LogisticRegression()

In [51]:
logisticRegr.fit(xtrain_scaled, ytrain)

In [52]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [53]:
xtest_scaled = mms.transform(xtest)
lr_pred = logisticRegr.predict(xtest_scaled)

In [54]:
evaluate_model(ytest, lr_pred)

Accuracy of the model: 0.8331692821989043

Classification report: 
              precision    recall  f1-score   support

           0       0.56      0.05      0.09     18177
           1       0.84      0.99      0.91     89513

    accuracy                           0.83    107690
   macro avg       0.70      0.52      0.50    107690
weighted avg       0.79      0.83      0.77    107690


Confusion matrix: 
[[  942 17235]
 [  731 88782]]



The accuracy of the model is 83 percent which is fairly good. We'll try another algo to improve the performance

In our final dataset, almost 60% of our features are categorical. Therefore, a tree-based model may be a better choice. Lets implement Random forest. 

#### Random forest (RF) model

In [61]:
# Defining the hyperparameters
def random_forest(xtrain, xtest, ytrain):
    rf_params = {
        'n_estimators': 126, 
        'max_depth': 14
    }

    rf = RandomForestClassifier(**rf_params)
    rf.fit(xtrain, ytrain)
    rfpred = rf.predict(xtest)
    rfpred_proba = rf.predict_proba(xtest)
    
    return rfpred, rfpred_proba

In [62]:
rfpred, rfpred_proba = random_forest(xtrain_scaled, xtest_scaled, ytrain)

In [63]:
evaluate_model(ytest, rfpred, rfpred_proba)

ROC-AUC score of the model:   0.9510978461195292
Accuracy of the model: 0.8905747980313864

Classification report: 
              precision    recall  f1-score   support

           0       1.00      0.35      0.52     18177
           1       0.88      1.00      0.94     89513

    accuracy                           0.89    107690
   macro avg       0.94      0.68      0.73    107690
weighted avg       0.90      0.89      0.87    107690


Confusion matrix: 
[[ 6393 11784]
 [    0 89513]]



Random forest does better. Almost 6% accuracy jump from logistic regression. Proves that tree-based models perform well on categorical data.