In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
loans = pd.read_csv('Assignment2Data.csv', low_memory=False, header=0)
#loans = pd.read_csv("./Assignment2Data.csv", low_memory=False, header=0)
loans.head(n=10)

Unnamed: 0,acc_now_delinq,acc_open_past_24mths,all_util,annual_inc,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,collections_12_mths_ex_med,delinq_amnt,...,total_cu_tl,total_il_high_credit_limit,total_pymnt,total_pymnt_inv,total_rec_int,total_rec_late_fee,total_rec_prncp,total_rev_hi_lim,delinq_2yrs,bad_loans
0,0.0,3.0,1.0,52000.0,25099.0,30359.0,0.5,0.0,0.0,0.0,...,2.0,0.0,3011.577285,3011.58,11.58,0.0,3000.0,31000.0,0.0,0
1,0.0,2.0,47.0,55000.0,2222.0,10551.0,52.0,0.0,0.0,0.0,...,1.0,27751.0,5013.306667,5013.31,13.31,0.0,5000.0,33800.0,0.0,0
2,0.0,3.0,86.0,51000.0,4541.0,2498.0,0.1,0.0,0.0,0.0,...,0.0,24001.0,1032.903619,1032.9,32.9,0.0,1000.0,2500.0,0.0,0
3,0.0,2.0,41.0,62000.0,18520.0,18310.0,51.2,0.0,0.0,0.0,...,0.0,21976.0,20009.805556,19984.79,9.81,0.0,20000.0,39500.0,0.0,0
4,0.0,11.0,60.0,30000.0,5990.0,7898.0,7.1,0.0,0.0,0.0,...,5.0,62319.0,24334.736554,24334.74,334.74,0.0,24000.0,12000.0,0.0,0
5,0.0,3.0,32.0,58000.0,16129.0,10282.0,6.5,0.0,0.0,0.0,...,0.0,68112.0,265.85,265.3,223.63,15.0,27.22,11000.0,1.0,1
6,0.0,2.0,57.0,48000.0,2855.0,7459.0,25.4,0.0,0.0,0.0,...,0.0,27697.0,0.0,0.0,0.0,0.0,0.0,12200.0,1.0,1
7,0.0,21.0,32.0,186500.0,2443.0,107477.0,18.8,0.0,0.0,0.0,...,0.0,57714.0,20117.246667,20117.25,117.25,0.0,20000.0,139400.0,0.0,0
8,0.0,7.0,58.0,12000.0,48449.0,100.0,96.0,0.0,0.0,0.0,...,0.0,0.0,15424.086919,15424.09,424.09,0.0,15000.0,12400.0,1.0,0
9,0.0,4.0,74.0,54000.0,11336.0,6635.0,44.7,0.0,0.0,0.0,...,3.0,34050.0,2505.296389,2505.3,5.3,0.0,2500.0,12000.0,0.0,0


In [3]:
# Check how many good and bad loans there are
loans['bad_loans'].value_counts()

0    2516
1     476
Name: bad_loans, dtype: int64

### Oversample Bad Loans so that it is proportionate

In [4]:
# Class count
count_class_0, count_class_1 = loans['bad_loans'].value_counts()

# Divide by class
df_class_0 = loans[loans['bad_loans'] == 0]
df_class_1 = loans[loans['bad_loans'] == 1]

df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=1234)

# Address unblanced data issue: Oversampling on data samples with rare outputs (which is '1' in this example)
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=1234)
loans = pd.concat([df_class_1_over, df_class_0], axis=0)

loans['bad_loans'].value_counts()

1    2516
0    2516
Name: bad_loans, dtype: int64

### Split Data 80/20

In [5]:
X = loans.drop('bad_loans', axis=1) # Here no need to set inplace=True
Y = loans['bad_loans']

In [6]:
validation_size = 0.2
seed = 1234
X_train_noscale, X_test_noscale, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [7]:
# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_noscale)
X_test= scaler.transform(X_test_noscale) #changes numpy. change back to df

X_train = pd.DataFrame(X_train, index=X_train_noscale.index, columns=X_train_noscale.columns)
X_test = pd.DataFrame(X_test, index=X_test_noscale.index, columns=X_test_noscale.columns)

## Feature Selection ##

Recursive Feature Elimination (RFE)

In [8]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', max_iter=1000) #fail to converge even when max_iter set to 1000
rfe = RFE(model, 15)
fit = rfe.fit(X_train, Y_train)


In [9]:
from itertools import compress
cols = list(compress(X.columns, fit.support_))
print(cols)

['installment', 'int_rate', 'last_pymnt_amnt', 'loan_amnt', 'out_prncp', 'out_prncp_inv', 'tot_cur_bal', 'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bal_il', 'total_pymnt', 'total_pymnt_inv', 'total_rec_int', 'total_rec_prncp', 'total_rev_hi_lim']


In [10]:
X_train2 = X_train[cols]
X_test2 = X_test[cols]

## Model Selection ##

Test out different model + gridsearchCV

1) Logistic Regression

2) Decision Tree

3) XGBoost Classifier

4) Random Forest Classfier 

5) Voting Classfier

In [11]:
def get_results(predictions):
    print("Accuracy score is: " + str(accuracy_score(Y_test, predictions)))
    #print("F1_score is: " + str(f1_score(Y_test, predictions)))
    print("MCC: ", matthews_corrcoef(Y_test, predictions))
    print("Confusion Matrix")
    print(confusion_matrix(Y_test, predictions))
    return

## Logistic Regression

In [12]:
#Logistic Regression
def logisticRegression(X_train, Y_train, X_test, Y_test):
    lg = LogisticRegression(random_state=seed, solver='liblinear')
    lg.fit(X_train, Y_train)
    predictions = lg.predict(X_test)
    get_results(predictions)
    return

logisticRegression(X_train2, Y_train, X_test2, Y_test)

Accuracy score is: 0.9225422045680238
MCC:  0.8523832281345636
Confusion Matrix
[[430  72]
 [  6 499]]


In [13]:
#Based on GridSearchCV, these params gave higher accuracy.
LR_clf = LogisticRegression(random_state=seed, solver='liblinear', C=27825.59402207126, penalty='l2' )
LR_model = LR_clf.fit(X_train, Y_train)
predictions = LR_model.predict(X_test)
get_results(predictions)

Accuracy score is: 0.9255213505461768
MCC:  0.8547341363965946
Confusion Matrix
[[441  61]
 [ 14 491]]


## Decision Tree

In [14]:
#Decision Tree
def decisionTree(X_train, Y_train, X_test, Y_test):
    dtc = DecisionTreeClassifier(random_state=seed)
    dtc.fit(X_train, Y_train)
    predictions = dtc.predict(X_test)
    get_results(predictions)
    return
#Run Basic Decision Tree without any cross validation or parameter tuning
decisionTree(X_train2, Y_train, X_test2, Y_test)

Accuracy score is: 0.9602780536246276
MCC:  0.9229015676621813
Confusion Matrix
[[464  38]
 [  2 503]]


In [15]:
#Based on GridSearchCV, these params gave higher accuracy. Do not run the gridsearchcv code above, takes quite some time!!

DT_clf = DecisionTreeClassifier(random_state=seed, max_depth = 8, min_samples_split= 0.1, \
                             min_samples_leaf= 1, criterion='gini')
DT_model = DT_clf.fit(X_train2, Y_train)
predictions = DT_model.predict(X_test2)
get_results(predictions)

Accuracy score is: 0.9285004965243296
MCC:  0.861216954887906
Confusion Matrix
[[441  61]
 [ 11 494]]


## XGBClassifier

In [16]:
##XGBOOST

def xgBoost(X_train, Y_train, X_test, Y_test):
    xgbc = XGBClassifier(random_state=seed)
    xgbc.fit(X_train, Y_train)
    predictions = xgbc.predict(X_test)
    get_results(predictions)
    return

xgBoost(X_train2, Y_train, X_test2, Y_test)

Accuracy score is: 0.9404170804369414
MCC:  0.8862962540527672
Confusion Matrix
[[444  58]
 [  2 503]]


In [17]:
XGB_clf = XGBClassifier(random_state=seed, max_depth = 5 , learning_rate = 0.1, n_estimators = 400)
XGB_model = XGB_clf.fit(X_train2, Y_train)

In [18]:
predictions = XGB_model.predict(X_test2)
get_results(predictions)

Accuracy score is: 0.9622641509433962
MCC:  0.9271561981544995
Confusion Matrix
[[464  38]
 [  0 505]]


## Random Forest

In [19]:
def randomForest(X_train, Y_train, X_test, Y_test):
    rf = RandomForestClassifier(random_state=seed)
    rf.fit(X_train, Y_train)
    predictions = rf.predict(X_test)
    get_results(predictions)
    return

randomForest(X_train2, Y_train, X_test2, Y_test)

Accuracy score is: 0.9652432969215492
MCC:  0.9316251595798188
Confusion Matrix
[[472  30]
 [  5 500]]




In [20]:
RF_clf = RandomForestClassifier(random_state=seed, max_depth = 9 , min_samples_leaf=1, min_samples_split=2)
RF_model = RF_clf.fit(X_train2, Y_train)



In [21]:
predictions = RF_model.predict(X_test2)
get_results(predictions)

Accuracy score is: 0.9443892750744787
MCC:  0.893901287703283
Confusion Matrix
[[447  55]
 [  1 504]]


## Voting Classifier

In [22]:
#Voting Classifier
vote_clf = VotingClassifier(estimators=[('lr', LR_clf), ('dt', DT_clf), ('xgb', XGB_clf), ('rf', RF_clf)], voting='hard')
vote_model = vote_clf.fit(X_train2, Y_train)

In [23]:
predictions = vote_model.predict(X_test2)
get_results(predictions)

Accuracy score is: 0.9433962264150944
MCC:  0.8917131407946488
Confusion Matrix
[[447  55]
 [  2 503]]


## Conclusion

From all the 5 models, we see that Random Forest gives the highest accuracy and mcc values. It is not surprising as it is an ensemble method, which often reduces variance of the model and increase the stability of our model.

Also, the accuracy and mcc values were already very high to begin with, making it hard to tune the model and  hence minimal improvements were observed after tuning