In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import mglearn
%matplotlib inline

In [2]:
import os
path = 'E:\Python Consumer Credit'
os.chdir(path)

In [3]:
df_tr=pd.read_csv('df__under_tr.csv',header=None)
df_ts=pd.read_csv('df__under_ts.csv',header=None)

In [4]:
X_train = df_tr.iloc[:,:-1].values
y_train = df_tr.iloc[:,-1].values
X_test = df_ts.iloc[:,:-1].values
y_test = df_ts.iloc[:,-1].values
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(557088, 14)
(139272, 14)
(557088,)
(139272,)


## Classification algorithms

#### Random Forest Classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(criterion='gini', max_depth=8, max_features='log2', n_estimators=20).fit(X_train,y_train)
y_tr_rf_pred = rfc.predict(X_train)
y_ts_rf_pred = rfc.predict(X_test)

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score
rf_cm_tr = confusion_matrix(y_train,y_tr_rf_pred)
print(rf_cm_tr)
accuracy_score(y_train, y_tr_rf_pred)

[[167365 111457]
 [ 84010 194256]]


0.6491272474007698

In [7]:
rf_cm_ts = confusion_matrix(y_test,y_ts_rf_pred)
print(rf_cm_ts)
accuracy_score(y_test, y_ts_rf_pred)

[[41459 27899]
 [21427 48487]]


0.6458297432362571

## Applying k-fold Cross Validation

In [8]:
from sklearn.model_selection import KFold, cross_val_score
accuracies = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 5)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard deviation: {:.2f} %'.format(accuracies.std()*100))

Accuracy: 62.33 %
Standard deviation: 0.14 %


In [9]:
accuracies

array([0.62481825, 0.62079736, 0.62448617, 0.62378273, 0.62277749])

## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': list(range(1,21)), 'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth' : ['None',8], 'criterion' :['gini', 'entropy']}]
grid_search = GridSearchCV(estimator = rfc, 
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best parameters: ',best_parameters)

Best accuracy: 64.73 %
Best parameters:  {'criterion': 'gini', 'max_depth': 8, 'max_features': 'log2', 'n_estimators': 20}


## Metrics

In [8]:
from sklearn.metrics import roc_auc_score, jaccard_score, f1_score, precision_score, recall_score
print(roc_auc_score(y_test,y_ts_rf_pred))
print(jaccard_score(y_test,y_ts_rf_pred))
print(f1_score(y_test,y_ts_rf_pred))
print(precision_score(y_test,y_ts_rf_pred))
print(recall_score(y_test,y_ts_rf_pred))

0.6456385777396858
0.4957112040321839
0.6628434723171565
0.6347629146702275
0.6935234716937952
