In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import mglearn
%matplotlib inline

In [2]:
import os
path = 'E:\Python Consumer Credit'
os.chdir(path)

In [3]:
df_tr=pd.read_csv('df__over_tr.csv',header=None)
df_ts=pd.read_csv('df__over_ts.csv',header=None)

In [4]:
X_train = df_tr.iloc[:,:-1].values
y_train = df_tr.iloc[:,-1].values
X_test = df_ts.iloc[:,:-1].values
y_test = df_ts.iloc[:,-1].values
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(10172801, 12)
(2543201, 12)
(10172801,)
(2543201,)


## Classification algorithms

#### Random Forest Classifier

In [5]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(n_estimators=17, criterion='entropy').fit(X_train,y_train)
y_tr_rf_pred = rfc.predict(X_train)
y_ts_rf_pred = rfc.predict(X_test)

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score
rf_cm_tr = confusion_matrix(y_train,y_tr_rf_pred)
print(rf_cm_tr)
accuracy_score(y_train, y_tr_rf_pred)

[[5066392   20473]
 [  23457 5062479]]


0.9956816220036153

In [7]:
rf_cm_ts = confusion_matrix(y_test,y_ts_rf_pred)
print(rf_cm_ts)
accuracy_score(y_test, y_ts_rf_pred)

[[1075392  195744]
 [ 191444 1080621]]


0.847755643380134

## Applying k-fold Cross Validation

In [8]:
from sklearn.model_selection import KFold, cross_val_score
accuracies = cross_val_score(estimator = rfc, X = X_train, y = y_train, cv = 5)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard deviation: {:.2f} %'.format(accuracies.std()*100))

Accuracy: 82.62 %
Standard deviation: 0.02 %


In [9]:
accuracies

array([0.82643184, 0.82601054, 0.82636786, 0.82588471, 0.82617716])

## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': list(range(1,21)), 'max_features': ['auto', 'sqrt', 'log2'],
               'max_depth' : ['None',8], 'criterion' :['gini', 'entropy']}]
grid_search = GridSearchCV(estimator = rfc, 
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best parameters: ',best_parameters)

Best accuracy: 68.85 %
Best parameters:  {'criterion': 'gini', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 17}


## Metrics

In [9]:
from sklearn.metrics import roc_auc_score, jaccard_score, f1_score, precision_score, recall_score
print(roc_auc_score(y_test,y_ts_rf_pred))
print(jaccard_score(y_test,y_ts_rf_pred))
print(f1_score(y_test,y_ts_rf_pred))
print(precision_score(y_test,y_ts_rf_pred))
print(recall_score(y_test,y_ts_rf_pred))

0.8477550054432512
0.7362136354253176
0.848068026196521
0.8466394800860255
0.8495014012648725
