In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import mglearn
%matplotlib inline

In [3]:
import os
path = 'E:\Python Consumer Credit'
os.chdir(path)

In [4]:
df_tr=pd.read_csv('df__under_tr.csv',header=None)
df_ts=pd.read_csv('df__under_ts.csv',header=None)

In [5]:
X_train = df_tr.iloc[:,:-1].values
y_train = df_tr.iloc[:,-1].values
X_test = df_ts.iloc[:,:-1].values
y_test = df_ts.iloc[:,-1].values
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(557088, 14)
(139272, 14)
(557088,)
(139272,)


#### XGBoost Classifier

In [6]:
from xgboost import XGBClassifier
xgb = XGBClassifier(colsample_bytree=0.7, learning_rate=0.05, max_depth=8, min_child_weight=11, n_estimators=1000, objective='binary:logistic', subsample=0.8, tree_method='exact').fit(X_train,y_train)
y_tr_xgb_pred = xgb.predict(X_train)
y_ts_xgb_pred = xgb.predict(X_test)

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score
xgb_cm_tr = confusion_matrix(y_train,y_tr_xgb_pred)
print(xgb_cm_tr)
accuracy_score(y_train, y_tr_xgb_pred)

[[193432  85390]
 [ 84080 194186]]


0.6957931242460796

In [8]:
xgb_cm_ts = confusion_matrix(y_test,y_ts_xgb_pred)
print(xgb_cm_ts)
accuracy_score(y_test, y_ts_xgb_pred)

[[46586 22772]
 [22851 47063]]


0.672418002182779

## Applying k-fold Cross Validation

In [8]:
from sklearn.model_selection import KFold, cross_val_score
accuracies = cross_val_score(estimator = xgb, X = X_train, y = y_train, cv = 5)
print('Accuracy: {:.2f} %'.format(accuracies.mean()*100))
print('Standard deviation: {:.2f} %'.format(accuracies.std()*100))

Accuracy: 66.42 %
Standard deviation: 0.11 %


In [9]:
accuracies

array([0.66368091, 0.66276544, 0.66599652, 0.66464723, 0.66401896])

## Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators': [1000], #number of trees, change it to 1000 for better results
               'max_depth': [6,7,8],
               'learning_rate': [0.05], #so called `eta` value
               'objective':['binary:logistic'],
               'tree_method':['exact'],
               'min_child_weight': [11],
               'subsample': [0.8],
               'colsample_bytree': [0.7]}]
grid_search = GridSearchCV(estimator = xgb, 
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           n_jobs = -1)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print('Best accuracy: {:.2f} %'.format(best_accuracy*100))
print('Best parameters: ',best_parameters)

Best accuracy: 67.04 %
Best parameters:  {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 8, 'min_child_weight': 11, 'n_estimators': 1000, 'objective': 'binary:logistic', 'subsample': 0.8, 'tree_method': 'exact'}


## Metrics

In [9]:
from sklearn.metrics import roc_auc_score, jaccard_score, f1_score, precision_score, recall_score
print(roc_auc_score(y_test,y_ts_xgb_pred))
print(jaccard_score(y_test,y_ts_xgb_pred))
print(f1_score(y_test,y_ts_xgb_pred))
print(precision_score(y_test,y_ts_xgb_pred))
print(recall_score(y_test,y_ts_xgb_pred))

0.6724150457864839
0.5077681634766847
0.6735361254821144
0.6739170902842414
0.6731555911548474
