### Performance Criteria:
- Runtime
- Model Accuracy

Using a clean dataset which has been used for other xgboost models. The data relates to bank loan defaulters.

We will first look at binary classification models. 

In [30]:
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

from statistics import mean

import pandas as pd
import os
import time

In [46]:
# Reading the data

path_to_data = os.getenv('Documents') + 'XGBoost Comparison Study/Datasets/'

train_data = pd.read_csv(path_to_data + 'train_clean.csv')
test_data = pd.read_csv(path_to_data + 'test_clean.csv') # May not need this

In [5]:
# Working with train_data as has target variable (Loan Status)

train_data.head()

Unnamed: 0,Loan Amount,Funded Amount,Funded Amount Investor,Term,Interest Rate,Home Ownership,Debit to Income,Open Account,Revolving Balance,Revolving Utilities,...,Public Record_4,Delinquency - two years_0,Delinquency - two years_1,Delinquency - two years_2,Delinquency - two years_3,Delinquency - two years_4,Delinquency - two years_5,Delinquency - two years_6,Delinquency - two years_7,Delinquency - two years_8
0,10000,32236,12329.36286,59,11.135007,176346.6267,16.284758,13,24246,74.932551,...,0,0,1,0,0,0,0,0,0,0
1,3609,11940,12191.99692,59,12.237563,39833.921,15.412409,12,812,78.297186,...,0,1,0,0,0,0,0,0,0,0
2,28276,9311,21603.22455,59,12.545884,91506.69105,28.137619,14,1843,2.07304,...,0,1,0,0,0,0,0,0,0,0
3,11170,6954,17877.15585,59,16.731201,108286.5759,18.04373,7,13819,67.467951,...,0,0,1,0,0,0,0,0,0,0
4,16890,13226,13539.92667,59,15.0083,44234.82545,17.209886,13,1544,85.250761,...,0,0,1,0,0,0,0,0,0,0


In [6]:
X = train_data.drop(['Loan Status'], axis=1)
y = train_data['Loan Status']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

### Scikit-Learn Wrapper

In [7]:
# Scikit-Learn Wrapper

# Parameters 
scikit_xgb = xgb.XGBClassifier()  
scikit_xgb.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [8]:
%%time
# Fitting the model

model_1 = scikit_xgb.fit(X_train, y_train, eval_metric='auc')



Wall time: 15.4 s


In [18]:
%%time
# Predictions 

predictions_scikit = model_1.predict_proba(X_test)
predictions_scikit

Wall time: 36.3 ms


array([[0.92695266, 0.07304732],
       [0.8921554 , 0.10784459],
       [0.9165391 , 0.0834609 ],
       ...,
       [0.9226985 , 0.07730149],
       [0.9052887 , 0.09471128],
       [0.93470305, 0.06529693]], dtype=float32)

In [17]:
%%time
# Evaluation
defaulter_prob = predictions_scikit[:,1]
roc_auc_score(y_test, defaulter_prob)

Wall time: 14.6 ms


0.5102747061038507

### Global Configuration

In [19]:
%%time
# Global Configuration

params = scikit_xgb.get_params()
params['eval_metric'] = 'auc'

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

eval_list = [(dtrain, 'train'), (dtest, 'test')]

Wall time: 227 ms


In [20]:
%%time
# Fitting the model

del params['enable_categorical']
del params['missing']
del params['n_estimators']
del params['use_label_encoder']

model_2 = xgb.train(params=params, dtrain=eval_list[0][0], num_boost_round=100) # Consistant with wrapper

Wall time: 14 s


In [21]:
%%time 
# Predictions 

predictions_glob = model_2.predict(eval_list[1][0])
predictions_glob

Wall time: 13.2 ms


array([0.07304732, 0.10784459, 0.0834609 , ..., 0.07730149, 0.09471128,
       0.06529693], dtype=float32)

In [22]:
%%time
# Evaluation

roc_auc_score(dtest.get_label(), predictions_glob)

Wall time: 18 ms


0.5102747061038507

In [23]:
# Exact same roc_auc_score for Global Configuration and Scikit-Learn Wrapper

### Runtime

In [24]:
# Scikit-Learn Wrapper

scikit_runtime = []

for i in range(10):
    
    start_time = time.time()
    
    model_scikit = scikit_xgb.fit(X_train, y_train, eval_metric='auc')
    
    scikit_runtime.append(time.time() - start_time)



NameError: name 'scikit_rumtime' is not defined

In [31]:
print('Average train time:', mean(scikit_runtime))

Average train time: 14.810705661773682


In [33]:
# Global Configuration

glob_runtime = []

for i in range(10):
    
    start_time = time.time()
    
    model_glob = xgb.train(params=params, dtrain=eval_list[0][0], num_boost_round=100)
    
    glob_runtime.append(time.time() - start_time)

In [34]:
print('Average train time:', mean(glob_runtime))

Average train time: 9.228725481033326


In [None]:
# With the global wrapper, there is a noticable performance difference

### Cross Validation

In [48]:
# Scikit-Learn 

from sklearn.model_selection import KFold

X = train_data.drop(['Loan Status'], axis=1)
y = train_data['Loan Status']

# With ten folds
start_time = time.time()

kf = KFold(n_splits=5)

roc_auc_scores = []

for train_index, test_index in kf.split(X, y):
    
    X_train= X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index] 
    y_test = y.iloc[test_index]
    
    model_scikit = scikit_xgb.fit(X_train, y_train, eval_metric='auc')
    
    predictions_scikit = model_1.predict_proba(X_test)
    
    defaulter_prob = predictions_scikit[:,1]
    roc_auc_scores.append(roc_auc_score(y_test, defaulter_prob))
    
end_time = time.time() - start_time

print('Runtime:', end_time)
print('Average roc_auc_score:', mean(roc_auc_scores))



Runtime: 40.08089208602905
Average roc_auc_score: 0.5138134349877176


In [51]:
# Global Wrapper 

start_time = time.time()

eval_hist = xgb.cv(params=params, dtrain=eval_list[0][0], num_boost_round=100, nfold=5, folds=kf, metrics='auc',
                  verbose_eval=True)

print('Runtime:', time.time()- start_time)

[0]	train-auc:0.54510+0.00533	test-auc:0.51584+0.00590
[1]	train-auc:0.56852+0.00560	test-auc:0.51970+0.00486
[2]	train-auc:0.58603+0.00618	test-auc:0.51654+0.00240
[3]	train-auc:0.60203+0.00753	test-auc:0.51718+0.00186
[4]	train-auc:0.62407+0.00531	test-auc:0.51772+0.00526
[5]	train-auc:0.64081+0.00475	test-auc:0.51835+0.00526
[6]	train-auc:0.66143+0.00309	test-auc:0.51559+0.00500
[7]	train-auc:0.67937+0.00408	test-auc:0.51747+0.00569
[8]	train-auc:0.69174+0.00469	test-auc:0.52085+0.00478
[9]	train-auc:0.70393+0.00359	test-auc:0.51863+0.00664
[10]	train-auc:0.71653+0.00396	test-auc:0.52051+0.00426
[11]	train-auc:0.72703+0.00534	test-auc:0.52052+0.00502
[12]	train-auc:0.73672+0.00515	test-auc:0.51853+0.00375
[13]	train-auc:0.74595+0.00445	test-auc:0.51929+0.00498
[14]	train-auc:0.75225+0.00317	test-auc:0.51838+0.00610
[15]	train-auc:0.75825+0.00418	test-auc:0.51773+0.00666
[16]	train-auc:0.76546+0.00603	test-auc:0.51840+0.00727
[17]	train-auc:0.77037+0.00633	test-auc:0.51795+0.00705
[1

In [52]:
eval_hist

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.545101,0.005327,0.515843,0.005905
1,0.568523,0.005596,0.519700,0.004861
2,0.586026,0.006183,0.516539,0.002396
3,0.602032,0.007535,0.517180,0.001864
4,0.624073,0.005309,0.517724,0.005257
...,...,...,...,...
95,0.949178,0.005881,0.520957,0.005833
96,0.950207,0.005757,0.520499,0.005519
97,0.950806,0.005264,0.520698,0.005540
98,0.951524,0.005313,0.520267,0.005266


In [53]:
# Stratified 

# Scikit-Learn Wrapper

from sklearn.model_selection import StratifiedKFold

start_time = time.time()
roc_auc_scores =[]

skf = StratifiedKFold(n_splits=5)

for train_index, test_index in skf.split(X, y):
    
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]
    
    model_scikit = scikit_xgb.fit(X_train, y_train, eval_metric='auc')
    
    predictions_scikit = model_1.predict_proba(X_test)
    
    defaulter_prob = predictions_scikit[:,1]
    roc_auc_scores.append(roc_auc_score(y_test, defaulter_prob))
    
end_time = time.time() - start_time

print('Runtime:', end_time)
print('Average roc_auc_score:', mean(roc_auc_scores))



Runtime: 44.095080614089966
Average roc_auc_score: 0.5126130282462368


In [54]:
# Global Wrapper 

start_time = time.time()

eval_hist = xgb.cv(params=params, dtrain=eval_list[0][0], num_boost_round=100, nfold=5, folds=kf, metrics='auc', 
                   stratified=True, verbose_eval=True)

print('Runtime:', time.time()- start_time)

[0]	train-auc:0.54510+0.00533	test-auc:0.51584+0.00590
[1]	train-auc:0.56852+0.00560	test-auc:0.51970+0.00486
[2]	train-auc:0.58603+0.00618	test-auc:0.51654+0.00240
[3]	train-auc:0.60203+0.00753	test-auc:0.51718+0.00186
[4]	train-auc:0.62407+0.00531	test-auc:0.51772+0.00526
[5]	train-auc:0.64081+0.00475	test-auc:0.51835+0.00526
[6]	train-auc:0.66143+0.00309	test-auc:0.51559+0.00500
[7]	train-auc:0.67937+0.00408	test-auc:0.51747+0.00569
[8]	train-auc:0.69174+0.00469	test-auc:0.52085+0.00478
[9]	train-auc:0.70393+0.00359	test-auc:0.51863+0.00664
[10]	train-auc:0.71653+0.00396	test-auc:0.52051+0.00426
[11]	train-auc:0.72703+0.00534	test-auc:0.52052+0.00502
[12]	train-auc:0.73672+0.00515	test-auc:0.51853+0.00375
[13]	train-auc:0.74595+0.00445	test-auc:0.51929+0.00498
[14]	train-auc:0.75225+0.00317	test-auc:0.51838+0.00610
[15]	train-auc:0.75825+0.00418	test-auc:0.51773+0.00666
[16]	train-auc:0.76546+0.00603	test-auc:0.51840+0.00727
[17]	train-auc:0.77037+0.00633	test-auc:0.51795+0.00705
[1