# 3: Build an XGBoost

In [1]:
import pickle
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.sparse
import os

# import keras
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential, Model 
from keras.layers import Dense, Embedding, Conv1D, MaxPooling1D, GlobalMaxPool1D, Flatten, Input, concatenate, Dropout, Activation, regularizers, BatchNormalization

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix

print('TF version:',tf.__version__)


Using TensorFlow backend.


TF version: 1.15.0


In [36]:
cwd = os.getcwd()
print(cwd)
os.chdir(cwd)

C:\Users\brizio\Documents\PythonNB\FICOchallenge


## Load prepped data
- Normal Scaling of numeric variables (prep_option = 1)
- Binning (following Rudin) and one hot encoding (prep_option = 2)
- Binning and applying WOE, calculating WOE on Rudin's bins (prep_option = 3)
- Binning and applying WOE, following Rudin (prep_option = 4)

In [1]:
prep_option = 4

In [62]:
if prep_option == 1:
    data_path = "Data/Scaled_data.csv"
if prep_option == 2:
    data_path = "Data/Bin_Encoded_data_v2.csv"
if prep_option == 3:
    data_path = "Data/WOE_data.csv"
if prep_option == 4:
    data_path = "Data/WOE_Rud_data.csv"
    
ori_path = "Data/heloc_dataset_v1.csv"

CLASS = 'RiskPerformance' 

data = pd.read_csv(ori_path)
X1 = pd.read_csv(data_path)
y = pd.read_csv("Data/y_data.csv")

print('Target: Bad (y=1)')
class_names = sorted(y[CLASS].unique(),  reverse=True)
print(y[CLASS].value_counts())
y_onehot = pd.get_dummies(y[CLASS])[['Bad']]
print(np.array(np.unique(y_onehot, return_counts=True)).T)

print('X shape:',X1.shape)
X1.head()

Target: Bad (y=1)
Bad     5459
Good    5000
Name: RiskPerformance, dtype: int64
[[   0 5000]
 [   1 5459]]
X shape: (10459, 23)


Unnamed: 0,ExternalRiskEstimate_bin_WOE,MSinceOldestTradeOpen_bin_WOE,MSinceMostRecentTradeOpen_bin_WOE,AverageMInFile_bin_WOE,NumSatisfactoryTrades_bin_WOE,NumTrades60Ever2DerogPubRec_bin_WOE,NumTrades90Ever2DerogPubRec_bin_WOE,NumTotalTrades_bin_WOE,NumTradesOpeninLast12M_bin_WOE,PercentTradesNeverDelq_bin_WOE,...,PercentInstallTrades_bin_WOE,NetFractionInstallBurden_bin_WOE,NumInstallTradesWBalance_bin_WOE,MSinceMostRecentInqexcl7days_bin_WOE,NumInqLast6M_bin_WOE,NumInqLast6Mexcl7days_bin_WOE,NetFractionRevolvingBurden_bin_WOE,NumRevolvingTradesWBalance_bin_WOE,NumBank2NatlTradesWHighUtilization_bin_WOE,PercentTradesWBalance_bin_WOE
0,1.799,0.086,0.083,0.269,0.166,0.952,-0.021,-0.097,-0.021,1.012,...,-0.503,0.047,0.242,1.223,-0.047,-0.051,-0.088,0.034,-0.601,-0.13
1,1.799,0.549,0.083,1.238,1.999,0.952,-0.053,0.535,-0.021,-0.147,...,0.161,0.047,0.256,1.223,-0.047,-0.051,-0.739,-0.188,0.601,-0.982
2,1.017,0.549,0.083,1.238,0.539,-0.021,-0.021,0.535,0.428,-0.147,...,-0.503,0.147,0.242,1.223,0.17,0.021,0.633,-0.263,-0.601,0.203
3,1.017,0.086,0.083,0.269,-0.086,-0.021,-0.021,-0.377,0.287,0.366,...,-0.145,0.363,0.313,1.223,0.471,0.021,0.633,-0.15,0.541,0.772
4,-1.094,-0.148,0.0,0.0,0.539,-0.021,-0.021,0.116,-0.021,-0.147,...,-0.62,0.363,0.242,1.223,-0.047,-0.051,0.633,-0.188,-0.601,0.203


### XGBoost

In [74]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

xgb = XGBClassifier()
xgb.fit(X_onehot_train, y_onehot_train)
# make predictions for test data
y_pred = xgb.predict(X_onehot_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_onehot_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 69.87%


In [None]:
# Using a random forest to optimize
# import scikit_optimize as skopt
from skopt import forest_minimize

def tune_xgbc(params):
# Implementation learned on a lesson of Mario Filho (Kagle Grandmaster) for parametes optmization.
# Link to the video: https://www.youtube.com/watch?v=WhnkeasZNHI

    """Function to be passed as scikit-optimize minimizer/maximizer input

    Parameters:
    Tuples with information about the range that the optimizer should use for that parameter, 
    as well as the behaviour that it should follow in that range.

    Returns:
    float: the metric that should be minimized. If the objective is maximization, then the negative 
    of the desired metric must be returned. In this case, the negative AUC average generated by CV is returned.
    """


    #Hyperparameters to be optimized
    print(params)
    learning_rate = params[0] 
    n_estimators = params[1] 
    max_depth = params[2]
    min_child_weight = params[3]
    gamma = params[4]
    subsample = params[5]
    colsample_bytree = params[6]


    #Model to be optimized
    mdl = XGBClassifier(learning_rate = learning_rate, 
                            n_estimators = n_estimators, 
                            max_depth = max_depth, 
                            min_child_weight = min_child_weight, 
                            gamma = gamma, 
                            subsample = subsample, 
                            colsample_bytree = colsample_bytree, seed = 42)


    #Cross-Validation in order to avoid overfitting
    auc = cross_val_score(mdl, X_onehot_train, y_onehot_train, cv = 10, scoring = 'roc_auc')

    print(auc.mean())
    # as the function is minimization (forest_minimize), we need to use the negative of the desired metric (AUC)
    return -auc.mean()

# Creating a sample space in which the initial randomic search should be performed
space = [(1e-3, 1e-1, 'log-uniform'), # learning rate
          (100, 2000), # n_estimators
          (1, 10), # max_depth 
          (1, 6.), # min_child_weight 
          (0, 0.5), # gamma 
          (0.5, 1.), # subsample 
          (0.5, 1.)] # colsample_bytree 

# Minimization using a random forest with 20 random samples and 50 iterations for Bayesian optimization.
result = forest_minimize(tune_xgbc, space, random_state = 42, n_random_starts = 20, n_calls  = 25, verbose = 1)

In [84]:
result.x

[0.0026587543983272693,
 1315,
 5,
 4.087407548138583,
 0.3058265802441405,
 0.5035331526098588,
 0.5115312125207079]

In [88]:
xgb1 = XGBClassifier(learning_rate = result.x[0], 
                     n_estimators = result.x[1], 
                     max_depth = result.x[2], 
                     min_child_weight = result.x[3], 
                     gamma = result.x[4], 
                     subsample = result.x[5], 
                     colsample_bytree = result.x[6], 
                     seed = 42)


xgb1.fit(X_onehot_train, y_onehot_train)
# make predictions for test data
y_pred = xgb1.predict(X_onehot_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_onehot_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 72.12%


In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

CLASS = 'RiskPerformance'

# Split X and y
X = data.drop(columns=[CLASS])
y = data[CLASS]
y_onehot = pd.get_dummies(y)[['Bad']]

np.array(np.unique(y_onehot, return_counts=True)).T
X = X.astype('float32')
y = LabelEncoder().fit_transform(y_onehot['Bad'].astype('int8'))

X_onehot_train, X_onehot_test, y_onehot_train, y_onehot_test, = \
train_test_split(X, y, test_size = .25, random_state = 2020, shuffle = True)
print(X_onehot_train.shape, X_onehot_test.shape, y_onehot_train.shape, y_onehot_test.shape)

RS = RobustScaler()
scaledX = RS.fit_transform(X_onehot_train)
scaledX = pd.DataFrame(X_onehot_train, columns = X_onehot_train.columns)
colList = X_onehot_train.columns

scaledX_test = pd.DataFrame(RS.transform(X_onehot_test), columns = X_onehot_test.columns)

params = {"verbosity":0,
          "nthread":-1,
          "seed":1,
          "booster":"gbtree",
          "lambda":1,
          "alpha":0,
          "learning_rate":0.0085,
          "gamma":0.642,
          "max_depth":16,
          "min_child_weight":5,
          "max_delta_step":2,
          "subsample":0.374,
          "colsample_bytree":0.8280000000000001,
          "colsample_bylevel":1,
          "scale_pos_weight":1,
          "process_type":"default",
          "tree_method":"auto",
          "objective":"binary:logistic",
          "eval_metric":'auc'}

import scipy.sparse

dtrain = xgb.DMatrix(scipy.sparse.csc_matrix(scaledX.to_numpy()), label = y_onehot_train, feature_names = colList)

dtest  = xgb.DMatrix(scipy.sparse.csc_matrix(scaledX_test.to_numpy()), label = y_onehot_test, feature_names = colList)

# evallist = (dtrain, 'train')
xg_reg = xgb.train(params, dtrain, 750,  [(dtrain,'train'),(dtest,'test')])

In [120]:
# make predictions for test data
y_pred = xg_reg.predict(dtest)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_onehot_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 53.23%


In [None]:
# Using a random forest to optimize
# import scikit_optimize as skopt
from skopt import forest_minimize

def tune_xgbc(params):
# Implementation learned on a lesson of Mario Filho (Kagle Grandmaster) for parametes optmization.
# Link to the video: https://www.youtube.com/watch?v=WhnkeasZNHI

    """Function to be passed as scikit-optimize minimizer/maximizer input

    Parameters:
    Tuples with information about the range that the optimizer should use for that parameter, 
    as well as the behaviour that it should follow in that range.

    Returns:
    float: the metric that should be minimized. If the objective is maximization, then the negative 
    of the desired metric must be returned. In this case, the negative AUC average generated by CV is returned.
    """


    #Hyperparameters to be optimized
    print(params)
    learning_rate = params[0] 
    gamma = params[1] 
    max_depth = params[2]
    min_child_weight = params[3]
    max_delta_step = params[4]
    subsample = params[5]
    colsample_bytree = params[6]
    rounds = params[7]

    params1 = {"silent":1,
              "nthread":-1,
              "seed":1,
              "booster":"gbtree",
              "lambda":1,
              "alpha":0,
              "learning_rate":learning_rate,
              "gamma":gamma,
              "max_depth":max_depth,
              "min_child_weight":min_child_weight,
              "max_delta_step":max_delta_step,
              "subsample":subsample,
              "colsample_bytree":colsample_bytree,
              "colsample_bylevel":1,
              "scale_pos_weight":1,
              "process_type":"default",
              "tree_method":"auto",
              "objective":"binary:logistic",
              "eval_metric":'auc'}


    #Model to be optimized
    mdl = xgb.train(params1, dtrain, rounds,  [(dtrain,'train'),(dtest,'test')])


    #Cross-Validation in order to avoid overfitting
    auc = xgb.cv(params1, dtrain, nfold  = 10, metrics = 'auc')

    print(auc.mean()[2])
    # as the function is minimization (forest_minimize), we need to use the negative of the desired metric (AUC)
    return -auc.mean()[2]

# Creating a sample space in which the initial randomic search should be performed
space = [(1e-3, 1e-1, 'log-uniform'), # learning rate
          (0, 0.5), # gamma
          (1, 30), # max_depth 
          (1, 6.), # min_child_weight 
          (0, 5), # max_delta_step 
          (0.5, 1.), # subsample 
          (0.5, 1.), # colsample_bytree 
          (10,1000)] # rounds

# Minimization using a random forest with 20 random samples and 50 iterations for Bayesian optimization.
result = forest_minimize(tune_xgbc, space, random_state = 42, n_random_starts = 20, n_calls  = 25, verbose = 1)

In [121]:
result.x

[0.04947535032796275,
 0.27261958736380937,
 6,
 3.064296866484532,
 4,
 0.6680047054202385,
 0.9831620375498387,
 156]

In [122]:
learning_rate = result.x[0] 
gamma = result.x[1] 
max_depth = result.x[2]
min_child_weight = result.x[3]
max_delta_step = result.x[4]
subsample = result.x[5]
colsample_bytree = result.x[6]
rounds = result.x[7]

params1 = {"silent":1,
          "nthread":-1,
          "seed":1,
          "booster":"gbtree",
          "lambda":1,
          "alpha":0,
          "learning_rate":learning_rate,
          "gamma":gamma,
          "max_depth":max_depth,
          "min_child_weight":min_child_weight,
          "max_delta_step":max_delta_step,
          "subsample":subsample,
          "colsample_bytree":colsample_bytree,
          "colsample_bylevel":1,
          "scale_pos_weight":1,
          "process_type":"default",
          "tree_method":"auto",
          "objective":"binary:logistic",
          "eval_metric":'auc'}

mdl = xgb.train(params1, dtrain, rounds,  [(dtrain,'train'),(dtest,'test')])

# make predictions for test data
y_pred = mdl.predict(dtest)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_onehot_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	train-auc:0.79507	test-auc:0.50277
[1]	train-auc:0.80696	test-auc:0.51860
[2]	train-auc:0.81502	test-auc:0.51386
[3]	train-auc:0.81821	test-auc:0.53876
[4]	train-auc:0.82074	test-auc:0.55240
[5]	train-auc:0.82293	test-auc:0.55316
[6]	train-auc:0.82524	test-auc:0.54122
[7]	train-auc:0.82687	test-auc:0.53702
[8]	train-auc:0.82805	test-auc:0.53702
[9]	train-auc:0.82877	test-auc:0.52892
[10]	train-auc:0.83034	test-auc:0.53534
[11]	train-auc:0.83231	test-auc:0.53307
[12]	train-auc:0.83368	test-auc:0.53607
[13]	train-auc:0.83436	test-auc:0.53100
[14]	train-auc:0.83565	test-auc:0.56098
[15]	train-auc:0.83632	test-auc:0.56276
[16]	train-auc:0.83734	test-auc:0.57935
[17]	train-auc:0.83751	test-auc:0.57944
[18]	train-auc:0.83832	test-auc:0.55937
[19]	train-auc:0.83939	test-auc:0.56898
[20]	train-auc:0.84054	test-auc:0.56722
[21]	train-auc:0.84147	test-auc:0.57966
[22]	train-auc:0.84176	test-auc:0.57997
[23]	train-auc:0.84302	test-auc:0.55693
[24]	train-auc:0.84416	test-auc:0.55703
[25]	train