# Gradient Boosting and XGBoost

I will be creating models using Sklearn's Gradient Boost, and the XGBoost algorithm.

In [79]:
# Importing Required Packages.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings('ignore')

# Training labels
training_labels = pd.read_csv('../../Data/training_set_labels.csv', index_col='respondent_id')
training_features = pd.read_csv('../../Data/training_set_features.csv', index_col='respondent_id')

In [80]:
def metrics(y_test, _preds):
    print('accuracy: {:0.3f}'.format(accuracy_score(y_test, _preds)))
    print('recall: {:0.3f}'.format(recall_score(y_test, _preds)))
    print('f1: {:0.3f}'.format(f1_score(y_test, _preds)))
    print('roc_auc: {:0.3f}'.format(roc_auc_score(y_test , _preds)))

In [81]:
# TTS
X_train, X_test, y_train, y_test = train_test_split(training_features, training_labels['h1n1_vaccine'], test_size=0.33, random_state=42)

## Preproccessing
For the preproccessing, all of the columns are categorical, however, some of them are numerical, and some of them are strings. We will want to handle these these columns differently when imputing missing values.

- Numerical Categories
    - Use Sklearn's Iterative Imputer to fill in the missing values
- String Categories
    - Fill missing values with a new value: 'unknown'
    - One hot encode the results
- Categories with more then 10 unique categories
    - We will frequency code these instead, so we don't have an overwhelming amount of columns in the dataframe.

In [82]:
#training_features = training_features.select_dtypes(exclude='object')

In [83]:
training_features

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [84]:
# Preproccessing columns
num_cols = []
ohe_cols = []
freq_cols = []

# Seperate columns into numerical, categorical, and freq

for c in training_features.columns:
    if training_features[c].dtype in ['float64', 'int64']:
        num_cols.append(c)
    elif training_features[c].nunique() < 10:
        ohe_cols.append(c)
    else:
        freq_cols.append(c)


In [85]:
# Fill NaN values using IterativeImputer
num_transformer = Pipeline(steps=[
    ('num_imputer', IterativeImputer(max_iter=15)),
])

# Onehot Encoding transformer for Categorical variable
ohe_transformer = Pipeline(steps=[
    ('ohe_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('oh_encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Transformer for categories with more then 10 unique values
freq_transformer = Pipeline(steps=[
    ('freq_encoder', ce.count.CountEncoder(normalize=True, min_group_size=.05)),
    ('freq_imputer', SimpleImputer(strategy='constant', fill_value=0))
])


In [86]:
# Combine transformers into preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('ohe', ohe_transformer, ohe_cols),
        ('freq', freq_transformer, freq_cols)
    ])

# Sklearn Gradient Boost
Here I will create a baseline gradient boost model to compare future models too.

In [75]:
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gb_clf', GradientBoostingClassifier())
])

In [76]:
# Cross validate test
cross_validate(clf, X_train, y_train)

{'fit_time': array([5.4752152 , 4.23772097, 4.47239089, 4.14246488, 3.9379859 ]),
 'score_time': array([0.1433959 , 0.08711386, 0.09352803, 0.11599827, 0.07763314]),
 'test_score': array([0.85247276, 0.84828164, 0.8505169 , 0.8471213 , 0.85187255])}

In [77]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
clf.fit(X_train, y_train)
_preds = clf.predict(X_test)

metrics(y_test, _preds)

accuracy: 0.851
recall: 0.474
f1: 0.575
roc_auc: 0.713


Without Objects:
accuracy: 0.853
recall: 0.489
f1: 0.585
roc_auc: 0.720

With Objects:

accuracy: 0.854
recall: 0.486
f1: 0.585
roc_auc: 0.719

## XGBoost
Let's try a baseline model for XGBoost as well.


In [54]:
XG_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gb_clf', XGBClassifier(eval_metric='auc'))
])


In [55]:
cross_validate(XG_clf, X_train, y_train)

{'fit_time': array([16.52609086, 18.68650603, 10.141397  ,  8.20269108,  8.62241793]),
 'score_time': array([0.52228022, 0.16040587, 0.17363405, 0.19290066, 0.15759015]),
 'test_score': array([0.85079631, 0.84353171, 0.84353171, 0.84041364, 0.84684181])}

In [56]:
XG_clf.fit(X_train, y_train)
boost_preds = XG_clf.predict(X_test)
metrics(y_test, boost_preds)

accuracy: 0.845
recall: 0.513
f1: 0.584
roc_auc: 0.724


# Catboost
Finally, since all of our data is categorical, I'd like to try catboost as well.

In [95]:
cat_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost_clf', CatBoostClassifier())
])

In [89]:
cat_clf.fit(X_train, y_train)

Learning rate set to 0.035304
0:	learn: 0.6659079	total: 74ms	remaining: 1m 13s
1:	learn: 0.6413337	total: 96.8ms	remaining: 48.3s
2:	learn: 0.6223101	total: 125ms	remaining: 41.5s
3:	learn: 0.6016191	total: 164ms	remaining: 40.8s
4:	learn: 0.5824863	total: 182ms	remaining: 36.2s
5:	learn: 0.5650356	total: 194ms	remaining: 32.2s
6:	learn: 0.5496686	total: 206ms	remaining: 29.2s
7:	learn: 0.5346099	total: 218ms	remaining: 27.1s
8:	learn: 0.5221328	total: 236ms	remaining: 26s
9:	learn: 0.5096483	total: 256ms	remaining: 25.3s
10:	learn: 0.4979666	total: 272ms	remaining: 24.4s
11:	learn: 0.4877183	total: 297ms	remaining: 24.5s
12:	learn: 0.4780516	total: 323ms	remaining: 24.6s
13:	learn: 0.4690322	total: 353ms	remaining: 24.8s
14:	learn: 0.4616690	total: 431ms	remaining: 28.3s
15:	learn: 0.4543440	total: 498ms	remaining: 30.6s
16:	learn: 0.4483254	total: 516ms	remaining: 29.9s
17:	learn: 0.4424012	total: 551ms	remaining: 30s
18:	learn: 0.4366204	total: 572ms	remaining: 29.5s
19:	learn: 0.4

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   IterativeImputer(max_iter=15))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                 

In [90]:
metrics(y_test, cat_clf.predict(X_test))

accuracy: 0.853
recall: 0.504
f1: 0.593
roc_auc: 0.726


Catboost did about as well as XGboost did, but the biggest thing I noticed is that it took 1/4 of the amount of time to train. I think this would be a much better model type to use going forward.

## Catboost Tuning

In [96]:
# Create pipleline with tuned params
tuned_cat_clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Changing the eval metric from "logloss" to "AUC" and modifying the learning rate
    ('catboost_clf', CatBoostClassifier(learning_rate=0.03,
    eval_metric='AUC'))
])

In [98]:
# Fit the new model
tuned_cat_clf.fit(X_train, y_train)

0:	total: 25.7ms	remaining: 25.6s
1:	total: 55.2ms	remaining: 27.5s
2:	total: 84.3ms	remaining: 28s
3:	total: 120ms	remaining: 30s
4:	total: 146ms	remaining: 29.1s
5:	total: 178ms	remaining: 29.4s
6:	total: 215ms	remaining: 30.5s
7:	total: 245ms	remaining: 30.4s
8:	total: 264ms	remaining: 29.1s
9:	total: 293ms	remaining: 29s
10:	total: 317ms	remaining: 28.5s
11:	total: 351ms	remaining: 28.9s
12:	total: 378ms	remaining: 28.7s
13:	total: 391ms	remaining: 27.6s
14:	total: 425ms	remaining: 27.9s
15:	total: 483ms	remaining: 29.7s
16:	total: 530ms	remaining: 30.6s
17:	total: 552ms	remaining: 30.1s
18:	total: 584ms	remaining: 30.2s
19:	total: 619ms	remaining: 30.3s
20:	total: 659ms	remaining: 30.7s
21:	total: 733ms	remaining: 32.6s
22:	total: 780ms	remaining: 33.1s
23:	total: 812ms	remaining: 33s
24:	total: 827ms	remaining: 32.3s
25:	total: 843ms	remaining: 31.6s
26:	total: 875ms	remaining: 31.5s
27:	total: 896ms	remaining: 31.1s
28:	total: 911ms	remaining: 30.5s
29:	total: 930ms	remaining: 3

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   IterativeImputer(max_iter=15))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                 

In [99]:
# Calculate metrics
metrics(y_test, tuned_cat_clf.predict(X_test))

accuracy: 0.854
recall: 0.505
f1: 0.595
roc_auc: 0.727


## GridSearch Catboost

In [104]:
# Lets try a small gridsearch on Catboost


model = CatBoostClassifier(loss_function='Logloss')

grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

grid_search_result = model.grid_search(grid,
                                       X=preprocessor.fit_transform(X_train),
                                       y=y_train,
                                       plot=True)
# cat_params = {
#     'cat_grid__iteratoins': [1, 2, 3, 4, 5]
# }
# cat_grid = CatBoostClassifier(iterations=)
# cat_grid_clf = GridSearchCV(cat_grid, param_grid=cat_params, scoring='f1')
# output = XG_grid_clf.fit(X_train, y_train)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6711518	test: 0.6709544	best: 0.6709544 (0)	total: 22.2ms	remaining: 22.1s
1:	learn: 0.6517267	test: 0.6513463	best: 0.6513463 (1)	total: 37ms	remaining: 18.5s
2:	learn: 0.6334321	test: 0.6327454	best: 0.6327454 (2)	total: 57.1ms	remaining: 19s
3:	learn: 0.6147696	test: 0.6138259	best: 0.6138259 (3)	total: 74.2ms	remaining: 18.5s
4:	learn: 0.5995647	test: 0.5982727	best: 0.5982727 (4)	total: 111ms	remaining: 22.1s
5:	learn: 0.5847253	test: 0.5830726	best: 0.5830726 (5)	total: 131ms	remaining: 21.6s
6:	learn: 0.5702219	test: 0.5683291	best: 0.5683291 (6)	total: 155ms	remaining: 21.9s
7:	learn: 0.5569746	test: 0.5548566	best: 0.5548566 (7)	total: 166ms	remaining: 20.6s
8:	learn: 0.5450996	test: 0.5427106	best: 0.5427106 (8)	total: 200ms	remaining: 22s
9:	learn: 0.5345914	test: 0.5319697	best: 0.5319697 (9)	total: 240ms	remaining: 23.7s
10:	learn: 0.5236382	test: 0.5208255	best: 0.5208255 (10)	total: 313ms	remaining: 28.2s
11:	learn: 0.5147606	test: 0.5116872	best: 0.5116872 (

## Gridsearch XGBoost Models
### Max_Depth

In [57]:
XG_params = {
    'gb_clf__max_depth': [1, 2, 3, 4, 5]
}
XG_grid_clf = GridSearchCV(XG_clf, param_grid=XG_params, scoring='f1')
output = XG_grid_clf.fit(X_train, y_train)

In [59]:
metrics(y_test, XG_grid_clf.predict(X_test))

accuracy: 0.853
recall: 0.510
f1: 0.596
roc_auc: 0.728


Just an initial Gridsearch to check
   - The time needed to run
   - How much the F1 score will increase

I've noted that it takes a VERY long time to run gridsearch with XGBoost, so in the interest of time, I'm going to perform less exhaustive grid searches.

## Learning Rate

In [60]:
XG_params_lr = {
    'gb_clf__max_depth': [3],
    'gb_clf__learning_rate': [0.1, 0.3, 0.5, 0.7]
}
XG_grid_clf_lr = GridSearchCV(XG_clf, param_grid=XG_params_lr, scoring='f1')
output = XG_grid_clf_lr.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
print(output.best_params_)
metrics(y_test, XG_grid_clf_lr.predict(X_test))

## Min Split loss

In [None]:
XG_params_msl = {
    'gb_clf__max_depth': [3],
    'gb_clf__learning_rate': [0.5],
    'gb_clf__min_split_loss': [0, 5, 10]
}
XG_grid_clf_msl = GridSearchCV(XG_clf, param_grid=XG_params_msl, scoring='f1')
output = XG_grid_clf_msl.fit(X_train, y_train)
print(output.best_params_)
metrics(y_test, XG_grid_clf_msl.predict(X_test))