# Gradient Boosting and XGBoost

I will be creating models using scikit-learn's gradient boosting, XGBoost, and CatBoost.

In [68]:
# Importing Required Packages.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Load the training labels and features; both files are indexed by respondent_id.
training_labels = pd.read_csv(
    '../../Data/training_set_labels.csv',
    index_col='respondent_id',
)
training_features = pd.read_csv(
    '../../Data/training_set_features.csv',
    index_col='respondent_id',
)

In [69]:
def metrics(y_test, _preds, _scores=None):
    """Print accuracy, recall, F1 and ROC AUC for a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluation set.
    _preds : array-like
        Hard class predictions (e.g. the output of ``predict``). Used for
        accuracy, recall and F1.
    _scores : array-like, optional
        Continuous scores for the positive class (e.g.
        ``predict_proba(X)[:, 1]``). When provided, ROC AUC is computed from
        these scores. When omitted, ROC AUC falls back to ``_preds``, which
        keeps the previous behaviour but evaluates the curve at a single
        threshold only.

    Returns
    -------
    None
        The four metrics are printed, each rounded to three decimals.
    """
    # ROC AUC is a ranking metric: prefer real scores when the caller has them.
    auc_input = _preds if _scores is None else _scores

    print('accuracy: {:0.3f}'.format(accuracy_score(y_test, _preds)))
    print('recall: {:0.3f}'.format(recall_score(y_test, _preds)))
    print('f1: {:0.3f}'.format(f1_score(y_test, _preds)))
    print('roc_auc: {:0.3f}'.format(roc_auc_score(y_test, auc_input)))

In [70]:
# Train/test split: hold out 33% of the rows, predicting the h1n1_vaccine label.
X_train, X_test, y_train, y_test = train_test_split(
    training_features,
    training_labels['h1n1_vaccine'],
    test_size=0.33,
    random_state=42,
)

## Preprocessing
For the preprocessing, all of the columns are categorical; however, some of them are numerically encoded and some of them are strings. We will want to handle these columns differently when imputing missing values.

- Numerical Categories
    - Use Sklearn's Iterative Imputer to fill in the missing values
- String Categories
    - Fill missing values with a new value: 'Unknown'
    - One hot encode the results
- String categories with 10 or more unique values
    - We will frequency encode these instead, so we don't have an overwhelming number of columns in the dataframe.

In [71]:
# Disabled experiment: uncommenting the next line would drop every object-dtype
# (string) column from the features before modelling.
#training_features = training_features.select_dtypes(exclude='object')

In [72]:
# Display the raw feature table for a quick look at the columns and missing values.
training_features

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [73]:
# Preprocessing columns: route every feature to exactly one of three groups.
all_columns = list(training_features.columns)

# Numerical group: columns stored as float64 or int64.
num_cols = [
    col for col in all_columns
    if training_features[col].dtype in ['float64', 'int64']
]

# The remaining columns are split by cardinality. The comparison is strict, so
# a column with exactly 10 distinct values lands in the frequency-encoded group.
non_numeric_cols = [col for col in all_columns if col not in num_cols]
ohe_cols = [col for col in non_numeric_cols if training_features[col].nunique() < 10]
freq_cols = [col for col in non_numeric_cols if col not in ohe_cols]



In [74]:
# Numerical columns: fill NaN values with IterativeImputer (max_iter=15).
num_transformer = Pipeline([
    ('num_imputer', IterativeImputer(max_iter=15)),
])

# Low-cardinality string columns: replace missing values with the constant
# 'Unknown', then one-hot encode the result.
ohe_transformer = Pipeline([
    ('ohe_imputer', SimpleImputer(fill_value='Unknown', strategy='constant')),
    ('oh_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Higher-cardinality string columns: encode each category with a normalized
# count, then fill anything still missing with 0.
freq_transformer = Pipeline([
    ('freq_encoder', ce.count.CountEncoder(min_group_size=.05, normalize=True)),
    ('freq_imputer', SimpleImputer(fill_value=0, strategy='constant')),
])



In [75]:
# Assemble the three column-wise transformers into a single preprocessor.
# Each entry is (name, transformer, columns it applies to).
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('ohe', ohe_transformer, ohe_cols),
    ('freq', freq_transformer, freq_cols),
])

# Sklearn Gradient Boost
Here I will create a baseline gradient boosting model to compare future models to.

In [76]:
# Baseline: shared preprocessor followed by scikit-learn's gradient boosting.
# random_state is pinned (same seed as the train/test split) so that repeated
# runs of the notebook build the same model.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gb_clf', GradientBoostingClassifier(random_state=42))
])

In [77]:
# Cross-validate the baseline pipeline on the training split only.
cross_validate(estimator=clf, X=X_train, y=y_train)

{'fit_time': array([6.18462467, 6.34219265, 7.08049297, 6.53098011, 5.67745829]),
 'score_time': array([0.0690763 , 0.082335  , 0.07787251, 0.06914449, 0.06008887]),
 'test_score': array([0.85331098, 0.84772283, 0.85219335, 0.85159307, 0.8529905 ])}

In [78]:
# Fit the baseline on the training split and score it on the held-out split.
# (The sklearn.metrics functions used by metrics() are imported in the first cell.)
clf.fit(X_train, y_train)
_preds = clf.predict(X_test)

metrics(y_test, _preds)

accuracy: 0.854
recall: 0.486
f1: 0.585
roc_auc: 0.719


Without Objects:
accuracy: 0.853
recall: 0.489
f1: 0.585
roc_auc: 0.720

With Objects:

accuracy: 0.854
recall: 0.486
f1: 0.585
roc_auc: 0.719

## XGBoost
Let's try a baseline model for XGBoost as well.


In [79]:
# XGBoost baseline: same preprocessor, XGBClassifier evaluated with AUC.
# The step keeps the name 'gb_clf'; the grid-search cells address it by that name.
XG_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('gb_clf', XGBClassifier(eval_metric='auc')),
])


In [80]:
# Cross-validate the XGBoost pipeline on the training split only.
cross_validate(estimator=XG_clf, X=X_train, y=y_train)

{'fit_time': array([4.25926065, 4.32036829, 4.34091234, 4.33380318, 3.87927485]),
 'score_time': array([0.07836843, 0.07836819, 0.07539225, 0.07936025, 0.07539177]),
 'test_score': array([0.85079631, 0.84353171, 0.84353171, 0.84041364, 0.84684181])}

In [81]:
# Fit on the training split, then report metrics on the held-out split.
XG_clf.fit(X=X_train, y=y_train)
boost_preds = XG_clf.predict(X=X_test)
metrics(y_test, boost_preds)

accuracy: 0.845
recall: 0.513
f1: 0.584
roc_auc: 0.724


# Catboost
Finally, since all of our data is categorical, I'd like to try catboost as well.

In [82]:
# CatBoost baseline: same preprocessor, default CatBoostClassifier trained on GPU.
cat_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('catboost_clf', CatBoostClassifier(task_type='GPU')),
])

In [83]:
# Fit the CatBoost baseline on the training split.
cat_clf.fit(X=X_train, y=y_train)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.028766
0:	learn: 0.6701202	total: 54ms	remaining: 53.9s
1:	learn: 0.6481042	total: 97.9ms	remaining: 48.9s
2:	learn: 0.6282114	total: 146ms	remaining: 48.4s
3:	learn: 0.6096395	total: 189ms	remaining: 47.2s
4:	learn: 0.5925618	total: 229ms	remaining: 45.6s
5:	learn: 0.5767983	total: 263ms	remaining: 43.6s
6:	learn: 0.5621145	total: 299ms	remaining: 42.4s
7:	learn: 0.5485969	total: 340ms	remaining: 42.1s
8:	learn: 0.5361738	total: 375ms	remaining: 41.3s
9:	learn: 0.5243772	total: 409ms	remaining: 40.5s
10:	learn: 0.5129884	total: 446ms	remaining: 40.1s
11:	learn: 0.5034961	total: 480ms	remaining: 39.5s
12:	learn: 0.4935413	total: 515ms	remaining: 39.1s
13:	learn: 0.4847750	total: 554ms	remaining: 39s
14:	learn: 0.4765381	total: 594ms	remaining: 39s
15:	learn: 0.4689992	total: 626ms	remaining: 38.5s
16:	learn: 0.4621276	total: 657ms	remaining: 38s
17:	learn: 0.4557646	total: 692ms	remaining: 37.7s
18:	learn: 0.4496457	total: 728ms	remaining: 37.6s
19:	learn: 0.4439

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   IterativeImputer(max_iter=15))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                 

In [84]:
# Score the CatBoost baseline on the held-out split.
cat_preds = cat_clf.predict(X_test)
metrics(y_test, cat_preds)

accuracy: 0.853
recall: 0.498
f1: 0.590
roc_auc: 0.723


Catboost did about as well as XGboost did, but the biggest thing I noticed is that it took 1/4 of the amount of time to train. I think this would be a much better model type to use going forward.

## Catboost Tuning

In [85]:
# Create pipeline with tuned params.
tuned_cat_clf = Pipeline([
    ('preprocessor', preprocessor),
    # Switch the eval metric from "logloss" to "AUC" and set the learning rate
    # explicitly.
    ('catboost_clf', CatBoostClassifier(
        eval_metric='AUC',
        learning_rate=0.03,
        task_type='GPU',
    )),
])

In [86]:
# Train the tuned CatBoost pipeline on the training split.
tuned_cat_clf.fit(X=X_train, y=y_train)

0:	learn: 0.8362622	total: 40.1ms	remaining: 40s
1:	learn: 0.8429240	total: 79.2ms	remaining: 39.5s
2:	learn: 0.8454295	total: 119ms	remaining: 39.4s
3:	learn: 0.8467833	total: 158ms	remaining: 39.4s
4:	learn: 0.8469359	total: 194ms	remaining: 38.6s
5:	learn: 0.8475717	total: 230ms	remaining: 38.2s
6:	learn: 0.8478817	total: 268ms	remaining: 38s
7:	learn: 0.8475021	total: 311ms	remaining: 38.5s
8:	learn: 0.8475730	total: 353ms	remaining: 38.9s
9:	learn: 0.8482671	total: 391ms	remaining: 38.7s
10:	learn: 0.8498571	total: 429ms	remaining: 38.5s
11:	learn: 0.8498525	total: 464ms	remaining: 38.2s
12:	learn: 0.8501118	total: 501ms	remaining: 38s
13:	learn: 0.8498805	total: 544ms	remaining: 38.3s
14:	learn: 0.8495913	total: 579ms	remaining: 38s
15:	learn: 0.8502429	total: 613ms	remaining: 37.7s
16:	learn: 0.8500830	total: 651ms	remaining: 37.6s
17:	learn: 0.8504451	total: 687ms	remaining: 37.5s
18:	learn: 0.8515907	total: 725ms	remaining: 37.4s
19:	learn: 0.8518794	total: 770ms	remaining: 37

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   IterativeImputer(max_iter=15))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                 

In [87]:
# Score the tuned CatBoost pipeline on the held-out split.
tuned_cat_preds = tuned_cat_clf.predict(X_test)
metrics(y_test, tuned_cat_preds)

accuracy: 0.853
recall: 0.498
f1: 0.590
roc_auc: 0.723


## GridSearch Catboost - Learning Rate

In [91]:
# Small grid search on CatBoost: only the learning rate is varied.
cat_grid = Pipeline([
    ('preprocessor', preprocessor),
    # Settings shared by every candidate: log-loss objective, AUC as the eval
    # metric, GPU training, 500 iterations.
    ('catboost_clf', CatBoostClassifier(
        iterations=500,
        loss_function='Logloss',
        eval_metric='AUC',
        task_type='GPU',
    )),
])

# Keys follow the '<step name>__<parameter>' convention of the pipeline above.
cat_grid_params = {
    'catboost_clf__learning_rate': [0.001, 0.03, 0.05, 0.1, 0.3],
}

# Candidates are ranked by ROC AUC.
cat_grid_clf = GridSearchCV(estimator=cat_grid, param_grid=cat_grid_params, scoring='roc_auc')
output = cat_grid_clf.fit(X_train, y_train)


0:	learn: 0.8404095	total: 40.3ms	remaining: 20.1s
1:	learn: 0.8448557	total: 77.9ms	remaining: 19.4s
2:	learn: 0.8476370	total: 113ms	remaining: 18.7s
3:	learn: 0.8483959	total: 150ms	remaining: 18.6s
4:	learn: 0.8480303	total: 186ms	remaining: 18.5s
5:	learn: 0.8483180	total: 222ms	remaining: 18.3s
6:	learn: 0.8484684	total: 259ms	remaining: 18.2s
7:	learn: 0.8483835	total: 299ms	remaining: 18.4s
8:	learn: 0.8483927	total: 337ms	remaining: 18.4s
9:	learn: 0.8481142	total: 373ms	remaining: 18.3s
10:	learn: 0.8498603	total: 411ms	remaining: 18.3s
11:	learn: 0.8501020	total: 447ms	remaining: 18.2s
12:	learn: 0.8502600	total: 484ms	remaining: 18.1s
13:	learn: 0.8501071	total: 526ms	remaining: 18.3s
14:	learn: 0.8497953	total: 561ms	remaining: 18.2s
15:	learn: 0.8497611	total: 597ms	remaining: 18s
16:	learn: 0.8495025	total: 632ms	remaining: 18s
17:	learn: 0.8496693	total: 669ms	remaining: 17.9s
18:	learn: 0.8499210	total: 705ms	remaining: 17.8s
19:	learn: 0.8501513	total: 741ms	remaining

In [89]:
# Show the learning rate selected by the CatBoost grid search.
cat_grid_clf.best_params_

{'catboost_clf__learning_rate': 0.1}

In [90]:
# Score the grid search on the held-out split.
cat_grid_preds = cat_grid_clf.predict(X_test)
metrics(y_test, cat_grid_preds)

accuracy: 0.855
recall: 0.489
f1: 0.589
roc_auc: 0.721


## More Tuned Catboost

In [40]:
# CatBoost pipeline with a larger set of hand-picked hyperparameters.
tuned_cat_clf2 = Pipeline([
    ('preprocessor', preprocessor),
    ('catboost_clf2', CatBoostClassifier(
        # Training setup
        task_type='GPU',
        eval_metric='AUC',
        iterations=500,
        learning_rate=0.05745075659543725,
        # Tree shape
        grow_policy='Lossguide',
        max_depth=6,
        min_data_in_leaf=7,
        max_bin=5,
        # Regularization / randomization
        l2_leaf_reg=11.323094517862078,
        random_strength=4,
        bagging_temperature=8,
        # Class weighting
        auto_class_weights='Balanced',
    )),
])

'cat_grid__learning_rate': [0.001, 0.3],
'cat_grid__random_strength': [1, 10],
'cat_grid__max_bin': [4, 5, 6, 8, 10, 20, 30],
'cat_grid__grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide'],
'cat_grid__max_depth': [2, 3, 4, 5, 10],
'cat_grid__auto_class_weights': ['Balanced', 'SqrtBalanced'],

In [28]:
# Train the second tuned CatBoost pipeline on the training split.
tuned_cat_clf2.fit(X=X_train, y=y_train)

0:	learn: 0.7881560	total: 17.7ms	remaining: 8.81s
1:	learn: 0.8142965	total: 34.5ms	remaining: 8.58s
2:	learn: 0.8278958	total: 48.3ms	remaining: 8s
3:	learn: 0.8320541	total: 62.9ms	remaining: 7.8s
4:	learn: 0.8344122	total: 81.8ms	remaining: 8.1s
5:	learn: 0.8363422	total: 96.9ms	remaining: 7.98s
6:	learn: 0.8358417	total: 111ms	remaining: 7.85s
7:	learn: 0.8397007	total: 126ms	remaining: 7.77s
8:	learn: 0.8420074	total: 140ms	remaining: 7.66s
9:	learn: 0.8429627	total: 154ms	remaining: 7.57s
10:	learn: 0.8433807	total: 172ms	remaining: 7.63s
11:	learn: 0.8437379	total: 186ms	remaining: 7.55s
12:	learn: 0.8435700	total: 200ms	remaining: 7.47s
13:	learn: 0.8434466	total: 214ms	remaining: 7.41s
14:	learn: 0.8431621	total: 228ms	remaining: 7.36s
15:	learn: 0.8443248	total: 241ms	remaining: 7.3s
16:	learn: 0.8446526	total: 257ms	remaining: 7.31s
17:	learn: 0.8457470	total: 273ms	remaining: 7.31s
18:	learn: 0.8465672	total: 289ms	remaining: 7.31s
19:	learn: 0.8471351	total: 304ms	remaini

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('num_imputer',
                                                                   IterativeImputer(max_iter=15))]),
                                                  ['h1n1_concern',
                                                   'h1n1_knowledge',
                                                   'behavioral_antiviral_meds',
                                                   'behavioral_avoidance',
                                                   'behavioral_face_mask',
                                                   'behavioral_wash_hands',
                                                   'behavioral_large_gatherings',
                                                   'behavioral_outside_home',
                                                   'behavioral_touch_face',
                                                 

In [29]:
# Score the second tuned CatBoost pipeline on the held-out split.
tuned_cat2_preds = tuned_cat_clf2.predict(X_test)
metrics(y_test, tuned_cat2_preds)

accuracy: 0.804
recall: 0.742
f1: 0.616
roc_auc: 0.781


{'preprocessor': {}, 'catboost_clf2': {}}

## Gridsearch XGBoost Models
### Max_Depth

In [62]:
# Grid search over tree depth for the XGBoost pipeline, ranked by F1.
# Requires XG_clf from the XGBoost baseline cell to exist in the kernel.
XG_params = {'gb_clf__max_depth': [1, 2, 3, 4, 5]}
XG_grid_clf = GridSearchCV(estimator=XG_clf, param_grid=XG_params, scoring='f1')
output = XG_grid_clf.fit(X_train, y_train)

NameError: name 'XG_clf' is not defined

In [59]:
# Score the max_depth grid search on the held-out split.
xg_grid_preds = XG_grid_clf.predict(X_test)
metrics(y_test, xg_grid_preds)

accuracy: 0.853
recall: 0.510
f1: 0.596
roc_auc: 0.728


Just an initial Gridsearch to check
   - The time needed to run
   - How much the F1 score will increase

I've noted that it takes a VERY long time to run gridsearch with XGBoost, so in the interest of time, I'm going to perform less exhaustive grid searches.

### Learning Rate

In [60]:
# Grid search over the learning rate with max_depth held at 3, ranked by F1.
XG_params_lr = {
    'gb_clf__learning_rate': [0.1, 0.3, 0.5, 0.7],
    'gb_clf__max_depth': [3],
}
XG_grid_clf_lr = GridSearchCV(estimator=XG_clf, param_grid=XG_params_lr, scoring='f1')
output = XG_grid_clf_lr.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Report the winning learning rate and score the search on the held-out split.
# Read the result from XG_grid_clf_lr itself rather than the shared `output`
# variable, which every grid-search cell overwrites; this cell then reports
# the right search regardless of which grid search ran last.
print(XG_grid_clf_lr.best_params_)
metrics(y_test, XG_grid_clf_lr.predict(X_test))

### Min Split Loss

In [None]:
# Grid search over min_split_loss with max_depth and learning rate held fixed,
# ranked by F1.
XG_params_msl = {
    'gb_clf__min_split_loss': [0, 5, 10],
    'gb_clf__learning_rate': [0.5],
    'gb_clf__max_depth': [3],
}
XG_grid_clf_msl = GridSearchCV(estimator=XG_clf, param_grid=XG_params_msl, scoring='f1')
output = XG_grid_clf_msl.fit(X_train, y_train)

# Report the selected parameters and the held-out metrics.
print(output.best_params_)
msl_preds = XG_grid_clf_msl.predict(X_test)
metrics(y_test, msl_preds)