# Gradient Boosting and XGBoost

In this notebook I build baseline models with scikit-learn's Gradient Boosting classifier and with XGBoost.

In [15]:
# Importing Required Packages.
import warnings

import category_encoders as ce
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings(action='ignore')

# Load the survey features and their labels; both CSVs are indexed by respondent_id.
training_features = pd.read_csv(
    '../../Data/training_set_features.csv',
    index_col='respondent_id',
)
training_labels = pd.read_csv(
    '../../Data/training_set_labels.csv',
    index_col='respondent_id',
)

In [16]:
def metrics(y_test, _preds, _scores=None):
    """Print accuracy, recall, F1 and ROC AUC for binary predictions.

    Parameters
    ----------
    y_test : array-like
        True labels.
    _preds : array-like
        Predicted hard class labels, aligned with ``y_test``.
    _scores : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, ROC AUC is computed
        from these scores. When omitted, ROC AUC falls back to ``_preds``
        (the original behaviour); with hard labels the ROC curve has a single
        operating point, so that value reflects one decision threshold only.

    Returns
    -------
    None
        Each metric is printed on its own line, formatted to three decimals.
    """
    print('accuracy: {:0.3f}'.format(accuracy_score(y_test, _preds)))
    print('recall: {:0.3f}'.format(recall_score(y_test, _preds)))
    print('f1: {:0.3f}'.format(f1_score(y_test, _preds)))
    # Prefer ranking scores for ROC AUC when the caller supplies them.
    auc_input = _preds if _scores is None else _scores
    print('roc_auc: {:0.3f}'.format(roc_auc_score(y_test, auc_input)))

In [17]:
# Train/test split: hold out 33% of respondents, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    training_features,
    training_labels['h1n1_vaccine'],
    test_size=0.33,
    random_state=42,
)

## Preprocessing
All of the columns are categorical, but some are encoded as numbers and others as strings. We will want to handle these columns differently when imputing missing values.

- Numerical Categories
    - Use Sklearn's Iterative Imputer to fill in the missing values
- String Categories
    - Fill missing values with a new value: 'Unknown'
    - One hot encode the results
- String categories with 10 or more unique values
    - We will frequency code these instead, so we don't have an overwhelming amount of columns in the dataframe.

In [18]:
# Column groups consumed by the preprocessing transformers.
num_cols, ohe_cols, freq_cols = [], [], []

# Separate columns into numerical, one-hot (fewer than 10 levels) and
# frequency-encoded (10 or more levels) groups, preserving column order.
for col_name, col_values in training_features.items():
    if col_values.dtype in ('float64', 'int64'):
        bucket = num_cols
    elif col_values.nunique() < 10:
        bucket = ohe_cols
    else:
        bucket = freq_cols
    bucket.append(col_name)


In [19]:
# Numerical columns: fill missing values with IterativeImputer (at most 15 rounds).
num_transformer = Pipeline([
    ('num_imputer', IterativeImputer(max_iter=15)),
])

# Low-cardinality string columns: replace missing values with the literal
# 'Unknown', then one-hot encode; handle_unknown='ignore' avoids errors on
# levels that were not seen during fitting.
ohe_transformer = Pipeline([
    ('ohe_imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('oh_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# String columns with 10 or more unique values: encode each level by its
# normalized frequency, then fill anything still missing with 0.
# NOTE(review): min_group_size=.05 presumably pools levels below 5% of rows -
# confirm against the category_encoders documentation.
freq_transformer = Pipeline([
    ('freq_encoder', ce.count.CountEncoder(normalize=True, min_group_size=.05)),
    ('freq_imputer', SimpleImputer(strategy='constant', fill_value=0)),
])


In [20]:
# Route each column group through its own transformer and combine the outputs.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('ohe', ohe_transformer, ohe_cols),
    ('freq', freq_transformer, freq_cols),
])

# Sklearn Gradient Boost
Here I will create a baseline gradient boosting model to compare future models to.

In [21]:
# Baseline: shared preprocessing followed by a default GradientBoostingClassifier.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('gb_clf', GradientBoostingClassifier()),
])

In [22]:
# Cross-validate the baseline on the training split (default folds and scoring).
cross_validate(estimator=clf, X=X_train, y=y_train)

{'fit_time': array([7.06111097, 7.50069499, 6.69995785, 6.82990289, 6.00017095]),
 'score_time': array([0.09743404, 0.10969114, 0.10481215, 0.10366416, 0.08387303]),
 'test_score': array([0.85331098, 0.84772283, 0.85219335, 0.85159307, 0.8529905 ])}

In [23]:
# Fit the baseline on the training split, then score its hard predictions
# on the held-out test split. The sklearn.metrics functions used by
# `metrics` are imported in the notebook's import cell.
clf.fit(X_train, y_train)
_preds = clf.predict(X_test)

metrics(y_test, _preds)

accuracy: 0.854
recall: 0.486
f1: 0.585
roc_auc: 0.719


accuracy: 0.854
recall: 0.486
f1: 0.585
roc_auc: 0.719

## XGBoost
Let's try a baseline model for XGBoost as well.


In [27]:
# Same preprocessing as the baseline, with an XGBoost classifier evaluated on AUC.
XG_clf = Pipeline([
    ('preprocessor', preprocessor),
    ('gb_clf', XGBClassifier(eval_metric='auc')),
])


In [28]:
# Cross-validate the XGBoost pipeline on the training split (default folds and scoring).
cross_validate(estimator=XG_clf, X=X_train, y=y_train)

{'fit_time': array([7.23860407, 6.64287996, 8.49426484, 7.9869628 , 6.65491295]),
 'score_time': array([0.13163996, 0.25379586, 0.13175607, 0.23549294, 0.12765408]),
 'test_score': array([0.85079631, 0.84353171, 0.84353171, 0.84041364, 0.84684181])}

In [29]:
# Fit on the training split and score hard predictions on the held-out test split.
boost_preds = XG_clf.fit(X_train, y_train).predict(X_test)
metrics(y_test, boost_preds)

accuracy: 0.845
recall: 0.513
f1: 0.584
roc_auc: 0.724
