# Boosting
Example implementation of boosting algorithms, including:
* Adaboost
* Gradient boosting
* XGboost

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn import metrics
import xgboost as xgb

%cd "G:/Archive"

G:\Archive


## Data Prep

In [13]:
#load data
data = pd.read_csv("data/wine quality red.csv", header = 0)

#aggregate into binary classification problem
data["quality"] = pd.Categorical(np.where(data["quality"] > 5, "above_avg", "below_avg"))

#split data
X = data.iloc[:, :11]
y = data.iloc[:, 11]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

## Searching Optimal Parameters
### Adaboost

In [8]:
#tuning parameters: n_estimators, learning_rate
#10-fold cv with accuracy as target metrics
grid1 = {"n_estimators": 50 * np.arange(1, 5),
         "learning_rate": 0.1 * np.arange(1, 11)}
model1 = GridSearchCV(AdaBoostClassifier(), param_grid = grid1, cv = 10, scoring = "accuracy")
model1.fit(X_train, y_train)

#show tuning results
print("Best score: ", model1.best_score_)
print("Corresponding parms: ", model1.best_params_)
print("Model description: ", model1.best_estimator_)

#predict on test set
pred1 = model1.predict(X_test)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred1)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred1, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred1, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred1, pos_label = "above_avg", average = "binary")))

Best score:  0.7536663385826772
Corresponding parms:  {'learning_rate': 0.8, 'n_estimators': 100}
Model description:  AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.8,
                   n_estimators=100, random_state=None)
Accuracy: 0.7688
Recall: 0.8023
Precision: 0.7753
F1 score: 0.7886


### Gradient Descend

In [10]:
#tuning parameters: n_estimators, learning_rate, max_depth
#minimum node size of tree is set as default
#10-fold cv with accuracy as target metrics
grid2 = {"n_estimators": 50 * np.arange(1, 5),
         "learning_rate": 0.1 * np.arange(1, 11),
         "max_depth": range(1, 5)}
model2 = GridSearchCV(GradientBoostingClassifier(), param_grid = grid2, cv = 10, scoring = "accuracy")
model2.fit(X_train, y_train)

#show tuning results
print("Best score: ", model2.best_score_)
print("Corresponding parms: ", model2.best_params_)
print("Model description: ", model2.best_estimator_)

#predict on test set
pred2 = model2.predict(X_test)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred2)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred2, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred2, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred2, pos_label = "above_avg", average = "binary")))

Best score:  0.7974778543307087
Corresponding parms:  {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
Model description:  GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.2, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
Accuracy: 0.8000
Recall: 0.8081
Precision: 0.8176
F1 score: 0.8129


### XGboost

In [16]:
#tuning parameters: n_estimators, learning_rate, max_depth
#minimum node size of tree is set as default
#10-fold cv with accuracy as target metrics
#introducing randomness by tree using parameters: subsample and colsample_bytree
#gbtree as the base estimator with no regularization in the example
grid3 = {"n_estimators": 50 * np.arange(1, 5),
         "learning_rate": 0.1 * np.arange(1, 11), #eta in XGboost
         "max_depth": range(1, 5)}
model3 = GridSearchCV(xgb.XGBClassifier(subsample = 0.5, colsample_bytree = 0.5), param_grid = grid3, cv = 10, scoring = "accuracy")
model3.fit(X_train, y_train)

#show tuning results
print("Best score: ", model3.best_score_)
print("Corresponding parms: ", model3.best_params_)
print("Model description: ", model3.best_estimator_)

#predict on test set
pred3 = model3.predict(X_test)

#show prediction performance
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, pred3)))
print("Recall: {0:.4f}".format(metrics.recall_score(y_test, pred3, pos_label = "above_avg", average = "binary")))
print("Precision: {0:.4f}".format(metrics.precision_score(y_test, pred3, pos_label = "above_avg", average = "binary")))
print("F1 score: {0:.4f}".format(metrics.f1_score(y_test, pred3, pos_label = "above_avg", average = "binary")))

Best score:  0.7849840059055119
Corresponding parms:  {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200}
Model description:  XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.5, verbosity=1)
Accuracy: 0.8125
Recall: 0.8198
Precision: 0.8294
F1 score: 0.8246


## Visualization