In [1]:
#Gradient Boosting: fits a sequence of models, where each successive model is trained to reduce the
#errors of the previous ones.

#Gradient Boosting | Gradient Boosting Machine | GBM

#Gradient Boosting is an ensemble algorithm that builds boosted decision trees sequentially, fitting each new tree to the gradient of the error of the current ensemble.

#There are many implementations of the gradient boosting algorithm available in Python.

#Let's try a classifier with each of them and compare their 'speed' and 'accuracy'.

from time import perf_counter, time

import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import make_classification

# Synthetic classification data set: 100,000 samples with 20 features,
# of which 15 are informative and 5 are redundant; the seed is fixed.
X, y = make_classification(
    n_samples = 100_000,
    n_features = 20,
    n_informative = 15,
    n_redundant = 5,
    random_state = 0,
)

In [3]:
# Peek at the feature matrix; NumPy abbreviates the repr of large arrays with "...".
X

array([[-0.07328048, -0.72628836,  6.64048251, ...,  0.39476427,
        -4.46968002,  5.67254441],
       [ 4.54645272, -0.6500376 , -5.45173152, ..., -2.7305627 ,
         6.20089877, -1.20162908],
       [ 3.17903912, -1.36282586, -7.05044747, ...,  0.43418885,
         1.50405227, -0.14015885],
       ...,
       [ 3.11720532, -1.9218521 , -7.48327115, ..., -0.86394898,
        -0.63529205, -2.32373136],
       [-3.35282454,  1.77252544,  1.96927206, ...,  4.32916189,
        -1.40119583,  1.43359914],
       [ 0.38985667, -1.66219807,  4.87341974, ..., -0.27866353,
         0.72928873, -2.83408834]])

In [4]:
# Confirm the dimensions of the feature matrix as (rows, columns).
X.shape

(100000, 20)

In [5]:
#Two dictionaries, keyed by model name, record each model's accuracy and run time

accuracy, speed = {}, {}

In [6]:
#Scikit-learn implementation

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [7]:
# Benchmark scikit-learn's GradientBoostingClassifier with repeated stratified
# 5-fold cross-validation (2 repeats -> 10 fits), recording mean accuracy and run time.
# The model is seeded like the data and the CV splitter so reruns are reproducible.
model = GradientBoostingClassifier(random_state = 0)

# perf_counter() is monotonic, so a system clock adjustment during this long run
# cannot distort the measured duration.
start = perf_counter()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 0)
score = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

speed['GradientBoosting'] = np.round(perf_counter() - start, 3)
accuracy['GradientBoosting'] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['GradientBoosting']}\nStd: {np.std(score):.3f}\nRun time: {speed['GradientBoosting']}s"
)

Mean Accuracy: 0.894
Std: 0.003
Run time: 470.614s


In [8]:
#Alternative

#HistGradientBoostingClassifier is a histogram-based implementation inspired by LightGBM (LGBM); it is much faster than the classic GBM above.

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

In [9]:
# Benchmark scikit-learn's histogram-based gradient boosting with repeated stratified
# 5-fold cross-validation (2 repeats -> 10 fits), recording mean accuracy and run time.
# The seed fixes the random train/validation split used by early stopping (which the
# estimator enables automatically on large data sets), so reruns are reproducible.
model = HistGradientBoostingClassifier(random_state = 0)

# perf_counter() is monotonic, so a system clock adjustment during the run
# cannot distort the measured duration.
start = perf_counter()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 0)
score = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

speed['HistGradientBoosting'] = np.round(perf_counter() - start, 3)
accuracy['HistGradientBoosting'] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['HistGradientBoosting']}\nStd: {np.std(score):.3f}\nRun time: {speed['HistGradientBoosting']}s"
)

Mean Accuracy: 0.963
Std: 0.002
Run time: 29.884s


In [10]:
#XGBoost or XGBM

from xgboost import XGBClassifier

In [11]:
# Benchmark XGBoost's scikit-learn wrapper with repeated stratified 5-fold
# cross-validation (2 repeats -> 10 fits), recording mean accuracy and run time.
# The model is seeded explicitly, matching the data and the CV splitter.
model = XGBClassifier(random_state = 0)

# perf_counter() is monotonic, so a system clock adjustment during this long run
# cannot distort the measured duration.
start = perf_counter()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 0)
score = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

speed['XGB'] = np.round(perf_counter() - start, 3)
accuracy['XGB'] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['XGB']}\nStd: {np.std(score):.3f}\nRun time: {speed['XGB']}s"
)

Mean Accuracy: 0.976
Std: 0.001
Run time: 396.637s


In [13]:
#LGBM or LightGBM

#LightGBM was developed by Microsoft to improve the speed and accuracy of gradient boosting machines.
#It improves performance, increasing both the speed and the accuracy of models.

from lightgbm import LGBMClassifier

In [14]:
# Benchmark LightGBM's scikit-learn wrapper with repeated stratified 5-fold
# cross-validation (2 repeats -> 10 fits), recording mean accuracy and run time.
# The model is seeded explicitly, matching the data and the CV splitter.
model = LGBMClassifier(random_state = 0)

# perf_counter() is monotonic, so a system clock adjustment during the run
# cannot distort the measured duration.
start = perf_counter()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 0)
score = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

speed['LGBM'] = np.round(perf_counter() - start, 3)
accuracy['LGBM'] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['LGBM']}\nStd: {np.std(score):.3f}\nRun time: {speed['LGBM']}s"
)

Mean Accuracy: 0.963
Std: 0.001
Run time: 16.9s


In [15]:
#CatGBM or CatBoost
#It also supports categorical input variables, and it aims to improve both accuracy and speed.

from catboost import CatBoostClassifier

In [16]:
# Benchmark CatBoost with repeated stratified 5-fold cross-validation
# (2 repeats -> 10 fits), recording mean accuracy and run time.
# random_state seeds the model like the data and the CV splitter; verbose = 0
# silences the per-iteration training log that would otherwise be emitted for every fit.
model = CatBoostClassifier(random_state = 0, verbose = 0)

# perf_counter() is monotonic, so a system clock adjustment during this long run
# cannot distort the measured duration.
start = perf_counter()
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 2, random_state = 0)
score = cross_val_score(model, X, y, scoring = 'accuracy', cv = cv, n_jobs = -1)

speed['CatBoost'] = np.round(perf_counter() - start, 3)
accuracy['CatBoost'] = np.mean(score).round(3)

print(
    f"Mean Accuracy: {accuracy['CatBoost']}\nStd: {np.std(score):.3f}\nRun time: {speed['CatBoost']}s"
)

Mean Accuracy: 0.983
Std: 0.001
Run time: 493.179s


In [17]:
# Rank the models from most to least accurate.
print("Accuracy:")
dict(sorted(accuracy.items(), key = lambda item: item[1], reverse = True))

Accuracy:


{'CatBoost': 0.983,
 'XGB': 0.976,
 'HistGradientBoosting': 0.963,
 'LGBM': 0.963,
 'GradientBoosting': 0.894}

In [18]:
# Rank the models from fastest to slowest (ascending run time).
print("Speed:")
dict(sorted(speed.items(), key = lambda item: item[1]))

Speed:


{'LGBM': 16.9,
 'HistGradientBoosting': 29.884,
 'XGB': 396.637,
 'GradientBoosting': 470.614,
 'CatBoost': 493.179}