In [24]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

import numpy as np

# ___Gradient Boosted Decision Trees (GBDT)___
-------------------

In [1]:
# This is another variant of tree based ensemble methods.
# Like random forests, gradient boosted decision trees use an ensemble of trees.

In [2]:
# Unlike random forests where the models are a synthesis of random trees that make predictions independently,
# GBDTs are made of trees that correct mistakes made by previous trees!
# GBDT models use lots of shallow trees (known as weak learners) built non-randomly, to ensure fewer and fewer mistakes are made as we add more trees.
# Number of estimators is an important parameter in controlling model complexity.
# There's another parameter called the learning rate, which controls how thw algorithm builds the correction trees.
# When the learning rate is high, each shallow tree expends more effort on correcting the mistakes made by the predecessor and vice versa.

In [3]:
# Higher learning rates typically lead to more complex models with complex individual trees.

In [5]:
bcancer = load_breast_cancer()
train_x, test_x, train_y, test_y = train_test_split(bcancer.data, bcancer.target, train_size = 0.7)

In [18]:
gbClassifier = GradientBoostingClassifier(n_estimators = 200, learning_rate = 0.85, max_depth = 5, min_samples_split = 10,
                                          min_samples_leaf = 10).fit(train_x, train_y)

In [23]:
gbClassifier.score(test_x, test_y)

0.9707602339181286

In [30]:
for lrate in (0, 0.01, 0.025, 0.05, 0.075, 0.1, 0.2, 0.5, 0.75, 0.9, 1.0):

    gbClassifier = GradientBoostingClassifier(n_estimators = 200, learning_rate = lrate, max_depth = 5, min_samples_split = 10,
                                          min_samples_leaf = 10).fit(train_x, train_y)
    print("Learning rate: {:.2f}, Train set accuracy score {:.5f}, Test set accuracy score: {:.5f}".format(lrate,
                        gbClassifier.score(train_x, train_y), gbClassifier.score(test_x, test_y)))
    

Learning rate: 0.00, Train set accuracy score 0.63317, Test set accuracy score: 0.61404
Learning rate: 0.01, Train set accuracy score 0.98995, Test set accuracy score: 0.94152
Learning rate: 0.03, Train set accuracy score 1.00000, Test set accuracy score: 0.94737
Learning rate: 0.05, Train set accuracy score 1.00000, Test set accuracy score: 0.94152
Learning rate: 0.07, Train set accuracy score 1.00000, Test set accuracy score: 0.95322
Learning rate: 0.10, Train set accuracy score 1.00000, Test set accuracy score: 0.97076
Learning rate: 0.20, Train set accuracy score 1.00000, Test set accuracy score: 0.95322
Learning rate: 0.50, Train set accuracy score 1.00000, Test set accuracy score: 0.96491
Learning rate: 0.75, Train set accuracy score 1.00000, Test set accuracy score: 0.96491
Learning rate: 0.90, Train set accuracy score 1.00000, Test set accuracy score: 0.95906
Learning rate: 1.00, Train set accuracy score 1.00000, Test set accuracy score: 0.97661


In [31]:
# Similar to other tree based models, GBDTs are also prone to overfitting.
# Model is likely overfitting as we increase the learning rate -> can be seen from the perfect predictions for the train set!