In [1]:
########################################
# Ensemble Learning and Random Forests #
########################################
#
# Voting Classifiers
# Bagging and Pasting
#     Bagging and Pasting in Scikit-Learn
#     Out-of-Bag Evaluation
# Random Patches and Random Subspaces
# Rnadom Forests
#     Extra-Trees
#     Feature Importance
# Boosting
#     AdaBoost
#     Gradient Boosting
# Stacking

In [3]:
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [4]:
X, y = make_moons(n_samples=10000, noise=0.4)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression(solver='lbfgs')
rnd_clf = RandomForestClassifier(n_estimators=100)
svm_clf = SVC(gamma='auto', probability=True)

voting_clf = VotingClassifier(
estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
voting='soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini'...
                                        

In [16]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
    
# LogisticRegression 0.8325
# RandomForestClassifier 0.843
# SVC 0.865
# VotingClassifier 0.8615

LogisticRegression 0.8325
RandomForestClassifier 0.847
SVC 0.865
VotingClassifier 0.861


In [21]:
#######################
# Bagging and Pasting #
#######################

# Running the same training algorithm, but on different random subsets of the training set
# Bagging: "Bootstrap Aggregating" => Sampling, then replacing the samples
# Pasting: Sampling without replacing the samples
# Bagging and Pasting scale well because the different sample predictions can be run in parallel

# Bagging often results in better models and is generally preferred, 
# but if time and CPU power provide, you can corss validate to evaluate the best option between bagging and pasting

In [12]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [22]:
# Out-of-Bag Evaluation:
# The training data not used in any given bag can be used as a test set for that bag

bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
# 0.840875

0.840875

In [24]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)
# 0.8385

0.8385

In [26]:
# decision function returns the class probabilities for each training instance
# for example, first training instance has a 58.08% chance of belonging to the negative class

bag_clf.oob_decision_function_

array([[0.58083832, 0.41916168],
       [1.        , 0.        ],
       [0.01010101, 0.98989899],
       ...,
       [0.05339806, 0.94660194],
       [0.35263158, 0.64736842],
       [0.        , 1.        ]])

In [27]:
# Random Patches and Random Subspaces:

In [None]:
# Random Subspaces:
# Keeping all training instances, but sampling features
#  - Sampling features results inmore predictor diversity
#  - Results in more bias, but lower variance

In [None]:
# Random Patches: 
# training on random subset of features (not full instances)
#  - This is useful when dealing with high-dimensional inputs (e.g. images)

In [1]:
# Random Forests
# A random forest is an ensemble of decision trees
#! Typically max_samples is set to the size of the training set.

# Theoretically the same as building a BaggingClassifier and passing it a DecisionTreeClassifier, but RandomForestClassifier is more convenient and optimized

In [7]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=16,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [9]:
# RandomForest algorithm searches for best feature among random subset of features,
# Results in greater tree diversity, higher bias, lower variance

In [14]:
# BaggingClassifier roughly == RandomForestClassifier:
bag_clf = BaggingClassifier(
DecisionTreeClassifier(splitter="random",max_leaf_nodes=16), n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

In [15]:
# Extra Trees:
# aka Extremely Randomized Trees,
# thresholds for each feature are randomized, rather than searching for best possible thresholds

# This results in more bias, less variance
# Faster to calculate, since it is not necessary to calculate optimal feature thresholds for each node

# Hard to tell in advance whether ExtraTreesClassifier() will perform better than RandomForestClassifier()
# without trying both and cross-validating (with hyperparameter tuning w/ grid search)


In [16]:
# Feature Importance:
# Scikit Learn measures feature's importance 
# based on a weighted average of impurity reduction across all trees.
# This is done automatically during training, and is accessible through the "Classifier.feature_importances_" variable

In [17]:
from sklearn.datasets import load_iris
iris = load_iris()

rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)
    
# sepal length (cm) 0.0897066779989337
# sepal width (cm) 0.021374363629974264
# petal length (cm) 0.44751417122303927
# petal width (cm) 0.4414047871480528

sepal length (cm) 0.0897066779989337
sepal width (cm) 0.021374363629974264
petal length (cm) 0.44751417122303927
petal width (cm) 0.4414047871480528


In [18]:
# Boosting:
# Any Ensemble method that can combine several weak learners into a strong learner.
# Train predictors sequentially, each each correcting the predecessor.

In [19]:
# AdaBoost:
# aka "Adaptive Boosting"
# Each new predictor pays attention to training instances that the predecessor underfitted.
# Translation: each new predector focuses more on the "hard" cases.
# One important drawback: Because learning happens in sequence, adaboost cannot be parallelized
# Each predictor requires the outputs of the previous predictor

In [20]:
# Scikit-Learn uses multiclass version of Adaboost called SAMME 
# SAMME -> Stagewise Additive Modeling using a Multiclass Exponential loss function

# if only 2 classes, SAMME is equivalent to AdaBoost

In [21]:
# Decision Stump:
# Decision Tree with max_depth=1
# translation: tree composed of a single decision node plus 2 leaf nodes

In [23]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [24]:
# Gradient Boosting:
# 

In [26]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2)

tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [44]:
print(tree_reg1.predict(X[:3]))
print(y[:3])
print(y[:3] - tree_reg1.predict(X[:3]))

[0.8490314  0.8490314  0.14412811]
[1 1 0]
[ 0.1509686   0.1509686  -0.14412811]


In [27]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [45]:
print(tree_reg2.predict(X[:3]))
print(y2[:3])
print(y2[:3] - tree_reg2.predict(X[:3]))

[ 0.11730562 -0.00432174 -0.00432174]
[ 0.1509686   0.1509686  -0.14412811]
[ 0.03366299  0.15529034 -0.13980638]


In [28]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [46]:
print(tree_reg3.predict(X[:3]))
print(y3[:3])
print(y3[:3] - tree_reg3.predict(X[:3]))

[-0.07482632 -0.07482632  0.06853124]
[ 0.03366299  0.15529034 -0.13980638]
[ 0.10848931  0.23011667 -0.20833762]


In [48]:
# This new -Ensemble makes predictions by adding predictions of all trees
y_pred = sum(tree.predict(X) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [50]:
y_pred[:3]
# array([0.89151069, 0.76988333, 0.20833762])

array([0.89151069, 0.76988333, 0.20833762])

In [53]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [54]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred) 
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=89,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [55]:
# The following code stops training when the validation error does not improve for five iterations in a row:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else: 
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping

In [56]:
# Stacking:
# Prediction aggregation is done with a "blender" model rather than hard/soft voting

In [57]:
# To train a blender:
#   1) Split training data into 2 subsets
#   2) Use 1st subset to train 1st layer of predictors
#   3) Use 1st layer trained predictors to predict values on 2nd subset (hold-out set)
#   4) Create new training set using predicted values as input features, and keep target values
#   5) Train blender model on new training set
#     - Blender learns to predict target value based on 1st layer's predictions


In [None]:
# This can be used to train multiple layers of blenders (by splitting training data into even more subsets)
# e.g. 1st layer creates predictions used by 2nd layer (blenders), 
# which create predictions used by 3rd layer (2nd layer of blenders)