In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# initialize predictors
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

In [3]:
# make moon data
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
# make a hard voting ensemble classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [5]:
# check accuracy on test set 
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.896
VotingClassifier 0.904


In [6]:
# initialize predictors such that they can compute probabilities
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

In [7]:
# make a soft voting ensemble classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

In [8]:
# check accuracy on test set 
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.904


# Bagging and pasting

In [9]:
from sklearn.ensemble import BaggingClassifier  # alternatively BaggingRegressor
from sklearn.tree import DecisionTreeClassifier

In [10]:
# initialize and fit a bagging classifier with trees
# if the base_estimator can compute probabilities, the bagging classifier uses soft voting
bag_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,  # if set to False, it becomes "pasting"
    n_jobs=-1  # -1 means use all available cores
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [11]:
# initialize and fit a bagging classifier with trees and that computes out-of-bag performance
bag_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,  # if set to False, it becomes "pasting"
    n_jobs=-1,  # -1 means use all available cores
    oob_score=True
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.9013333333333333

In [12]:
# the oob score is an estimation of how the model will perform in the test set
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.896

# Random forests

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
# train a random forest
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1
    )
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)

In [15]:
# the RandomForestClassifier class is roughly equivalent to this
bag_clf = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(splitter='random', max_leaf_nodes=16),
    n_estimators=500, 
    max_samples=1.0,
    bootstrap=True,
    n_jobs=-1
)

# Feature Importance

In [16]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.09296115300343674
sepal width (cm) 0.025494199316543405
petal length (cm) 0.4231766133895896
petal width (cm) 0.45836803429043027


# AdaBoost

In [17]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=1), 
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5
    )
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

# Gradient Boost

In [18]:
# make fake quadratic data
import numpy as np
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0]**2 + 0.05 * np.random.randn(100)


In [19]:
from sklearn.tree import DecisionTreeRegressor
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [20]:
# compute the residue and train a new tree to predict it
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [21]:
# and again
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [22]:
# the final prediction is the sum of all predictions
X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [23]:
# do the same as before but with a single ensemble predictor
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

# Shrinkage

In [24]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

In [26]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [27]:
# Find the number of trees that result in the minimum validation set error
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors) + 1
bst_n_estimators

85

In [28]:
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=85)

# Early Stopping and Warm Up Training

In [29]:
# warm_start ensures that when more estimators are added, only the new ones are fitted
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)  
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators  # implicitly adds more untrained trees
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:  # If the error goes up more than 5 times, then we have found the minimum
            break  # early stopping

# XGBoost

In [31]:
import xgboost

In [36]:
# make fake quadratic data
import numpy as np
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0]**2 + 0.05 * np.random.randn(100)

In [37]:
X_train, X_val, y_train, y_val = train_test_split(X, y)

In [38]:
xgb_reg = xgboost.XGBRegressor(random_state=42)
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)

In [42]:
xgb_reg = xgboost.XGBRegressor(random_state=42)
xgb_reg.fit(X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.21512
[1]	validation_0-rmse:0.16304
[2]	validation_0-rmse:0.12217
[3]	validation_0-rmse:0.09683
[4]	validation_0-rmse:0.08110
[5]	validation_0-rmse:0.06992
[6]	validation_0-rmse:0.06311
[7]	validation_0-rmse:0.05919
[8]	validation_0-rmse:0.05735
[9]	validation_0-rmse:0.05613
[10]	validation_0-rmse:0.05612
[11]	validation_0-rmse:0.05593
[12]	validation_0-rmse:0.05599
[13]	validation_0-rmse:0.05610


