# Bagging:  Bootstrap Aggregation 

## Bagging (explicitly)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model
import sklearn.tree
import sklearn.metrics
import sklearn.model_selection

## First: create a dataset to explore

We will start by looking at a very simple set of data with a very simple linear relationship.

Bagging will not give a better result than simple linear regression.  However, this will serve as an easy-to-understand example and we will apply bagging in more appropriate contexts afterwards.

In [None]:
# Independent variable: 100 evenly spaced points covering [0, 10].
x = np.linspace(start=0, stop=10, num=100)

In [None]:
# Seed NumPy's global generator so the noise below is reproducible.
np.random.seed(42)

# Gaussian noise with mean 0 and standard deviation 1.5, one draw per point
# of x.  The size is taken from x rather than a literal 100 so the two arrays
# stay aligned if the grid is changed.
noise = np.random.normal(0, 1.5, x.shape)

# Target values: the line y = x with the noise added.
y = x + noise

In [None]:
# Show the noisy data as black circles with no connecting line.
plt.plot(x, y, color='k', marker='o', linestyle='None')

## Straightforward linear regression

In [None]:
# Split into training and test sets.  x is turned into a single-column 2-D
# array first; no test size is passed, so the library default applies, and
# the seed is pinned for a repeatable split.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
    x[:, np.newaxis],
    y,
    random_state=42,
)

In [None]:
# Baseline model: one linear regression trained on the whole training split.
# fit() returns the estimator, so construction and training can be chained.
lin_reg = sklearn.linear_model.LinearRegression().fit(x_train, y_train)

# Evaluate on the held-out test split.
print('R2 score: ', lin_reg.score(x_test, y_test))

In [None]:
# Training points in black and test points in blue, drawn in one call.
plt.plot(x_train, y_train, 'ko', x_test, y_test, 'bo')

# The fitted model is a straight line, so its predictions at the two ends of
# the x range are enough to draw it.
plt.plot([0, 10], lin_reg.predict([[0], [10]]))

## Bootstrap samples and aggregation

Use bootstrapping to get training samples, then take the average of the regressors' predictions for any given sample.

We will select `k` samples with replacement, train our linear regression algorithm, and then repeat the process `t` times.

In [None]:
# k: number of points in each bootstrap sample.
# t: number of bootstrap samples (and therefore of trained models).
k, t = 5, 5

Since we need to select an element from `x_train` and `y_train` in tandem, we make choices among the list of indices.

In [None]:
# Candidate row indices into the training set: 0 .. len(x_train) - 1.
ix = range(len(x_train))

In [None]:
# Example draw of k indices (displayed only; note it still advances the
# global random state).
np.random.choice(ix, size=k)

In [None]:
# One bootstrap sample of indices: k draws with replacement, so the same
# index may appear more than once.
newix = np.random.choice(ix, size=k, replace=True)

In [None]:
# Display the sampled features and targets; indexing both arrays with the
# same indices keeps each x paired with its y.
x_train[newix], y_train[newix]

Here's where we repeat the training `t` times on samples with `k` elements chosen from `(x_train, y_train)`

In [None]:
# Train t linear regressors, each on its own bootstrap sample of k points.
lin_regs = []
for _ in range(t):
    # Fresh set of k training indices for this model.
    newix = np.random.choice(ix, k)

    # fit() returns the estimator, so the trained model is appended directly.
    lin_regs.append(
        sklearn.linear_model.LinearRegression().fit(x_train[newix], y_train[newix])
    )

We plot all of the linear fits:

In [None]:
# All data points in black.
plt.plot(x, y, 'ko')

# Each fit is a straight line, so predictions at the two ends of the x range
# are enough to draw it.
x_edge = [[0], [10]]

# Loop over the fitted models themselves rather than over range(t): the plot
# then always matches what was actually trained, even if t has been changed
# since the training cell was run.
for model in lin_regs:
    plt.plot(x_edge, model.predict(x_edge))

When looking at predictions, we take an average over all the predictors.

In [None]:
# All data points in black.
plt.plot(x, y, 'ko')

# Evaluation grid for the aggregated prediction.
xnew = np.linspace(0, 10, 100)

# Bagged prediction: each model predicts the whole grid in one call (the grid
# is reshaped to a single-column 2-D array for predict), and the predictions
# are averaged across models (axis 0).  This replaces one predict call per
# model per grid point.
ynew = np.mean([model.predict(xnew.reshape(-1, 1)) for model in lin_regs], axis=0)

# Bagged prediction in blue, the single full-data linear fit in green.
plt.plot(xnew, ynew, 'b')
plt.plot([0, 10], lin_reg.predict([[0], [10]]), 'g')

In [None]:
# Bagged prediction on the test set: every model predicts all test points in
# a single call, and the results are averaged across models (axis 0) instead
# of looping over the test points one by one.
y_pred = np.mean([model.predict(x_test) for model in lin_regs], axis=0)

r2score = sklearn.metrics.r2_score(y_test, y_pred)
print('R2 score: ', r2score)

The above is very good, considering we only trained 5 times with 5 samples each time.

Repeat the above for other values of `k` and `t`.

# Decision Trees

We are going to repeat this process now using Decision Trees instead of Linear Regression.

First, we train one decision tree to see how it does.

In [None]:
# Single decision tree with default settings, trained on the whole training
# split.  fit() returns the estimator, so construction and training chain.
tree_reg = sklearn.tree.DecisionTreeRegressor().fit(x_train, y_train)

# Evaluate on the held-out test split.
print('R2 score: ', tree_reg.score(x_test, y_test))

In [None]:
# Training points in black and test points in blue, drawn in one call.
plt.plot(x_train, y_train, 'ko', x_test, y_test, 'bo')

# Dense single-column grid on which to draw the tree's prediction.
xnew = np.linspace(0, 10, 1000)[:, np.newaxis]
ynew = tree_reg.predict(xnew)
plt.plot(xnew, ynew)

The decision tree here very much overfits to the training data.

We will see whether this can be improved by using bootstrapping.

In [None]:
# k: number of points in each bootstrap sample.
# t: number of bootstrap samples (and therefore of trained trees).
k, t = 5, 5

In [None]:
# Train t decision trees, each on its own bootstrap sample of k points.
tree_regs = []
for _ in range(t):
    # Fresh set of k training indices for this tree.
    newix = np.random.choice(ix, k)

    # fit() returns the estimator, so the trained tree is appended directly.
    tree_regs.append(
        sklearn.tree.DecisionTreeRegressor().fit(x_train[newix], y_train[newix])
    )

In [None]:
# Training points in black, test points in blue.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')

# Dense single-column grid on which to draw each tree's prediction.
x_edge = np.linspace(0, 10, 1000).reshape(-1, 1)

# Loop over the fitted trees themselves rather than over range(t): the plot
# then always matches what was actually trained, even if t has been changed
# since the training cell was run.
for tree in tree_regs:
    plt.plot(x_edge, tree.predict(x_edge))

The above shows the various decision trees trained on only `k` points.

For our aggregate prediction, we average the predictions over all of the `t` predictors.

In [None]:
# Training points in black, test points in blue.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')

# Dense single-column grid for the aggregated prediction.
xnew = np.linspace(0, 10, 1000).reshape(-1, 1)

# Bagged prediction: each tree predicts the whole grid in one call, and the
# predictions are averaged across trees (axis 0).  This replaces one predict
# call per tree per grid point.
ynew = np.mean([tree.predict(xnew) for tree in tree_regs], axis=0)

plt.plot(xnew, ynew)

In [None]:
# Bagged prediction on the test set: every tree predicts all test points in
# a single call, and the results are averaged across trees (axis 0) instead
# of looping over the test points one by one.
y_pred = np.mean([tree.predict(x_test) for tree in tree_regs], axis=0)

r2score = sklearn.metrics.r2_score(y_test, y_pred)
# Labelled like the other score printouts in this notebook.
print('R2 score: ', r2score)

This model outperforms the single decision tree!

## Bagging (with Scikit-Learn)

## BaggingRegressor with Linear Regression

Scikit-Learn's BaggingRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html

In [None]:
from sklearn.ensemble import BaggingRegressor

In [None]:
# Be careful, by the way, about using Regressor vs Classifier (below we'll also use Classifiers)

# Same setup as the hand-written bagging above: 5 linear regressors, each
# trained on 5 points drawn with replacement.  random_state is pinned, as in
# the random-forest cells, so the result does not depend on how many random
# draws earlier cells have consumed.
bag_reg = BaggingRegressor(
    sklearn.linear_model.LinearRegression(),
    n_estimators=5,
    max_samples=5,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

bag_reg.fit(x_train, y_train)

# Score the ensemble on the held-out test split.
y_pred = bag_reg.predict(x_test)
print(type(bag_reg).__name__, sklearn.metrics.r2_score(y_test, y_pred))

# Training points in black, test points in blue.
plt.plot(x_train, y_train, 'ko')
plt.plot(x_test, y_test, 'bo')

# Bagged model in green, the single full-data linear fit in red.
plt.plot([0, 10], bag_reg.predict([[0], [10]]), 'g')
plt.plot([0, 10], lin_reg.predict([[0], [10]]), 'r')

Try tinkering around with the number of estimators and the number of samples.

Note: max_samples is the integer number of samples if you specify an integer, but it is the fraction of the total number of data points if you specify a float.

## Random Forest

Scikit-Learn's RandomForestRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

* once you start becoming familiar with the ideas behind the algorithm, try exploring the documentation detailing the input parameters

In [None]:
# Import the submodule explicitly.  Without this, `sklearn.ensemble` resolves
# only because an earlier cell ran `from sklearn.ensemble import ...`.
import sklearn.ensemble

# Random forest with default settings apart from parallelism and a fixed seed.
rf_reg = sklearn.ensemble.RandomForestRegressor(n_jobs=-1,
                                                random_state=42)
rf_reg.fit(x_train, y_train)

# Evaluate on the held-out test split.
print('R2 score: ', rf_reg.score(x_test, y_test))

In [None]:
# Training points in black and test points in blue, drawn in one call.
plt.plot(x_train, y_train, 'ko', x_test, y_test, 'bo')

# Dense single-column grid on which to draw the forest's prediction.
xnew = np.linspace(0, 10, 1000)[:, np.newaxis]
ynew = rf_reg.predict(xnew)
plt.plot(xnew, ynew)

In [None]:
# Same forest as before, but with every tree limited to depth 3.
# fit() returns the estimator, so construction and training can be chained.
rf_reg = sklearn.ensemble.RandomForestRegressor(
    max_depth=3,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)

# Evaluate on the held-out test split.
print('R2 score: ', rf_reg.score(x_test, y_test))

In [None]:
# Training points in black and test points in blue, drawn in one call.
plt.plot(x_train, y_train, 'ko', x_test, y_test, 'bo')

# Dense single-column grid on which to draw the depth-limited forest.
xnew = np.linspace(0, 10, 1000)[:, np.newaxis]
ynew = rf_reg.predict(xnew)
plt.plot(xnew, ynew)

A random forest is a collection of trees, so we can use our tree visualization methods to look at them.

In [None]:
# Number of individual trees held by the fitted forest.
len(rf_reg.estimators_)

In [None]:
import ipywidgets


def outtree(n):
    """Print the text rendering of the n-th tree in the fitted forest."""
    print(sklearn.tree.export_text(rf_reg.estimators_[n]))


# Interactive control for choosing which tree to print; one choice per
# estimator index.
ipywidgets.interactive(outtree, n=range(len(rf_reg.estimators_)))

In [None]:
def outtree(n):
    """Draw the n-th tree in the fitted forest as a diagram.

    This reuses the name of the text-printing helper defined in the
    previous cell and replaces it.
    """
    plt.figure(figsize=(12, 8))
    sklearn.tree.plot_tree(
        rf_reg.estimators_[n],
        feature_names=['x'],
        filled=True,
    )


# Interactive control for choosing which tree to draw; one choice per
# estimator index.
ipywidgets.interactive(outtree, n=range(len(rf_reg.estimators_)))