## Ejemplos de ensambles

A continuación procedemos con un ejemplo básico de bagging para árboles de regresión, en el que comparamos el estimador de un único árbol de decisión con el estimador bagging correspondiente. Usamos la métrica $R^2$ (coeficiente de determinación) para comparar los resultados.

In [1]:
""" Bagging Example """
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Seed NumPy's global RNG so the bootstrap index draws below are reproducible.
np.random.seed(100)

# create regression problem
n_points = 1000 # points
x, y = make_friedman1(n_samples=n_points, n_features=15,
                      noise=1.0, random_state=100)

# split to train/test set
x_train, x_test, y_train, y_test = \
        train_test_split(x, y, test_size=0.33, random_state=100)
n_train = len(x_train)

# baseline: a single regression tree, evaluated on the test set
regTree = DecisionTreeRegressor(random_state=100)
regTree.fit(x_train, y_train)
yhat = regTree.predict(x_test)

# Bagging construction: one tree per bootstrap resample of the training set
n_estimators = 500
bag = np.empty((n_estimators), dtype=object)
bootstrap_ds_arr = np.empty((n_estimators), dtype=object)
for i in range(n_estimators):
    # sample a bootstrapped dataset (row indices drawn with replacement)
    ids = np.random.choice(range(0, n_train), size=n_train, replace=True)

    # keep the distinct in-bag row indices; needed for the OOB estimate
    bootstrap_ds_arr[i] = np.unique(ids)

    bag[i] = DecisionTreeRegressor()
    bag[i].fit(x_train[ids], y_train[ids])

# bagging prediction: average of the individual tree predictions
yhatbag = np.zeros(len(y_test))
for tree in bag:
    yhatbag = yhatbag + tree.predict(x_test)
yhatbag = yhatbag/n_estimators

# out of bag loss estimation
# oob_mask[b, j] is True when training row j was NOT drawn for tree b.
oob_mask = np.ones((n_estimators, n_train), dtype=bool)
for b in range(n_estimators):
    oob_mask[b, bootstrap_ds_arr[b]] = False
# number of trees for which each training row is out-of-bag
oob_counts = oob_mask.sum(axis=0)

# Each row's OOB prediction is the mean over the trees that never saw it.
# One batched predict per tree replaces a predict call per (row, tree) pair;
# contributions are still added in tree order, already divided by the count.
oob_pred_arr = np.zeros(n_train)
for b in range(n_estimators):
    rows = oob_mask[b]
    if rows.any():
        oob_pred_arr[rows] += bag[b].predict(x_train[rows]) / oob_counts[rows]

# Rows that were in-bag for every tree have no OOB prediction; leave them out
# of the score instead of scoring a placeholder 0.
has_oob = oob_counts > 0
L_oob = r2_score(y_train[has_oob], oob_pred_arr[has_oob])

print("DecisionTreeRegressor R^2 score = ",r2_score(y_test, yhat),  
      "\nBagging R^2 score = ", r2_score(y_test, yhatbag),
      "\nBagging OOB R^2 score = ",L_oob)

DecisionTreeRegressor R^2 score =  0.5676142169432943 
Bagging R^2 score =  0.8040380122022653 
Bagging OOB R^2 score =  0.8208564723226458


Vamos ahora a observar un ensamble con Random Forest

In [2]:
""" Bagging Example RF """
from sklearn.datasets import make_friedman1
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

# Synthetic regression data: 1000 samples, 15 features, Gaussian noise 1.0
n_points = 1000 # points
x, y = make_friedman1(
    n_samples=n_points,
    n_features=15,
    noise=1.0,
    random_state=100,
)

# Hold out 33% of the samples for testing
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=100
)

# Random forest of 500 trees, 8 candidate features per split; oob_score=True
# makes the fitted model expose the out-of-bag R^2 as rf.oob_score_
rf = RandomForestRegressor(
    n_estimators=500,
    oob_score=True,
    max_features=8,
    random_state=100,
)
rf.fit(x_train, y_train)

# Test-set predictions and their R^2
yhatrf = rf.predict(x_test)
rf_test_r2 = r2_score(y_test, yhatrf)

print("RF R^2 score = ", rf_test_r2,
      "\nRF OOB R^2 score = ", rf.oob_score_)

RF R^2 score =  0.8098070870916778 
RF OOB R^2 score =  0.8254584737683683


En el siguiente ejemplo utilizamos Gradient Boosting para regresión sobre el mismo conjunto de datos.

In [3]:
""" Gradient Boosting Regressor """
import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# boosting sklearn
from sklearn.ensemble import GradientBoostingRegressor

# Synthetic regression data: 1000 samples, 15 features, Gaussian noise 1.0
n_points = 1000 # points
x, y = make_friedman1(
    n_samples=n_points,
    n_features=15,
    noise=1.0,
    random_state=100,
)

# Hold out 33% of the samples for testing
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=100
)

# 100 boosting stages of depth-3 trees with learning rate 0.1
breg = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    random_state=100,
)
breg.fit(x_train, y_train)

# Test-set predictions and their R^2
yhat = breg.predict(x_test)
gb_test_r2 = r2_score(y_test, yhat)
print("Gradient Boosting R^2 score = ", gb_test_r2)

Gradient Boosting R^2 score =  0.8992706169055638


Finalmente observamos un ejemplo con AdaBoost para clasificación

In [None]:
""" AdaBoost Example """
from sklearn.datasets import make_blobs
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import zero_one_loss
import numpy as np


def ExponentialLoss(y,yhat):
    """Return the mean exponential loss, mean(exp(-y * yhat)).

    Args:
        y: Sequence or array of true labels (the AdaBoost example passes
            labels in {-1, +1}).
        yhat: Sequence or array of real-valued scores with the same length
            as ``y``.

    Returns:
        The loss averaged over the first axis (a scalar for 1-D inputs).

    Raises:
        ValueError: If the inputs are empty or their lengths differ.
    """
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    if y.ndim == 0 or yhat.ndim == 0:
        raise ValueError("y and yhat must be sequences, not scalars")
    if len(y) != len(yhat):
        # A longer yhat used to be silently truncated to len(y).
        raise ValueError(
            f"y and yhat must have the same length, got {len(y)} and {len(yhat)}"
        )
    if len(y) == 0:
        raise ValueError("cannot compute the exponential loss of empty input")
    return np.mean(np.exp(-y * yhat), axis=0)

# create binary classification problem
np.random.seed(100)

n_points = 100 # points
x, y = make_blobs(n_samples=n_points, n_features=5, centers=2,
                  cluster_std=20.0, random_state=100)
y[y==0] = -1  # relabel the two classes as {-1, +1}

# AdaBoost implementation
BoostingRounds = 1000
n = len(x)
W = 1/n*np.ones(n)  # start from uniform sample weights

Learner = []       # fitted weak learners, one per round
alpha_b_arr = []   # vote weight of each weak learner

# Keep the weighted error strictly inside (0, 1): a stump with zero (or total)
# weighted error would otherwise give log(inf) / division by zero and turn
# every weight into NaN on the next update.
ERR_EPS = 1e-10

for b in range(BoostingRounds):
    # weak learner: a decision stump fitted on the current sample weights
    clf = DecisionTreeClassifier(max_depth=1)
    clf.fit(x, y, sample_weight=W)
    Learner.append(clf)

    train_pred = clf.predict(x)

    # weighted misclassification rate of this round's learner
    err_b = np.sum(W[train_pred != y]) / np.sum(W)
    err_b = np.clip(err_b, ERR_EPS, 1 - ERR_EPS)

    alpha_b = 0.5*np.log((1-err_b)/err_b)
    alpha_b_arr.append(alpha_b)

    # Up-weight misclassified samples (y*pred = -1), down-weight correct ones,
    # then renormalize so the weights do not shrink towards zero over rounds.
    W = W*np.exp(-alpha_b*y*train_pred)
    W = W/np.sum(W)

# ensemble score: alpha-weighted sum of the weak learners' votes
yhat_boost = np.zeros(len(y))
for alpha_b, clf in zip(alpha_b_arr, Learner):
    yhat_boost = yhat_boost + alpha_b*clf.predict(x)

# predicted label is the sign of the score (ties at 0 go to +1)
yhat = np.where(yhat_boost >= 0, 1.0, -1.0)
print("AdaBoost Classifier exponential loss = ", ExponentialLoss(y, yhat_boost)) 
print("AdaBoost Classifier zero-one loss = ", zero_one_loss(y,yhat) ) 