# Bagging Trees

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from skompiler import skompile
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings
filterwarnings('ignore')

In [5]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
bag_model = BaggingRegressor(bootstrap_features = True)
bag_model.fit(X_train, y_train)

BaggingRegressor(bootstrap_features=True)

In [7]:
bag_model.n_estimators

10

In [9]:
bag_model.estimators_ # bunlarin hepsi birbirinden farkli agaclar

[DecisionTreeRegressor(random_state=1962488388),
 DecisionTreeRegressor(random_state=1727490216),
 DecisionTreeRegressor(random_state=1819456416),
 DecisionTreeRegressor(random_state=790982416),
 DecisionTreeRegressor(random_state=1765918436),
 DecisionTreeRegressor(random_state=1064550299),
 DecisionTreeRegressor(random_state=1241523460),
 DecisionTreeRegressor(random_state=346301746),
 DecisionTreeRegressor(random_state=626498946),
 DecisionTreeRegressor(random_state=1089658259)]

In [11]:
bag_model.estimators_samples_ #Her bir agacdaki örnekler

[array([ 31,  97, 140,  67,   7, 151,  61,  68, 112, 124,  28, 195, 102,
        189,  30,  83, 121,  39, 128,   1, 143,  36, 100,  75,  45, 140,
        140,  34, 159,  30, 165,  12,  34,  31, 119,  11, 113,  23, 148,
        132,  98,  23, 107, 114, 185,  13, 145,  12, 164,  82, 187, 136,
        136,  75, 157, 179, 156, 123,  70,  91,  43,  84, 128,  92,  94,
         78, 125, 180,  30,  17,  72,  10, 152,  71, 186, 179, 133, 107,
         58, 127,  59, 177, 189,  27,  65,  93,  63, 163,  53, 187,  55,
        166, 121, 141, 105,  88, 114, 105, 126, 154, 101, 125, 102, 147,
        121, 125, 145, 131,  40, 152, 144, 144,   9, 102, 102,  75, 160,
        132, 166,  18, 160,  19, 189,   1,  16,  67,  90,  40,  42, 186,
        152,   8, 166,   1,  59, 174, 157,  44,  69, 185,  70, 166,  27,
         42,  63,  85, 172, 183,  81,  59, 122,  13,  90, 175, 129, 141,
         61,  60, 173, 171, 126,  90,  61, 185,   0, 176, 128,  96,  83,
         25,  94, 121,   6,  81, 123, 129,  69,   9

In [13]:
bag_model.estimators_features_ #Butun agaclarin kullandigi degiskenler ve indexleri

[array([ 4, 13, 17,  6,  8, 17,  2,  9, 12,  0,  1,  1, 11,  7,  8, 15, 15,
        15,  0]),
 array([ 8,  4, 15, 17, 11,  1, 15,  4, 11,  3, 16, 16, 10, 16,  4, 18, 15,
        10,  1]),
 array([ 0, 13,  4, 16,  2,  3, 14, 11,  2,  1,  4, 17, 16,  1,  8, 15, 14,
         1, 16]),
 array([16, 14,  9, 15, 10, 18,  8, 12,  7,  7,  2, 12,  7, 13,  2,  7, 13,
        17,  9]),
 array([ 4, 17,  1, 13, 12, 11,  0, 12,  6, 14,  2,  6,  7,  7,  6, 10, 13,
         1,  7]),
 array([ 4,  8,  9, 11,  7,  1,  5,  3, 13,  8,  9, 17, 11, 10, 11, 15, 11,
        14, 17]),
 array([ 4, 18,  2,  2,  5,  8, 15, 12, 11, 18,  2, 13,  9, 15,  3,  2,  1,
        14, 17]),
 array([18, 13, 12, 18, 16,  8, 14,  2,  8,  9, 10, 11, 17,  5, 13,  4, 13,
        10,  6]),
 array([ 2, 12,  1, 16,  0, 18, 13, 13,  1, 16,  2,  5, 11,  3, 10,  4,  2,
         0, 13]),
 array([ 3,  9, 15,  8, 13,  4, 16,  9,  4, 14,  4, 11, 11, 16, 18,  7,  8,
         4, 14])]

# Tahmin

In [16]:
y_pred = bag_model.predict(X_test)

In [17]:
np.sqrt(mean_squared_error(y_test, y_pred))

330.3855619389633

In [21]:
iki_y_pred = bag_model.estimators_[1].fit(X_train, y_train).predict(X_test)

In [22]:
np.sqrt(mean_squared_error(y_test, iki_y_pred))

473.0130992033942

In [23]:
yedi_y_pred = bag_model.estimators_[4].fit(X_train, y_train).predict(X_test)

In [24]:
np.sqrt(mean_squared_error(y_test, yedi_y_pred))

516.6793429895725

# Model Tuning

In [25]:
bag_model = BaggingRegressor(bootstrap_features = True)
bag_model.fit(X_train, y_train)

BaggingRegressor(bootstrap_features=True)

In [26]:
bag_params = {"n_estimators": range(2,20)}

In [27]:
bag_cv_model = GridSearchCV(bag_model, bag_params, cv = 10)

In [28]:
bag_cv_model.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=BaggingRegressor(bootstrap_features=True),
             param_grid={'n_estimators': range(2, 20)})

In [29]:
bag_cv_model.best_params_

{'n_estimators': 16}

In [30]:
bag_tuned = BaggingRegressor( n_estimators = 14, random_state = 45)

In [31]:
bag_tuned.fit(X_train, y_train)

BaggingRegressor(n_estimators=14, random_state=45)

In [32]:
y_pred = bag_tuned.predict(X_test)

In [33]:
np.sqrt(mean_squared_error(y_test, y_pred))

346.457987188104