## Bagged Trees Regression

In [47]:
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import scale, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn import model_selection
from skompiler import skompile
from sklearn.svm import SVR
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from warnings import filterwarnings
filterwarnings('ignore')

In [48]:
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    random_state=42)


In [49]:
bag_model = BaggingRegressor(bootstrap_features = True)
bag_model.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=True,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [50]:
bag_model.n_estimators

10

In [51]:
bag_model.estimators_

[DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1222928401, splitter='best'),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1126452366, splitter='best'),
 DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_de

In [52]:
bag_model.estimators_samples_

[array([  8, 147,  95, 157,   7,  71, 111, 174,  30, 134, 108,  48, 145,
        146, 102, 184,  35, 170,  22,  15,  71, 108,  21,  35, 152, 140,
        171,   7,  84, 178, 134, 149, 196, 188,  35,  70, 133, 114,  35,
         92, 177,  31,  84,  56, 195,  99,  40,  60,  29, 156,  31, 139,
         33,  32, 166,  97, 127,  98, 184, 106,  63, 114,   7,  61,  44,
        178, 112,  81, 174,  80, 121, 115, 162,  61,  51, 139, 165,  31,
         46,  13, 112, 196,  18,  29, 117,  56, 164,  94,  73, 169, 123,
         42, 107,  56,  38,  20, 133, 137, 163,  29,  82,  41, 182,  46,
        100,  41,  42, 128,  65,  62, 139,  21,  16, 155,  52,  36,  96,
        120,  97,  35,  57, 176,  10, 120, 125,  56,  19,  75, 127, 191,
        153, 190, 184, 138,  77,  10,  25, 130,  69,  40,  11,  33, 141,
         98, 193,  52,  61,  23,  66, 163, 102, 149, 130, 166, 166, 126,
         89,  34, 134,  82, 154, 162,   1,  62,   8, 167, 180,  42, 178,
         22, 116,  23, 171, 135,  48,  23,  18,  11

In [53]:
bag_model.estimators_features_

[array([ 8,  4, 15, 16, 15,  3,  9,  3,  8,  0,  0,  8, 10, 15, 10, 16, 11,
        17,  8]),
 array([16,  8, 14, 13,  2,  8, 18,  4, 15,  6,  1,  1, 15, 15,  0, 17, 12,
         2, 18]),
 array([ 0,  0, 10, 10,  7,  2,  6, 16, 12, 13,  4, 18, 12, 17, 18, 18,  0,
        10,  7]),
 array([11,  3,  3,  4, 14, 13, 13,  2,  4, 11,  4,  1,  7, 12, 16, 10,  9,
        15, 15]),
 array([13,  2,  3, 15,  7,  0,  1, 18, 14, 12, 17, 16, 11,  5,  2,  3,  7,
         1, 11]),
 array([11, 14,  8,  4,  8, 13, 12,  6,  6, 18,  0, 11, 12,  4,  6,  2,  8,
         6,  4]),
 array([14,  5,  3, 17,  2,  4,  3,  1,  9, 12,  2,  5,  6,  4,  7, 11, 18,
        10,  5]),
 array([10,  2,  6, 17,  5,  5,  5, 11, 17, 13, 15, 16, 13,  8,  9,  3,  5,
         8, 18]),
 array([ 2, 15,  9, 10, 15,  2, 16,  1, 10, 17,  5,  7, 15, 10, 10,  8, 16,
         9,  1]),
 array([14, 16, 10, 11,  8,  3,  6,  4,  9, 13, 13,  0,  9, 12, 16,  7,  8,
        10,  3])]

In [54]:
bag_model.estimators_[1]

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=1126452366, splitter='best')

### Predict

In [55]:
y_pred = bag_model.predict(X_test)

In [56]:
np.sqrt(mean_squared_error(y_test, y_pred))

331.89452843908657

In [57]:
second_y_pred = bag_model.estimators_[1].fit(X_train, y_train).predict(X_test)

In [58]:
np.sqrt(mean_squared_error(y_test, second_y_pred))

467.7763985633942

### Model Tuning

In [59]:
bag_model = BaggingRegressor(bootstrap_features = True)
bag_model.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=True,
                 max_features=1.0, max_samples=1.0, n_estimators=10,
                 n_jobs=None, oob_score=False, random_state=None, verbose=0,
                 warm_start=False)

In [60]:
bag_params = {"n_estimators": range(2,20)}

In [61]:
bag_cv_model = GridSearchCV(bag_model, bag_params, cv = 10).fit(X_train, y_train)

In [62]:
bag_cv_model.best_params_

{'n_estimators': 14}

In [67]:
bag_tuned = BaggingRegressor(n_estimators = 14, random_state = 45)

In [68]:
bag_tuned.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True, bootstrap_features=False,
                 max_features=1.0, max_samples=1.0, n_estimators=14,
                 n_jobs=None, oob_score=False, random_state=45, verbose=0,
                 warm_start=False)

In [69]:
y_pred = bag_tuned.predict(X_test)

In [70]:
np.sqrt(mean_squared_error(y_test, y_pred))

346.97035613871606