<a href="https://colab.research.google.com/github/lisabroadhead/data_science_machine-learning/blob/main/Ensemble_Trees_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble Trees Exercise
- Lisa Broadhead
- June 29, 2022

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
# Import the bagging regressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Location of the Boston housing CSV. This is an absolute Google Drive path,
# so presumably Drive must be mounted in Colab before this cell runs.
file = '/content/drive/MyDrive/Colab Notebooks/coding_dojo/Machine Learning/files/Boston_Housing_from_Sklearn.csv'

# Read the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head()

Unnamed: 0,CRIM,NOX,RM,AGE,PTRATIO,LSTAT,PRICE
0,0.00632,0.538,6.575,65.2,15.3,4.98,24.0
1,0.02731,0.469,6.421,78.9,17.8,9.14,21.6
2,0.02729,0.469,7.185,61.1,17.8,4.03,34.7
3,0.03237,0.458,6.998,45.8,18.7,2.94,33.4
4,0.06905,0.458,7.147,54.2,18.7,5.33,36.2


In [None]:
# Features are every column except the target; the target is PRICE.
X = df.drop(columns=['PRICE'])
y = df['PRICE']

# Hold out a test set using the default split size; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

# 1. Try a Decision Tree, Bagged Tree, and Random Forest.

2. Tune each model to optimize performance on the test set.
3. Evaluate your best model using multiple regression metrics

### Decision Tree

In [None]:
# Baseline decision tree with no depth limit; the seed makes the fit repeatable.
dc_tree = DecisionTreeRegressor(random_state=42)

In [None]:
# Fit the baseline tree on the training split only.
dc_tree.fit(X_train, y_train)

DecisionTreeRegressor(random_state=42)

In [None]:
# Predictions from the baseline tree for the training and test splits.
train_preds_dc, test_preds_dc = (
    dc_tree.predict(split) for split in (X_train, X_test)
)

In [None]:
# Score the baseline tree on each split (R^2 for scikit-learn regressors).
train_score_dc, test_score_dc = (
    dc_tree.score(X_train, y_train),
    dc_tree.score(X_test, y_test),
)

# Train score first, then test score, one per line.
print(train_score_dc, test_score_dc, sep='\n')

1.0
0.6193230918136841


In [None]:
# Depth reached by the unconstrained tree; a reference point for the
# max_depth search below.
dc_tree.get_depth()

20

In [None]:
# Candidate values for max_depth: every integer from 2 through 35
# (range() excludes its upper bound of 36).
depths = list(range(2, 36))
# One row per candidate depth, holding the score obtained on each split.
scores = pd.DataFrame(index=depths, columns=['Test Score','Train Score'])
for depth in depths:
    dec_tree = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dec_tree.fit(X_train, y_train)
    # Score each split on its own rows. The train score must come from the
    # training data; scoring the test data twice makes both columns identical
    # and hides any overfitting.
    train_score = dec_tree.score(X_train, y_train)
    test_score = dec_tree.score(X_test, y_test)
    scores.loc[depth, 'Train Score'] = train_score
    scores.loc[depth, 'Test Score'] = test_score

In [None]:
# Rank the candidate depths from best to worst test score and show the top five.
sorted_scores = scores.sort_values('Test Score', ascending=False)
sorted_scores.head(5)

Unnamed: 0,Test Score,Train Score
7,0.846377,0.846377
10,0.84601,0.84601
11,0.829736,0.829736
12,0.827102,0.827102
6,0.825985,0.825985


### Bagged Tree

In [None]:
# Rebuild the target and feature matrix from the loaded frame.
X, y = df.drop(columns=['PRICE']), df['PRICE']

# Same seed (42) as the other splits in this notebook, so the train/test
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [None]:
# Bagging ensemble with default settings; only the seed is fixed.
bagreg = BaggingRegressor(random_state = 42)

In [None]:
# Inspect the hyperparameters available for tuning (e.g. n_estimators).
bagreg.get_params()

{'base_estimator': None,
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Fit the default bagging ensemble on the training split.
bagreg.fit(X_train, y_train)

BaggingRegressor(random_state=42)

In [None]:
# Predict prices for the held-out test rows; the cell displays the full array.
bagreg.predict(X_test)

array([24.04, 30.77, 18.39, 24.04, 16.09, 20.4 , 19.13, 15.03, 21.07,
       21.39, 18.87, 19.46,  7.68, 19.44, 18.93, 25.27, 19.06,  7.87,
       44.92, 14.98, 24.01, 23.58, 14.26, 24.63, 14.15, 12.82, 20.67,
       14.2 , 19.37, 20.33, 20.6 , 23.18, 31.23, 21.4 , 13.94, 15.81,
       36.2 , 19.6 , 20.22, 24.6 , 18.88, 25.75, 44.15, 20.44, 22.72,
       14.5 , 14.95, 24.43, 16.76, 28.32, 22.93, 34.44, 15.92, 25.6 ,
       47.42, 22.56, 15.93, 31.44, 21.34, 20.25, 27.03, 33.4 , 27.06,
       19.23, 28.11, 16.18, 14.58, 22.82, 28.31, 16.5 , 19.59, 25.86,
        9.79, 21.71, 21.47,  6.94, 20.52, 46.12, 11.9 , 14.74, 20.45,
       11.14, 20.37,  9.44, 20.4 , 26.58, 16.95, 23.41, 24.52, 17.98,
       23.  ,  7.34, 18.97, 20.09, 26.26, 20.12, 35.89, 11.42, 12.12,
       12.27, 20.1 , 23.  , 11.72, 23.22, 20.29, 15.51, 18.07, 25.05,
       21.62, 23.58,  7.73, 14.05, 21.61, 22.51, 33.87, 12.38, 43.51,
       16.17, 18.67, 24.28, 20.11, 24.72,  8.68, 20.84, 24.5 , 21.74,
       24.2 ])

In [None]:
# Score the default bagging ensemble on each split (R^2 for scikit-learn
# regressors).
bagreg_train_score, bagreg_test_score = (
    bagreg.score(X_train, y_train),
    bagreg.score(X_test, y_test),
)
# Train score first, then test score, one per line.
print(bagreg_train_score, bagreg_test_score, sep='\n')

0.9606756023782893
0.8204208271364619


In [None]:
# Ensemble sizes to compare.
estimators = [10, 20, 30, 40, 50, 100]
# Result table: one row per ensemble size, one column per split.
scores = pd.DataFrame(index=estimators, columns=['Train Score', 'Test Score'])
# Fit one seeded bagging ensemble per size and record how it scores on the
# training and test splits.
for num_estimators in estimators:
    bag_reg = BaggingRegressor(n_estimators=num_estimators, random_state=42)
    bag_reg.fit(X_train, y_train)
    scores.loc[num_estimators, 'Train Score'] = bag_reg.score(X_train, y_train)
    scores.loc[num_estimators, 'Test Score'] = bag_reg.score(X_test, y_test)

In [None]:
# Order the ensemble sizes from best to worst test score and display them all.
scores = scores.sort_values('Test Score', ascending=False)
scores

Unnamed: 0,Train Score,Test Score
40,0.97395,0.834365
50,0.975185,0.83391
100,0.977246,0.833051
20,0.9701,0.831147
30,0.973401,0.830604
10,0.960676,0.820421


### Random Forest

In [None]:
# Target column and feature matrix, rebuilt from the loaded frame.
y = df['PRICE']
X = df.drop(columns=['PRICE'])

# Seeded split (random_state=42), so the train/test partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [None]:
# Random forest with default settings; only the seed is fixed.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Inspect the hyperparameters available for tuning (e.g. max_depth,
# n_estimators).
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Fit the default random forest on the training split.
rf.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [None]:
# Predict prices for the held-out test rows; the cell displays the full array.
rf.predict(X_test)

array([22.986, 31.391, 19.003, 23.141, 16.213, 20.666, 18.768, 15.219,
       21.251, 20.809, 20.253, 20.247,  8.237, 21.228, 19.717, 26.426,
       19.432,  8.497, 46.203, 15.325, 23.637, 23.557, 14.31 , 24.344,
       15.369, 13.575, 21.195, 13.96 , 18.668, 21.416, 19.64 , 23.35 ,
       28.457, 21.533, 14.143, 16.065, 34.532, 19.198, 20.46 , 23.926,
       18.542, 28.025, 45.118, 19.994, 22.885, 14.364, 15.116, 23.797,
       17.815, 28.089, 21.717, 34.018, 16.448, 25.876, 44.673, 21.957,
       16.028, 31.978, 21.921, 20.542, 26.234, 33.55 , 30.222, 19.88 ,
       27.288, 16.302, 14.934, 22.961, 27.268, 17.147, 20.538, 30.51 ,
       10.187, 21.264, 21.262,  7.225, 20.097, 46.97 , 12.082, 13.522,
       22.008, 12.609, 20.435,  8.976, 20.58 , 27.007, 16.026, 23.329,
       24.346, 17.787, 22.135,  7.881, 18.524, 20.042, 25.241, 19.298,
       32.793, 13.215, 12.961, 12.98 , 19.742, 24.277, 13.176, 20.387,
       21.179, 14.004, 19.233, 24.822, 20.402, 24.114,  9.165, 14.91 ,
      

In [None]:
# Score the default random forest on each split (R^2 for scikit-learn
# regressors).
train_score_rf, test_score_rf = (
    rf.score(X_train, y_train),
    rf.score(X_test, y_test),
)

# Train score first, then test score, one per line.
print(train_score_rf, test_score_rf, sep='\n')

0.9771342521069045
0.8338530730048258


In [None]:
# Depth of every individual tree in the fitted forest; the largest one bounds
# the max_depth search that follows.
est_depths = list(map(lambda tree: tree.get_depth(), rf.estimators_))
max(est_depths)

23

In [None]:
# Candidate max_depth values: 1 up to (but not including) the depth of the
# deepest tree in the unconstrained forest.
depths = range(1, max(est_depths))
# Declare both score columns up front instead of relying on the loop to add
# 'Train Score' implicitly.
scores = pd.DataFrame(index=depths, columns=['Test Score', 'Train Score'])
for depth in depths:
    # Seed every forest so the search is reproducible and the comparison
    # between depths is not blurred by run-to-run randomness.
    model = RandomForestRegressor(max_depth=depth, random_state=42)
    model.fit(X_train, y_train)
    scores.loc[depth, 'Train Score'] = model.score(X_train, y_train)
    scores.loc[depth, 'Test Score'] = model.score(X_test, y_test)
# Preview the collected scores once, after the search has finished (inside the
# loop the expression was evaluated and discarded on every iteration).
scores.head()

In [None]:
# Rank the candidate depths from best to worst test score and show the top five.
sorted_scores = scores.sort_values('Test Score', ascending=False)
sorted_scores.head(5)

Unnamed: 0,Test Score,Train Score
9,0.836256,0.972072
21,0.833235,0.97565
16,0.830844,0.973589
13,0.828456,0.975958
12,0.827126,0.974848


## 4. Explain in a text cell how your model will perform if deployed by referring to the metrics. 

The best model is the Random Forest. It is better than the bagged tree because its test scores are better overall (a best test score of about 0.836 at `max_depth=9`, versus about 0.834 for the bagged tree with 40 estimators). This means it's learning from data with a lot of variation and still achieving a really high prediction rate.

Whereas the Decision Tree is getting a really high training score (1.0), but its test score is much lower (about 0.62), which means it is overfitting the training data.