In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from sklearn.datasets import load_boston
boston = load_boston()
X = boston.data
y = boston.target

In [19]:
rf = RandomForestRegressor(random_state=2020)
rf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=2020,
                      verbose=0, warm_start=False)

In [20]:
rf.fit(X, y)
# Random forests just split your data in random, uncorrelated ways



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=2020,
                      verbose=0, warm_start=False)

In [26]:
rf.estimators_

[DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=2088543072, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       presort=False, random_state=639299976, splitter='best'),
 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2,

In [21]:
rf.estimators_[0].predict(X[:1])

array([29.4])

In [22]:
[tree.predict(X[:1]) for tree in rf.estimators_]

[array([29.4]),
 array([24.]),
 array([24.]),
 array([30.1]),
 array([24.]),
 array([24.]),
 array([24.]),
 array([24.]),
 array([24.]),
 array([24.6])]

In [23]:
np.mean([tree.predict(X[:1]) for tree in rf.estimators_])

25.21

In [25]:
rf.predict(X[:1])

# When you predict with the random forest
# you're just predicting each individual decision tree and averaging their values together
# Just takes each individual tree, makes the prediction, and then averages them all together
# Decision trees are weak predictors in useful ways

array([25.21])

In [29]:
# How do you tune a random forest?

rf.get_params()
# Random forests have some interesting parameters!
# All comes down to how you grow your trees
# Max_depth - how many levels of splitting/branches
# Shallower your trees, the more likely you are to be underfit
# Deeper your trees, the more likely you are to be overfit
# Max Features - maximum number of columns that it accepts. Typically a decimal. .6 is 60% of your columns. More of a top down.
# Min samples per leaf - minimum number of samples that have to be there. More bottom up approach.
# N_estimators - number of trees!

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2020,
 'verbose': 0,
 'warm_start': False}

In [32]:
rf.score(X, y)
# This is a heavily overfit random forest
# The default value is for the min_samples to be 1. This is not really the best choice

0.9697998600424886

In [34]:
# Grid search!
# Like a gigantic cannon
# A great way to exhaustively search parameters

from sklearn.model_selection import GridSearchCV

In [35]:
?GridSearchCV
# Lots of different parameters for GridSearchCV, check em out

In [38]:
from sklearn.metrics import mean_squared_error, make_scorer

loss_function = make_scorer(mean_squared_error, greater_is_better=False)
# Normally when you do cross validation it returns R2
# Sometimes you'd want to have a loss function
# Greater is better is a default... but you want to the lowest possible mean squared error

In [40]:
?make_scorer

In [41]:
# Firing out everything and seeing what your best results are
params = {
    'n_estimators': [5, 10, 25, 50],
    'max_features': [.03, 0.4, 0.5, 0.6],
    'min_samples_leaf' : [5, 10, 15]
}

In [42]:
grid = GridSearchCV(estimator=rf, param_grid=params, scoring=loss_function, cv=5)
# Take our random forest, test out all paramaeters, use our loss function and do 5 fold cross validation

In [43]:
grid.fit(X,y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=10, n_jobs=None,
                                             oob_score=False, random_state=2020,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'max_features': [0.03, 0.4, 0.5, 0.

In [46]:
grid.best_params_
# What combination gave you the best score

{'max_features': 0.6, 'min_samples_leaf': 5, 'n_estimators': 25}

In [50]:
grid_results = pd.DataFrame(grid.cv_results_)
# A big dictionary
# Useful to turn into a dataframe

In [53]:
grid_results
# Results from every single permutation
# Effectively create a data set for the fitting of your model to see how things perform under different conditions

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008188,0.002976,0.00088,0.000192,0.03,5,5,"{'max_features': 0.03, 'min_samples_leaf': 5, ...",-18.267616,-18.730338,-34.045683,-47.174204,-22.219166,-28.067995,11.117152,36
1,0.010707,0.000497,0.00085,6.1e-05,0.03,5,10,"{'max_features': 0.03, 'min_samples_leaf': 5, ...",-17.361808,-24.38946,-45.486862,-51.502005,-20.228411,-31.765188,13.949263,39
2,0.024029,0.000933,0.001418,0.000109,0.03,5,25,"{'max_features': 0.03, 'min_samples_leaf': 5, ...",-13.68032,-29.596691,-47.567043,-59.980337,-21.011822,-34.326359,17.091962,40
3,0.050411,0.002373,0.00238,0.000129,0.03,5,50,"{'max_features': 0.03, 'min_samples_leaf': 5, ...",-15.648012,-32.926171,-53.592378,-61.596712,-21.224179,-36.955298,17.90539,42
4,0.005582,0.000839,0.000625,3.1e-05,0.03,10,5,"{'max_features': 0.03, 'min_samples_leaf': 10,...",-17.399913,-21.898561,-45.563424,-52.572051,-20.220367,-31.502936,14.559609,38
5,0.009899,0.000242,0.001005,0.0003,0.03,10,10,"{'max_features': 0.03, 'min_samples_leaf': 10,...",-17.046444,-32.844244,-52.427268,-57.968335,-20.53027,-36.125532,16.503047,41
6,0.024342,0.00109,0.001591,0.000302,0.03,10,25,"{'max_features': 0.03, 'min_samples_leaf': 10,...",-17.257105,-37.635165,-60.745758,-62.689847,-23.200587,-40.260142,18.716737,43
7,0.050071,0.003275,0.002435,0.000289,0.03,10,50,"{'max_features': 0.03, 'min_samples_leaf': 10,...",-17.151069,-39.265197,-65.19949,-62.317082,-25.012159,-41.740308,19.321567,45
8,0.004995,0.000257,0.000593,2e-05,0.03,15,5,"{'max_features': 0.03, 'min_samples_leaf': 15,...",-17.303829,-28.165141,-69.201176,-68.155498,-21.664445,-40.851389,22.948064,44
9,0.009674,0.000106,0.000898,0.00012,0.03,15,10,"{'max_features': 0.03, 'min_samples_leaf': 15,...",-17.103864,-33.969564,-72.188548,-67.060102,-21.02726,-42.220132,23.083054,46


In [57]:
grid_results.sort_values(by='rank_test_score')
# The differences between the 1st, 2nd, 3rd, 4th might be a fraction of a percent

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
38,0.034454,0.000601,0.001503,0.000102,0.6,5,25,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",-9.072322,-11.581454,-16.86085,-47.769405,-14.281004,-19.891583,14.16421,1
39,0.067979,0.001131,0.002449,0.000149,0.6,5,50,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",-8.65356,-12.248097,-18.719754,-46.411869,-14.692544,-20.122454,13.532295,2
37,0.014505,0.000772,0.000855,3e-05,0.6,5,10,"{'max_features': 0.6, 'min_samples_leaf': 5, '...",-9.659311,-12.333912,-14.684548,-50.923818,-15.689954,-20.636571,15.267825,3
27,0.064187,0.002175,0.002401,0.000106,0.5,5,50,"{'max_features': 0.5, 'min_samples_leaf': 5, '...",-8.402161,-12.489176,-21.989901,-46.178528,-15.101285,-20.807645,13.419952,4
15,0.060968,0.001447,0.002329,9.3e-05,0.4,5,50,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",-8.120806,-13.693727,-26.005641,-42.40621,-15.286086,-21.076838,12.127184,5
14,0.031178,0.001593,0.001893,0.000632,0.4,5,25,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",-7.995264,-16.243807,-23.782032,-43.362928,-15.243349,-21.299132,12.103714,6
26,0.032124,0.000755,0.001501,0.000126,0.5,5,25,"{'max_features': 0.5, 'min_samples_leaf': 5, '...",-7.863692,-13.138678,-22.799273,-47.603821,-15.685508,-21.391407,13.947204,7
43,0.063335,0.001818,0.002636,0.000421,0.6,10,50,"{'max_features': 0.6, 'min_samples_leaf': 10, ...",-9.455479,-12.400328,-25.79771,-45.819755,-15.717116,-21.813606,13.19437,8
42,0.03193,0.001316,0.001546,0.000103,0.6,10,25,"{'max_features': 0.6, 'min_samples_leaf': 10, ...",-10.02517,-11.565475,-24.593605,-48.041933,-16.003523,-22.022185,13.947286,9
13,0.013695,0.001935,0.00083,3.6e-05,0.4,5,10,"{'max_features': 0.4, 'min_samples_leaf': 5, '...",-7.468959,-16.110047,-26.149814,-45.855373,-15.185688,-22.124954,13.257824,10


In [59]:
grid_results.groupby('param_min_samples_leaf')['mean_test_score'].mean()
# 5 looks like it gives consistently better results
# In real life these numbers are positive.
# Makes it negative automatically to work well with other models.

param_min_samples_leaf
5    -24.666358
10   -27.497689
15   -30.374537
Name: mean_test_score, dtype: float64

In [61]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), rf)

In [62]:
pipe

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('randomforestregressor',
                 RandomForestRegressor(bootstrap=True, criterion='mse',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nodes=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=10, n_jobs=None,
                                       oob_score=False, random_state=2020,
                                       verbose=0, warm_start=False))],
         verbose=False)

In [63]:
pipe.steps
# You can grid search a pipeline
# You can grid search multiple steps

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('randomforestregressor',
  RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                        max_features='auto', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=10,
                        n_jobs=None, oob_score=False, random_state=2020,
                        verbose=0, warm_start=False))]

In [64]:
params = {
    'randomforestregressor__min_samples_leaf': [5, 10, 15],
    # You type in the name of the step, two underscores, and then the name of the parameter
    'randomforestregressor__max_features': [0.4, 0.5, 0.6]
}

In [65]:
grid = GridSearchCV(estimator=pipe, param_grid=params, scoring=loss_function, cv=10)
# Once you do that it works exactly the same way

In [67]:
grid.fit(X,y)



GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('randomforestregressor',
                                        RandomForestRegressor(bootstrap=True,
                                                              criterion='mse',
                                                              max_depth=None,
                                                              max_features='auto',
                                                              max_leaf_nodes=None,
                                                              min_impurity_decrease=0.0,
                                                              min_impurity_split