# Regression: Random Forests

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np 
import pandas as pd 


In [2]:
# Simulate a regression data set: p regressors observed over t periods.
rg = np.random.default_rng(20201201)
p = 12
t = 250

# Every regressor is its own standard-normal shock plus one common shock that is
# broadcast across all p columns, so the columns are correlated with each other.
idiosyncratic = rg.standard_normal((t, p))
common = rg.standard_normal((t, 1))
x_values = idiosyncratic + common
x = pd.DataFrame(x_values, columns=["x" + str(i + 1) for i in range(p)])

# Linear signal with evenly spaced coefficients from 0.01 to 0.10, plus
# standard-normal noise.
beta = np.linspace(0.01, 0.10, p)
errors = rg.standard_normal(t)
y = x.dot(beta) + errors

In [3]:
# Baseline forest with hand-picked hyperparameters, before any tuning.
# random_state pins the bootstrap samples and per-split feature subsets so the
# SSE printed below is identical on every Restart & Run All.
rfr = RandomForestRegressor(
    max_features="sqrt", max_leaf_nodes=75, random_state=20201231
)
rfr = rfr.fit(x, y)
# In-sample residuals: the forest is evaluated on the same data it was fit to.
resid = y - rfr.predict(x)
print(f"The SSE is {resid.T@resid}")

The SSE is 43.51767319279574


In [4]:
# Hyperparameter grid: 3 forest sizes x 2 feature-subset rules x 5 tree sizes
# = 30 candidates.
# max_features=1.0 considers every feature at each split, which is what the
# legacy "auto" option meant for regressors. "auto" was deprecated in
# scikit-learn 1.1 and removed in 1.3; the float form works on old and new
# versions alike.
parameters = {
    "n_estimators": [100, 250, 500],
    "max_features": [1.0, "sqrt"],
    "max_leaf_nodes": [3, 5, 10, 25, 50],
}

# Fixed seed so every candidate forest is reproducible.
rfr = RandomForestRegressor(random_state=20201231)
# Scores are negated MSE (higher is better); n_jobs=-1 uses all available cores.
gscv = GridSearchCV(
    rfr, parameters, scoring="neg_mean_squared_error", n_jobs=-1, verbose=1
)

gscv = gscv.fit(x, y)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.2min finished


In [5]:
# GridSearchCV refits the winning configuration on the full data set by default
# (refit=True), so best_estimator_ is already fitted. Calling .fit(x, y) on it
# again only rebuilt the same seeded forest at extra cost.
rfr_best = gscv.best_estimator_
rfr_best

RandomForestRegressor(max_features='sqrt', max_leaf_nodes=5, n_estimators=250,
                      random_state=20201231)

In [6]:
# In-sample sum of squared errors of the tuned forest: predictions are made on
# the same x the model was fit to.
fitted_best = rfr_best.predict(x)
resid = y - fitted_best
resid.dot(resid)

208.87486045632983

In [7]:
# The search was scored with "neg_mean_squared_error", so mean_test_score holds
# negated MSE values, one per candidate. Flipping the sign and scaling by the
# sample size t puts them on an SSE scale.
mean_neg_mse = gscv.cv_results_["mean_test_score"]
sse_xv = -t * mean_neg_mse
sse_xv

array([277.7532521 , 277.28561003, 277.40369621, 278.56196471,
       276.19858478, 275.2847604 , 278.72658852, 279.03373182,
       278.41337168, 284.85437131, 284.70039136, 282.68459209,
       290.20944601, 288.25296598, 285.89167538, 272.84551103,
       271.098457  , 272.12212349, 268.28641561, 266.36000982,
       267.24289694, 269.63980852, 267.73518607, 268.23338082,
       274.40713005, 272.60016313, 272.83554643, 274.43703533,
       273.35379917, 273.82693787])

In [8]:
# One row per candidate: its hyperparameters alongside its cross-validated SSE.
df = pd.DataFrame(gscv.cv_results_["params"]).assign(sse_xv=sse_xv)
# Show the ten candidates with the smallest cross-validated SSE.
df.sort_values(by="sse_xv").iloc[:10]


Unnamed: 0,max_features,max_leaf_nodes,n_estimators,sse_xv
19,sqrt,5,250,266.36001
20,sqrt,5,500,267.242897
22,sqrt,10,250,267.735186
23,sqrt,10,500,268.233381
18,sqrt,5,100,268.286416
21,sqrt,10,100,269.639809
16,sqrt,3,250,271.098457
17,sqrt,3,500,272.122123
25,sqrt,25,250,272.600163
26,sqrt,25,500,272.835546
