In [46]:
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import constants as c
%matplotlib notebook

# Exploration of random forest grid search parameters

In [47]:
# Define the adjusted r-squared evaluation metrics
def score(y, y_pred, X):
    """Return the adjusted R-squared of ``y_pred`` measured against ``y``.

    Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where ``n`` is the
    number of observations and ``p`` is the number of columns of ``X``.

    Parameters
    ----------
    y : array-like of shape (n,)
        Observed target values.
    y_pred : array-like of shape (n,)
        Predicted target values, compared position by position with ``y``.
    X : array-like of shape (m, p)
        Feature matrix; only its column count ``p`` is used.

    Returns
    -------
    float
        The adjusted R-squared, or ``nan`` when it is undefined (``y`` is
        constant, or ``n - p - 1 <= 0``).

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not hold the same number of values.
    """
    # Flatten to plain float arrays so the comparison is positional: no pandas
    # index alignment and no accidental (n, 1) vs (n,) broadcasting.
    observed = np.asarray(y, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if observed.shape != predicted.shape:
        raise ValueError(
            "y and y_pred must have the same length, got %d and %d"
            % (observed.size, predicted.size)
        )

    n_obs = observed.size
    n_features = np.shape(X)[1]
    dof = n_obs - n_features - 1
    if dof <= 0:
        # Not enough observations for the correction term to be meaningful.
        return float("nan")

    ss_total = float(np.sum((observed - observed.mean()) ** 2))
    if ss_total == 0.0:
        # Constant target: R^2 itself is undefined.
        return float("nan")

    ss_residual = float(np.sum((observed - predicted) ** 2))
    r_squared = 1.0 - ss_residual / ss_total
    return 1.0 - (1.0 - r_squared) * (n_obs - 1) / dof

def adjusted_rsquared_scorer(estimator, X, y):
    """Scorer callable for ``scoring=`` that reports adjusted R-squared.

    Uses the ``scorer(estimator, X, y)`` calling convention, so it needs no
    data at definition time. Binding ``X=X_train`` here would fail on a fresh
    kernel, because ``X_train`` is only created by the train/test split cell
    further down. ``score`` reads just the column count of ``X``, so passing
    the evaluation fold gives the same value as passing the full training
    matrix.

    Parameters
    ----------
    estimator : fitted estimator exposing ``predict``
    X : feature matrix of the evaluation fold
    y : observed targets of the evaluation fold

    Returns
    -------
    The adjusted R-squared of ``estimator.predict(X)`` against ``y``.
    """
    return score(y, estimator.predict(X), X)

In [14]:
# Read the Excel workbook; the column groups used below are attributes of the
# `constants` module imported as `c`.
data = pd.read_excel("../data/NationalBodyProjectTurk.xlsx")

# Candidate outcome columns, kept together so one can be picked by name later.
ys = data.loc[:, c.y_variables]

# Predictors: dummy-coded `c.cat_demo` columns (first level of each dropped)
# placed to the left of the `c.num_demo` + `c.survey_data_aggregate` columns.
X_num = data.loc[:, c.num_demo + c.survey_data_aggregate]
X_cat = pd.get_dummies(data.loc[:, c.cat_demo], drop_first=True)
X = pd.concat([X_cat, X_num], axis="columns")


In [27]:
# Pick the outcome to model for this run.
dependent_var = 'AETOTAL'
y = ys[dependent_var]

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [28]:
# Estimate the cost of building and fitting one random forest
# (max_depth=30, min_samples_leaf=5, min_samples_split=10, 500 trees).
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by wall-clock adjustments the way time.time() can.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=30,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

11.077836990356445


In [29]:
# Report the fitted forest on both splits. score() is the adjusted R-squared
# helper defined in this notebook; it reads only the column count of its
# third argument.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

training R2:
0.8011135293920445
testing R2:
0.5819794801568685


In [30]:
# Exploring various random forest parameters: max_depth=50 with
# min_samples_leaf=5, min_samples_split=10, 500 trees.
# perf_counter() is monotonic, so the duration is immune to wall-clock changes.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=50,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

11.066832780838013


In [31]:
# Adjusted R-squared (via the notebook's score() helper) of the current
# `forest` on the training and held-out rows.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

training R2:
0.8011135293920445
testing R2:
0.5819794801568685


In [32]:
# max_depth=100, min_samples_leaf=5, min_samples_split=10, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=100,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

10.188202857971191
training R2:
0.8011135293920445
testing R2:
0.5819794801568685


In [33]:
# max_depth=20, min_samples_leaf=5, min_samples_split=10, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=20,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

10.190093994140625
training R2:
0.8009693455198915
testing R2:
0.5820070613975936


In [34]:
# max_depth=15, min_samples_leaf=5, min_samples_split=10, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

10.074911832809448
training R2:
0.795612881856352
testing R2:
0.5820919698720516


In [37]:
# max_depth=15 with the loosest leaf/split limits
# (min_samples_leaf=1, min_samples_split=2), 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

12.851594924926758
training R2:
0.9106601321981838
testing R2:
0.5809902863388636


In [38]:
# max_depth=100, min_samples_leaf=1, min_samples_split=2, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=100,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

14.738137006759644
training R2:
0.9401166348610652
testing R2:
0.5803172282115419


In [40]:
# 10-fold cross-validation on the training split. No `scoring` is passed, so
# the estimator's own default score is used rather than the adjusted
# R-squared scorer defined in this notebook.
# cross_val_score is imported with the other dependencies in the first cell.
forest = RandomForestRegressor(
    random_state=0,
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=5,
    n_estimators=500,
)
scores = cross_val_score(forest, X_train, y_train, cv=10)

In [43]:
# Average of the per-fold scores held in `scores`.
scores.mean()

0.5586994275030317

In [49]:
# 10-fold CV with the adjusted R-squared scorer:
# max_depth=25, min_samples_leaf=5, min_samples_split=5, 500 trees.
forest = RandomForestRegressor(
    random_state=0,
    max_depth=25,
    min_samples_leaf=5,
    min_samples_split=5,
    n_estimators=500,
)
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
    scoring=adjusted_rsquared_scorer,
)

# Mean across the folds, shown as the cell result.
scores.mean()

0.5499537811396279

In [50]:
# 10-fold CV with the adjusted R-squared scorer:
# max_depth=15, min_samples_leaf=1, min_samples_split=5, 500 trees.
forest = RandomForestRegressor(
    random_state=0,
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=500,
)
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
    scoring=adjusted_rsquared_scorer,
)

# Mean across the folds, shown as the cell result.
scores.mean()

0.5461682976080764

In [52]:
# 10-fold CV with the adjusted R-squared scorer:
# max_depth=30, min_samples_leaf=1, min_samples_split=5, 1000 trees.
forest = RandomForestRegressor(
    random_state=0,
    max_depth=30,
    min_samples_leaf=1,
    min_samples_split=5,
    n_estimators=1000,
)
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
    scoring=adjusted_rsquared_scorer,
)

# Mean across the folds, shown as the cell result.
scores.mean()

0.5464036886420859

In [53]:
# 10-fold CV with the adjusted R-squared scorer:
# max_depth=100, min_samples_leaf=1, min_samples_split=2, 500 trees.
forest = RandomForestRegressor(
    random_state=0,
    max_depth=100,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
    scoring=adjusted_rsquared_scorer,
)

# Mean across the folds, shown as the cell result.
scores.mean()

0.5459511795045602

In [54]:
# Display the individual per-fold values currently held in `scores`
# (the array returned by the most recent cross_val_score call).
scores

array([0.54432218, 0.52631654, 0.57073995, 0.51730921, 0.54389151,
       0.56962867, 0.55528863, 0.58148859, 0.51410727, 0.53641925])

In [55]:
# Single fit of the configuration just cross-validated:
# max_depth=100, min_samples_leaf=1, min_samples_split=2, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=100,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

15.139902830123901
training R2:
0.9401166348610652
testing R2:
0.5803172282115419


In [68]:
# max_depth=1000, min_samples_leaf=1, min_samples_split=2, only 100 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=1000,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

3.1125869750976562
training R2:
0.937809742558861
testing R2:
0.5781832674347047


In [56]:
# Shallow trees: max_depth=5, min_samples_leaf=1, min_samples_split=2, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

4.404603958129883
training R2:
0.5476344293147479
testing R2:
0.5362767362401016


In [63]:
# Shallow trees with more estimators:
# max_depth=5, min_samples_leaf=5, min_samples_split=10, 1000 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=5,
    min_samples_leaf=5,
    min_samples_split=10,
    n_estimators=1000,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

9.123662233352661
training R2:
0.5473665146443003
testing R2:
0.5359623813531416


In [None]:
# max_depth=15, min_samples_leaf=1, min_samples_split=2, 500 trees.
# NOTE(review): these are the same hyperparameters as an earlier cell in this
# notebook, and this cell has no recorded output; it is a candidate for removal.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

In [61]:
# max_depth=25, min_samples_leaf=1, min_samples_split=2, 500 trees.
# Timed with perf_counter(): monotonic, so unaffected by wall-clock adjustments.
t = time.perf_counter()
forest = RandomForestRegressor(
    random_state=0,
    max_depth=25,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
forest.fit(X_train, y_train)
elapsed = time.perf_counter() - t
print(elapsed)

# score() is the notebook's adjusted R-squared helper.
pred = forest.predict(X_test)
print('training R2:', score(y_train, forest.predict(X_train), X_train), sep='\n')
print('testing R2:', score(y_test, pred, X_train), sep='\n')

16.101330041885376
training R2:
0.9399384853036226
testing R2:
0.5803687599973553


In [58]:
# 10-fold CV with the adjusted R-squared scorer for the shallow configuration:
# max_depth=5, min_samples_leaf=1, min_samples_split=2, 500 trees.
forest = RandomForestRegressor(
    random_state=0,
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=500,
)
scores = cross_val_score(
    forest,
    X_train,
    y_train,
    cv=10,
    scoring=adjusted_rsquared_scorer,
)

In [60]:
# Display the individual per-fold values currently held in `scores`
# (the array returned by the most recent cross_val_score call).
scores

array([0.50197345, 0.47186949, 0.52182834, 0.47372189, 0.51811463,
       0.53814934, 0.51994118, 0.53756225, 0.48248627, 0.50024365])