In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import model_preparation

from model_preparation import prepare_data, get_features, get_bounds, get_interval_accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV

# Load in pre-split data
X_train, X_test, y_train, y_test = prepare_data()
features = get_features()
train_bounds_5 = get_bounds(y_train, 5)
train_bounds_10 = get_bounds(y_train, 10)
test_bounds_5 = get_bounds(y_test, 5)
test_bounds_10 = get_bounds(y_test, 10)

In [2]:
# Flatten response variables array to list to use in models
y_train = list(np.array(y_train).flatten())
y_test = list(np.array(y_test).flatten())

In [4]:
# Perform cross validation to find best hyper parameters for random forest models
rfr = RandomForestRegressor()
parameters = {
            "n_estimators" : [50, 75, 150, 250, 300, 400],
            "max_depth": [None, 5, 8, 12, 15],
            "max_features": ['auto','sqrt']
            }

ran = RandomizedSearchCV(estimator=rfr, param_distributions=parameters, cv=5, n_jobs=-1, n_iter=10, random_state=0, verbose=2)
ran.fit(X_train, y_train)

# View best parameters to use in final model
ran.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 14.2min finished


{'n_estimators': 150, 'max_features': 'auto', 'max_depth': None}

In [3]:
# Train Random Forest model using best hyper-parameters 
rf_150 = RandomForestRegressor(n_estimators=150, max_depth=None)
rf_150.fit(X_train, y_train)

# Compute predictions 
y_pred_train = rf_150.predict(X_train)
y_pred_test = rf_150.predict(X_test)

# Compute metrics
MSE = mean_squared_error(y_train, y_pred_train)
r2 = r2_score(y_train, y_pred_train)

print("Train Scores:")
print(MSE)
print(r2)

MSE = mean_squared_error(y_test, y_pred_test)
r2 = r2_score(y_test, y_pred_test)

print("Test Scores:")
print(MSE)
print(r2)

Train Scores:
2.4173012606601287
0.9962858884664556
Test Scores:
19.684621725738268
0.9700255811838044


In [5]:
# Compute accuracy scores for 5% and 10% intervals on test data
print("5% +/- limit:")
print(get_interval_accuracy_score(test_bounds_5, y_pred_test))
print("10% +/- limit:")
print(get_interval_accuracy_score(test_bounds_10, y_pred_test))

5% +/- limit:
0.76275
10% +/- limit:
0.88565
