In [31]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd

features = pd.read_csv("./data/prepped/modeling_features.csv")
labels = pd.read_csv("./data/prepped/modeling_outcome.csv")
# print(labels)

features = features.drop(["Unnamed: 0"], axis=1)
labels = labels.drop(["Unnamed: 0"], axis=1)

train_features, test_features, train_outcome, test_outcome = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42
)

# {'n_estimators': 1188,
#  'min_samples_split': 2,
#  'max_features': 'sqrt',
#  'max_depth': 72,
#  'bootstrap': True}



Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    8.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  2.4min finished


model score: 0.10823736122731477
best params: {'randomforestregressor__bootstrap': True, 'randomforestregressor__max_depth': 70, 'randomforestregressor__max_features': 'sqrt', 'randomforestregressor__min_samples_split': 2, 'randomforestregressor__n_estimators': 1200, 'selectpercentile__percentile': 90}


#### Initial Model
Base model with `n_estimators` set to 1000

In [13]:
base_model = RandomForestRegressor(n_estimators=1000, random_state=42)

base_model.fit(train_features, train_outcome)

base_predictions = base_model.score(test_features, test_outcome)
print(base_predictions)

0.06243041280006323


#### Tree-Based Feature Selection
Through domain research and exploratory data analysis we were able to determine that some features are not as important as others. Therefore, here we attempt to reduce the number of features used within our model by extracting features that are more significant that others with [Tree-Based Feature Selection](https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection-using-selectfrommodel).

In [32]:
from sklearn.feature_selection import SelectFromModel

feature_model = SelectFromModel(base_model, prefit=True)

# Transform into new training and testing feature sets
refined_train_features = feature_model.transform(train_features)
refined_test_features = feature_model.transform(test_features)

# Retrained with refined features
refined_model = RandomForestRegressor(n_estimators=1000, random_state=42)
refined_model.fit(refined_train_features, train_outcome)

print("Score: " + str(refined_model.score(refined_test_features, test_outcome)))
print("Feature Importance")
print(list(refined_model.feature_importances_))
feature_tuples = [(feature, round(importance, 2)) for feature, importance in zip(list(features.columns), list(refined_model.feature_importances_))]
refined_features = list(dict(feature_tuples))
print("Refined Feature Lists")
print(refined_features)

Score: 0.06943047994228047
Feature Importance
[0.12035150713604256, 0.1492356506341165, 0.06265912161266408, 0.06817454451584039, 0.12726480226889095, 0.10246491606972769, 0.05729493839554666, 0.0579565993618576, 0.03624400706455558, 0.03911480949252455, 0.11003355235493133, 0.06920555109330168]
Refined Feature Lists
['abv', 'ibu', 'diff_g', 'boil_time', 'efficiency', 'ferm_total_weight', 'ferm_type_base_malt', 'ferm_type_crystal_malt', 'ferm_type_roasted_malt', 'ferm_type_other', 'ferm_type_extract', 'ferm_type_sugar']


#### Cross Validation
Cross validation was attempted for a range of parameters within our model. After reading more into [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html), I concluded on five parameters to tune in order to build a better model. Those parameters included that of `n_estimators`, `max_features`, `max_depth`, `min_samples_split` and `bootstrap`. These all are important as they allow us to control the number of trees in the forest as well as the number of features considered for splitting at each leaf node. The values derived here are a result of my intial model with variations to the range of potential values in order to attempt to capture a better fit within the range of parameter values.

In [33]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1500, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 150, num = 10)]

# Minimum number of samples required to split a node
min_samples_split = [2, 3, 5]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

cv_model = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=random_grid,
                             n_iter = 100, scoring='r2', 
                              cv = 5, verbose=True, random_state=42, n_jobs=-1)

cv_model.fit(refined_train_features, train_outcome)

print(cv_model.score(refined_test_features, test_outcome))


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.5min


KeyboardInterrupt: 

#### GridSearch Cross Validation

In [None]:
folds = KFold(n_splits=10, shuffle=True, random_state=42)

pipeline = make_pipeline(
    MinMaxScaler(),
    SelectPercentile(),
    RandomForestRegressor()
)

# Number of trees in random forest
n_estimators = [800, 1000, 1200, 1400]

# Number of features to consider at every split
max_features = ['sqrt']

# Maximum number of levels in tree
max_depth = [40, 55, 70]

# Minimum number of samples required to split a node
min_samples_split = [2]

# Method of selecting samples for training each tree
bootstrap = [True]

pipeline_params = {
    "selectpercentile__percentile": [70, 80, 90],
    "randomforestregressor__n_estimators": n_estimators,
    "randomforestregressor__max_features": max_features,
    "randomforestregressor__max_depth": max_depth,
    "randomforestregressor__min_samples_split": min_samples_split,
    "randomforestregressor__bootstrap": bootstrap,
}

model = GridSearchCV(pipeline, pipeline_params, cv=folds, n_jobs=-1, verbose=True)
model.fit(train_features, train_outcome)
score = model.score(test_features, test_outcome)

print("model score:", score)
print("best params:", model.best_params_)

In [26]:
cv_model.best_params_
# cv_model.best_score_

{'n_estimators': 1188,
 'min_samples_split': 2,
 'max_features': 'sqrt',
 'max_depth': 72,
 'bootstrap': True}

In [28]:
cv_model.best_params_

{'n_estimators': 877,
 'min_samples_split': 2,
 'max_features': 'sqrt',
 'max_depth': 41,
 'bootstrap': True}