# Modeling

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.dummy import DummyRegressor
from catboost import Pool, CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

In [None]:
#load dataframe REGULAR
#parse datetime column
#df=pd.read_csv('COVID19_modeling.csv', parse_dates=[0])
#df.set_index('date', inplace= True)
#df.drop(columns='Unnamed: 0', inplace=True)

In [2]:
#load dataframe OHE
#parse datetime column
df_ohe = pd.read_csv('ohe_data.csv', parse_dates=['date'])
df_ohe.drop(columns='Unnamed: 0', inplace=True)
df_ohe.set_index('date', inplace=True)

## Using the mean as a baseline prediction model

Previously, we determined the R2 score of using the mean to predict COVID19 cases for each individual state. Let's do the same thing now that we have all the states in one DataFrame, so that we will have a baseline "dummy" model to compare our future optimized model to. 

In [4]:
# Create features
X, y = df_ohe.drop(columns=['Conf_Cases', 'new_case_percent_pop*']), df_ohe['new_case_percent_pop*']
# Make test and train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
dummy_mean = DummyRegressor()
# "Train" dummy regressor
dummy_mean.fit(X_train, y_train)
# Get R2 score
score_dummy = dummy_mean.score(X_test, y_test)
print("The R2 score of using the mean to predict COVID19 cases in our states is:", score_dummy)

The R2 score of using the mean to predict COVID19 cases in our states is: -0.0019588491283020204


## Tuning the top performing models 

In the pre-processing step, we determined (with the help of Pycaret) that our top performing models were **CatBoost Regressor**, **Random Forest Regressor**, and **Gradient Boosting Regressor**. Let's now fine tune the hyperparameters of each of these models, in preparation for feeding them into the pipeline of the Voting Regressor. 

**NOTE TO SELF: Determine hyperparameters for each model, but pass unfitted (tuned) models to the VotingRegressor**


#### CatBoost Regressor 

#### Random Forest Regressor
With a few exceptions, a RandomForestClassifier has all the hyperparameters of a DecisionTreeClassifier (to control how trees are grown), plus all the hyperparameters of a BaggingClassifier to control the ensemble itself.

#### Using basic hyperparameter tuning only

In [5]:
X, y = df_ohe.drop(columns=['Conf_Cases', 'new_case_percent_pop*']), df_ohe['new_case_percent_pop*']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define parameters to search for GridSearchCV
param_grid = {'n_estimators': [100, 200, 300, 400, 500],
              'max_depth': [3, 5, 7, 9],
              }
# Instantiate RandomForestClassifier
rf = RandomForestRegressor()
cv_rf = GridSearchCV(rf, param_grid, cv = 10)
cv_rf_fit = cv_rf.fit(X_train, y_train)

#Print the best parameter
print(cv_rf_fit.best_params_)

{'max_depth': 9, 'n_estimators': 400}


In [8]:
print('The optimal max_depth for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['max_depth']))
print('The optimal n_estimators for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['n_estimators']))

The optimal max_depth for the RandomForestRegressor is: 9
The optimal n_estimators for the RandomForestRegressor is: 400


In [9]:
# Instantiate RFR with optimal hyperparameters
rf = RandomForestRegressor(max_depth = 9, n_estimators = 400)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

In [10]:
# Determine R2 score
print('The R2 score for the RandomForestRegressor is {}'.format(r2_score(y_test, rf_pred)))

The R2 score for the RandomForestRegressor is 0.8327308664092816


****

#### Using RandomSearchCV to find optimal hyperparameter values

In [18]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [19]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.2s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.1min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [20]:
rf_random.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True}

In [47]:
# Instantiate RFR with optimal hyperparameters
rf2 = RandomForestRegressor(n_estimators= 800,
                             min_samples_split= 2,
                             min_samples_leaf= 1,
                             max_features= 'auto',
                             max_depth= 100,
                             bootstrap= True)
rf2.fit(X_train, y_train)
rf_pred2 = rf2.predict(X_test)

In [48]:
# Determine R2 score
print('The R2 score for the RandomForestRegressor with RSCV tuning is {}'.format(r2_score(y_test, rf_pred)))

The R2 score for the RandomForestRegressor with RSCV tuning is 0.8396750629461008


In [46]:
print('The MSE of the RandomForestRegressor with RSCV tuning is {}'.format(mean_squared_error(y_test, rf_pred)))

The MSE of the RandomForestRegressor with RSCV tuning is 55.73368057130931


#### Using GridSearchCV to find the optimal hyperparamter values

In [26]:
# Create the parameter grid based on the results of random search 
GS_param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110, 120],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 3, 4, 5],
    'n_estimators': [600, 700, 800, 9000, 1000]
}

In [30]:
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = GS_param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [31]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 38.1min
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed: 39.6min finished


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True],
                         'max_depth': [80, 90, 100, 110, 120],
                         'max_features': ['auto'],
                         'min_samples_leaf': [1, 2, 3, 4, 5],
                         'min_samples_split': [2, 3, 4, 5],
                         'n_estimators': [600, 700, 800, 9000, 1000]},
             verbose=2)

In [32]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 110,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 3,
 'n_estimators': 800}

In [34]:
# Instantiate RFR with optimal hyperparameters
rf3 = RandomForestRegressor(n_estimators= 800,
                             min_samples_split= 3,
                             min_samples_leaf= 2,
                             max_features= 'auto',
                             max_depth= 110,
                             bootstrap= True)
rf3.fit(X_train, y_train)
rf_pred3 = rf3.predict(X_test)

In [36]:
# Determine R2 score
print('The R2 score for the RandomForestRegressor with GSCV tuning is {}'.format(r2_score(y_test, rf_pred3)))

The R2 score for the RandomForestRegressor with tuning is 0.8311230661025624


In [41]:
print('The MSE of the RandomForestRegressor with GSCV tuning is {}'.format(mean_squared_error(y_test, rf_pred3)))

The MSE of the RandomForestRegressor with GSCV tuning is 58.70660711089296


It looks like the optimal hyperparameters chosen by RandomSearchCV performed the best, both in terms of the R2 and mean squared error, so we'll use those hyperparameters (as defined in `rf2`).

***

****

#### Voting Regressor

```
from sklearn.ensemble import VotingRegressor
reg_voting = VotingRegressor(
        estimators = [
            ('label1', reg_1),
            ('label2', reg_2),
            ...
            ('labelN', reg_N)],
        voting = 'soft',
        weights = [w_1, w_2, ..., w_N]
)
```

#### Determining optimal weights

In [None]:
# Create empty lists that will storage the different weights

weights1 = []
weights2 = []
weights3 = []
scores = []

# Create a for loop to evaluate different combinations of weights

for i in np.arange(0.1,1, 0.1):
    for j in np.arange(0.1,1, 0.1):
        for k in np.arange(0.1,1, 0.1):
            clf_voting = VotingClassifier(estimators = [('est1', clf1), ('est2', clf2),
                                           ('est3', clf3)], voting = 'soft', weights = [i, j, k])
            clf_voting.fit(X_train, y_train)
            pred = clf_voting.predict(X_test)
            score = accuracy_score(y_test, pred)
            scores.append(score)
            weights1.append(i)
            weights2.append(j)
            weights3.append(k)

In [None]:
# Save the results in a data frame

test_scores = pd.DataFrame()
test_scores['Weight1'] = weights1
test_scores['Weight2'] = weights2
test_scores['Weight3'] = weights3
test_scores['Test Score'] = scores

# Create an additional column to save the sum of all the weights

test_scores['sum_weights'] = test_scores['Weight1'].add(test_scores['Weight2']).add(test_scores['Weight3'])

#We are only getting the rows that the sum of all weights were equal to one

condition = test_scores['sum_weights'] == 1

test_scores = test_scores.loc[condition]

In [None]:
# Sort the values to see the different test scores depending on the weights
test_scores.sort_values(by = 'Test Score', ascending = False).head(15)


****

***

****

In [None]:
reg_voting = VotingRegressor(
                estimators = [
                    ('rf_reg', rf),
                    ('gb_reg', gb),
                    ('cb_reg', cb)],
                voting = 'soft',
                weights = [w_1, w_2, w_3]
)