# Modeling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import r2_score, mean_squared_error as mse, mean_absolute_error as mae
from sklearn.dummy import DummyRegressor
from catboost import Pool, CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
#load dataframe REGULAR
#parse datetime column
#df=pd.read_csv('COVID19_modeling.csv', parse_dates=[0])
#df.set_index('date', inplace= True)
#df.drop(columns='Unnamed: 0', inplace=True)

In [2]:
#load dataframe OHE
#parse datetime column
df_ohe = pd.read_csv('ohe_data.csv', parse_dates=['date'])
df_ohe.drop(columns='Unnamed: 0', inplace=True)
df_ohe.set_index('date', inplace=True)

## Train-test split

Before training any models on the data, let's do a train-test split to keep training and testing data consistent and separated.

In [3]:
# Create features
X, y = df_ohe.drop(columns=['Conf_Cases', 'new_case_percent_pop*']), df_ohe['new_case_percent_pop*']
# Make test and train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Using the mean as a baseline prediction model

Previously, we determined the R2 score of using the mean to predict COVID19 cases for each individual state. Let's do the same thing now that we have all the states in one DataFrame, so that we will have a baseline "dummy" model to compare our future optimized models to. 

In [4]:
dummy_mean = DummyRegressor()
# "Train" dummy regressor
dummy_mean.fit(X_train, y_train)
# Get R2 score
score_dummy = dummy_mean.score(X_test, y_test)
print("The R2 score of using the mean to predict COVID19 cases in our states is:", score_dummy)

The R2 score of using the mean to predict COVID19 cases in our states is: -0.0019588491283020204


Let's store the evaluation metric values for the dummy regressor so we can compare them with our future models. 

In [5]:
dummy_pred = dummy_mean.predict(X_test)
dummy_r2 = r2_score(y_test, dummy_pred)
dummy_mse = mse(y_test, dummy_pred)
dummy_rmse = np.sqrt(mse(y_test, dummy_pred))

## Tuning the top performing models for ensemble model

In the pre-processing step, we determined (with the help of Pycaret) that our top performing models were **CatBoost Regressor**, **Random Forest Regressor**, and **Extra Trees Regressor**. Let's now fine tune the hyperparameters of each of these models, in preparation for feeding them into the pipeline of the Voting Regressor. 

**NOTE TO SELF: Determine hyperparameters for each model, but pass unfitted (tuned) models to the VotingRegressor**


### 1. Random Forest Regressor
With a few exceptions, a RandomForestClassifier has all the hyperparameters of a DecisionTreeClassifier (to control how trees are grown), plus all the hyperparameters of a BaggingClassifier to control the ensemble itself. I'll explore a couple different ways to determine optimal hyperparameters and will choose those that produce the best R2 and MSE for the final ensemble model (a VotingRegressor).

#### Using basic hyperparameter tuning only

In [6]:
# Define parameters to search for GridSearchCV
basic_param_grid = {'n_estimators': [100, 200, 300, 400, 500],
              'max_depth': [3, 5, 7, 9],
              }
# Instantiate RandomForestRegressor
basic_rf = RandomForestRegressor(random_state=42)
cv_rf = GridSearchCV(basic_rf, basic_param_grid, cv = 5)
cv_rf_fit = cv_rf.fit(X_train, y_train)

#Print the best parameter
#print(cv_rf_fit.best_params_)

{'max_depth': 9, 'n_estimators': 400}


In [7]:
print('The optimal max_depth for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['max_depth']))
print('The optimal n_estimators for the RandomForestRegressor is: {}'.format(cv_rf_fit.best_params_['n_estimators']))

The optimal max_depth for the RandomForestRegressor is: 9
The optimal n_estimators for the RandomForestRegressor is: 400


In [8]:
# Instantiate RFR with optimal hyperparameters
basic_rf = RandomForestRegressor(max_depth = 9, n_estimators = 400, random_state=42)
basic_rf.fit(X_train, y_train)
basic_rf_pred = basic_rf.predict(X_test)

In [9]:
# Calculate evaluation metrics on basic_rf
basic_rf_r2 = r2_score(y_test, basic_rf_pred)
basic_rf_mse = mse(y_test, basic_rf_pred)
basic_rf_rmse = np.sqrt(basic_rf_mse)

Create a DataFrame to store the evaluation metric scores for our Random Forest Regressor, as well as the dummy regressor created in the previous step. This way, once we've built all the models, we'll be able to easily compare each model's metrics and determine which performs the best.

In [10]:
rf_results = pd.DataFrame({'Model':['dummy_reg', 'basic_rf'], 'R2': [dummy_r2, basic_rf_r2], 'MSE':[dummy_mse, basic_rf_mse], 'RMSE':[dummy_rmse, basic_rf_rmse]})

In [11]:
rf_results

Unnamed: 0,Model,R2,MSE,RMSE
0,dummy_reg,-0.001959,348.310472,18.663078
1,basic_rf,0.84063,55.401741,7.443235


In [25]:
# Determine R2 score
#print('The R2 score for the RandomForestRegressor is {}'.format(r2_score(y_test, basic_rf_pred)))

****

#### Using RandomSearchCV to find optimal hyperparameter values

In [26]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
max_depth.append(9)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [31]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, n_jobs=-1, random_state=42)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  5.3min finished


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None, 9],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [32]:
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [33]:
# Instantiate RFR with optimal hyperparameters
rf2 = RandomForestRegressor(n_estimators= 1400,
                             min_samples_split= 2,
                             min_samples_leaf= 2,
                             max_features= 'auto',
                             max_depth= 50,
                             bootstrap= True, 
                             random_state=42)
rf2.fit(X_train, y_train)
rf_pred2 = rf2.predict(X_test)

In [35]:
rf_random_r2 = r2_score(y_test, rf_pred2)
rf_random_mse = mse(y_test, rf_pred2)
rf_random_rmse = np.sqrt(mse(y_test, rf_pred2))

In [36]:
rf_results.loc[len(rf_results)]= ['rf_randomCV', 
                                 r2_score(y_test, rf_pred2),
                                 mse(y_test, rf_pred2),
                                 np.sqrt(mse(y_test, rf_pred2))]

In [37]:
rf_results

Unnamed: 0,Model,R2,MSE,RMSE
0,dummy_reg,-0.001959,348.310472,18.663078
1,basic_rf,0.84063,55.401741,7.443235
2,rf_randomCV,0.83107,58.725149,7.663234
3,rf_randomCV,0.830081,59.068859,7.685627


In [23]:
# Determine R2 score
#print('The R2 score for the RandomForestRegressor with RSCV tuning is {}'.format(r2_score(y_test, rf_pred)))

In [24]:
#print('The MSE of the RandomForestRegressor with RSCV tuning is {}'.format(mean_squared_error(y_test, rf_pred)))

***

#### Using GridSearchCV to find the optimal hyperparamter values

In [None]:
# Create the parameter grid based on the results of random search 
GS_param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110, 120],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'min_samples_split': [2, 3, 4, 5],
    'n_estimators': [600, 700, 800, 9000, 1000]
}

In [None]:
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = GS_param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)

In [None]:
grid_search.best_params_

In [None]:
# Instantiate RFR with optimal hyperparameters
rf3 = RandomForestRegressor(n_estimators= 800,
                             min_samples_split= 3,
                             min_samples_leaf= 2,
                             max_features= 'auto',
                             max_depth= 110,
                             bootstrap= True)
rf3.fit(X_train, y_train)
rf_pred3 = rf3.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the RandomForestRegressor with GSCV tuning is {}'.format(r2_score(y_test, rf_pred3)))

In [None]:
print('The MSE of the RandomForestRegressor with GSCV tuning is {}'.format(mean_squared_error(y_test, rf_pred3)))

It looks like the optimal hyperparameters chosen by RandomSearchCV performed the best, both in terms of the R2 and mean squared error, so we'll use those hyperparameters (as defined in `rf2`).

***

## 2. Extra Trees Regressor

#### Using basic hyperparameter tuning only

In [None]:
# Define parameters to search for GridSearchCV
param_grid = {'n_estimators': [100, 200, 300, 400, 500],
              'max_depth': [3, 5, 7, 9],
              }
# Instantiate ExtraTreesRegressor
et = ExtraTreesRegressor()
cv_et = GridSearchCV(et, param_grid, cv = 10)
cv_et_fit = cv_et.fit(X_train, y_train)

#Print the best parameter
print(cv_et_fit.best_params_)

In [None]:
print('The optimal max_depth for the ExtraTreesRegressor is: {}'.format(cv_et_fit.best_params_['max_depth']))
print('The optimal n_estimators for the ExtraTreesRegressor is: {}'.format(cv_et_fit.best_params_['n_estimators'])) cf

In [None]:
# Instantiate ETR with optimal hyperparameters
et = RandomForestRegressor(max_depth = 9, n_estimators = 500)
et.fit(X_train, y_train)
et_pred = et.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the basic Extra Trees Regressor is {}'.format(r2_score(y_test, et_pred)))

In [None]:
print('The MSE of the basic ExtraTreesRegressor is {}'.format(mean_squared_error(y_test, et_pred)))

#### Using RandomSearchCV to find optimal hyperparameter values

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid_et = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
et = ExtraTreesRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
et_random = RandomizedSearchCV(estimator = et, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
et_random.fit(X_train, y_train)

In [None]:
et_random.best_params_

In [None]:
# Instantiate RFR with optimal hyperparameters
et2 = ExtraTreesRegressor(n_estimators= 1800,
                             min_samples_split= 2,
                             min_samples_leaf= 2,
                             max_features= 'auto',
                             max_depth= None,
                             bootstrap= True)
et2.fit(X_train, y_train)
et_pred2 = et2.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the ExtraTreesRegressor with RSCV tuning is {}'.format(r2_score(y_test, et_pred2)))

In [None]:
print('The MSE of the ExtraTreesRegressor with RSCV tuning is {}'.format(mean_squared_error(y_test, et_pred2)))

#### Using GridSearchCV to find the optimal hyperparamter values

In [None]:
# Create the parameter grid based on the results of random search 
GS_param_grid_et = {
    'bootstrap': [True],
    'max_depth': [80, 100, 120, None],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [1600, 1700, 1800, 1900, 2000]
}

In [None]:
# Create a based model
et3 = ExtraTreesRegressor()
# Instantiate the grid search model
grid_search_et3 = GridSearchCV(estimator = et2, param_grid = GS_param_grid_et, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [None]:
# Fit the grid search to the data
grid_search_et3.fit(X_train, y_train)

In [None]:
grid_search_et3.best_params_

In [None]:
# Instantiate RFR with optimal hyperparameters
et3 = ExtraTreesRegressor(n_estimators= 1800,
                             min_samples_split= 4,
                             min_samples_leaf= 2,
                             max_features= 'auto',
                             max_depth= 80,
                             bootstrap= True)
et3.fit(X_train, y_train)
et_pred3 = et3.predict(X_test)

In [None]:
# Determine R2 score
print('The R2 score for the Extra Trees Regressor with GSCV is {}'.format(r2_score(y_test, et_pred3)))

In [None]:
print('The MSE of the ExtraTreesRegressor with GSCV is {}'.format(mean_squared_error(y_test, et_pred3)))

It seems that here again, the parameters suggested by RandomizedSearchCV (stored in `et2`) produced the best MSE and R2, so we'll use those 

## 3. CatBoost Regressor

CatBoost is an open-sourced gradient boosting library. 
* 1. run CatBoost without tuning any hyperparameters to get baseline mse and r2
* 2. RandomizedSearchCV for hyperparameters
* 3. GridSearchCV for hyperparameters

***

****

#### Voting Regressor

```
from sklearn.ensemble import VotingRegressor
reg_voting = VotingRegressor(
        estimators = [
            ('label1', reg_1),
            ('label2', reg_2),
            ...
            ('labelN', reg_N)],
        voting = 'soft',
        weights = [w_1, w_2, ..., w_N]
)
```

In [None]:
reg_voting = VotingRegressor(
                estimators = [
                    ('rf_reg', rf),
                    ('et_reg', et),
                    ('cb_reg', cb)],
                voting = 'soft',
                weights = [w_1, w_2, w_3]
)

#### Determining optimal weights

In [None]:
# Create empty lists that will storage the different weights

weights1 = []
weights2 = []
weights3 = []
scores = []

# Create a for loop to evaluate different combinations of weights

for i in np.arange(0.1,1, 0.1):
    for j in np.arange(0.1,1, 0.1):
        for k in np.arange(0.1,1, 0.1):
            clf_voting = VotingClassifier(estimators = [('est1', clf1), ('est2', clf2),
                                           ('est3', clf3)], voting = 'soft', weights = [i, j, k])
            clf_voting.fit(X_train, y_train)
            pred = clf_voting.predict(X_test)
            score = accuracy_score(y_test, pred)
            scores.append(score)
            weights1.append(i)
            weights2.append(j)
            weights3.append(k)

In [None]:
# Save the results in a data frame

test_scores = pd.DataFrame()
test_scores['Weight1'] = weights1
test_scores['Weight2'] = weights2
test_scores['Weight3'] = weights3
test_scores['Test Score'] = scores

# Create an additional column to save the sum of all the weights

test_scores['sum_weights'] = test_scores['Weight1'].add(test_scores['Weight2']).add(test_scores['Weight3'])

#We are only getting the rows that the sum of all weights were equal to one

condition = test_scores['sum_weights'] == 1

test_scores = test_scores.loc[condition]

In [None]:
# Sort the values to see the different test scores depending on the weights
test_scores.sort_values(by = 'Test Score', ascending = False).head(15)


```
from sklearn.model_selection import GridSearchCV
# function to create a grid search containing pipeline
def gridsearch_maker(pipeline, params):
    '''Takes in a pipeline and param grid, returns GridSearchCV object'''
    return GridSearchCV(estimator=pipeline,
                        param_grid=params,
                        scoring='recall',
                        cv=10,
                        n_jobs=-1)
```

```
from sklearn.pipeline import Pipeline
# function to make a pipeline with classifier input
def pipe_maker(classifier):
    '''Takes in a classifier, returns a pipeline'''
    pipe = Pipeline([('scl', StandardScaler()),
                    ('clf', classifier(class_weight= 'balanced', random_state=42))])
    return pipe
```

```
from sklearn.pipeline import Pipeline
# function to make a pipeline with classifier input
def pipe_maker(classifier):
    '''Takes in a classifier, returns a pipeline'''
    pipe = Pipeline([('scl', StandardScaler()),
                    ('clf', classifier(class_weight= 'balanced', random_state=42))])
    return pipe
```