# Modeling

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.dummy import DummyRegressor
from catboost import Pool, CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

In [11]:
# Load the modeling dataset, indexed by date.
# Column 0 of the CSV is an unnamed column (pandas labels it 'Unnamed: 0',
# presumably a row index saved with the file), so a positional
# `parse_dates=[0]` targets that column rather than the dates. Parse the
# 'date' column by name so the index becomes a real DatetimeIndex.
# Chaining (no inplace=True) builds the frame in one expression from the
# freshly read file.
df = (
    pd.read_csv('COVID19_modeling.csv', parse_dates=['date'])
    .drop(columns='Unnamed: 0')
    .set_index('date')
)

In [13]:
# Display the loaded frame to sanity-check its columns and date index
df

Unnamed: 0_level_0,Avg_Temp(F),Conf_Cases,day_of_week,day_of_year,Year,Month,Day,new_case_percent_pop*,state_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-03-01,26.42,1.0,6,61,2020,3,1,0.014225,MA
2020-03-02,36.50,1.0,0,62,2020,3,2,0.014225,MA
2020-03-03,55.94,1.0,1,63,2020,3,3,0.014225,MA
2020-03-04,46.94,2.0,2,64,2020,3,4,0.028450,MA
2020-03-05,42.98,8.0,3,65,2020,3,5,0.113799,MA
...,...,...,...,...,...,...,...,...,...
2021-09-08,70.16,935.0,2,251,2021,9,8,25.929410,CT
2021-09-09,71.42,626.0,3,252,2021,9,9,17.360225,CT
2021-09-10,66.92,625.0,4,253,2021,9,10,17.332493,CT
2021-09-11,63.14,0.0,5,254,2021,9,11,0.000000,CT


## Using the mean as a baseline prediction model

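Previously, we determined the $R^2$ score obtained by using the mean to predict COVID-19 cases for each individual state. Let's do the same now that all the states are in one DataFrame, so that we have a baseline "dummy" model against which to compare our future optimized model. A mean predictor is expected to score an $R^2$ of about zero on held-out data, so any useful model must do clearly better than that.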

In [20]:
# Target: new cases as a percentage of population.
# Features: every remaining column except the raw case count.
y = df['new_case_percent_pop*']
X = df.drop(columns=['Conf_Cases', 'new_case_percent_pop*'])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: always predict the mean of the training target.
# 'mean' is DummyRegressor's default strategy; spelled out for readability.
dummy_mean = DummyRegressor(strategy='mean').fit(X_train, y_train)

# R2 of the baseline on the held-out rows
score_dummy = dummy_mean.score(X_test, y_test)
print("The R2 score of using the mean to predict COVID19 cases in our states is:", score_dummy)

The R2 score of using the mean to predict COVID19 cases in our states is: -0.0019588491283020204


## Tuning the top-performing models

In the pre-processing step, we determined (with the help of PyCaret) that our top-performing models were **CatBoost Regressor**, **Random Forest Regressor**, and **Gradient Boosting Regressor**. Let's now fine-tune the hyperparameters of each of these models, in preparation for feeding them into the pipeline of the Voting Regressor.

#### CatBoost Regressor 