# Pre-processing 

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pycaret.regression import *
from statsmodels.tsa.stattools import adfuller
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pycaret

In [2]:
# Load the combined weather / COVID case data.
# The first CSV column is parsed as datetimes, then the frame is indexed by 'date'.
df = pd.read_csv('state_cov_weather2.csv', parse_dates=[0]).set_index('date')

In [3]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0_level_0,VT_Avg_Temp(F),VT_PRCP(mm),ME_Avg_Temp(F),ME_PRCP(mm),CT_Avg_Temp(F),CT_PRCP(mm),MA_Avg_Temp(F),MA_PRCP(mm),CT_Conf_Cases,VT_Conf_Cases,ME_Conf_Cases,MA_Conf_Cases,weekday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-03-01,19.58,0.3,14.18,0.3,26.24,0.0,26.42,0.0,0.0,0.0,0.0,1.0,6
2020-03-02,32.0,0.0,17.06,4.6,37.58,0.0,36.5,0.0,0.0,0.0,0.0,1.0,0
2020-03-03,44.06,2.8,39.56,0.0,51.26,3.8,55.94,0.0,0.0,0.0,0.0,1.0,1
2020-03-04,41.54,0.0,32.0,6.6,48.02,0.0,46.94,2.5,0.0,0.0,0.0,2.0,2
2020-03-05,37.04,0.0,30.02,0.0,43.16,0.0,42.98,0.0,0.0,0.0,0.0,8.0,3


In [4]:
# Vermont-only frame: remove the other three states' columns and 'weekday'
vt_drop_cols = [
    'ME_Avg_Temp(F)', 'ME_PRCP(mm)', 'ME_Conf_Cases',
    'CT_Avg_Temp(F)', 'CT_PRCP(mm)', 'CT_Conf_Cases',
    'MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases',
    'weekday',
]
df_vt = df.drop(columns=vt_drop_cols)

In [5]:
# Maine-only frame: remove the other three states' columns and 'weekday'
me_drop_cols = [
    'VT_Avg_Temp(F)', 'VT_PRCP(mm)', 'VT_Conf_Cases',
    'CT_Avg_Temp(F)', 'CT_PRCP(mm)', 'CT_Conf_Cases',
    'MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases',
    'weekday',
]
df_me = df.drop(columns=me_drop_cols)

In [6]:
# Massachusetts-only frame: remove the other three states' columns and 'weekday'
ma_drop_cols = [
    'ME_Avg_Temp(F)', 'ME_PRCP(mm)', 'ME_Conf_Cases',
    'CT_Avg_Temp(F)', 'CT_PRCP(mm)', 'CT_Conf_Cases',
    'VT_Avg_Temp(F)', 'VT_PRCP(mm)', 'VT_Conf_Cases',
    'weekday',
]
df_ma = df.drop(columns=ma_drop_cols)

In [7]:
# Connecticut-only frame: remove the other three states' columns and 'weekday'
ct_drop_cols = [
    'ME_Avg_Temp(F)', 'ME_PRCP(mm)', 'ME_Conf_Cases',
    'VT_Avg_Temp(F)', 'VT_PRCP(mm)', 'VT_Conf_Cases',
    'MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases',
    'weekday',
]
df_ct = df.drop(columns=ct_drop_cols)

***

In [8]:
# Build working copies of each state's frame for PyCaret; reset_index() returns
# a new frame in which 'date' is an ordinary column instead of the index.
df_vt3, df_ct3, df_me3, df_ma3 = (
    state_frame.reset_index() for state_frame in (df_vt, df_ct, df_me, df_ma)
)

In [9]:
# Collect the per-state working frames so they can be processed in one loop
df_list = [
    df_ct3,
    df_ma3,
    df_me3,
    df_vt3,
]

In [10]:
# Add calendar features derived from 'date' to each per-state dataframe:
# 'day_of_week', 'day_of_year', 'Year', 'Month' and 'Day'.
# The loop variable is intentionally not named `df`: reusing that name would
# rebind the notebook-level `df` to the last per-state frame, so re-running the
# earlier `df.drop(...)` cells would fail.
for state_df in df_list:
    # Vectorized datetime accessor instead of a Python-level loop over rows
    dates = state_df['date'].dt
    state_df['day_of_week'] = dates.dayofweek
    state_df['day_of_year'] = dates.dayofyear
    state_df['Year'] = dates.year
    state_df['Month'] = dates.month
    state_df['Day'] = dates.day

### US Census Bureau state populations
* Massachusetts: 7,029,917 
* Vermont: 643,077	
* Connecticut: 3,605,944	
* Maine: 1,362,359

Source: [U.S. Census Bureau](https://www.census.gov/quickfacts/fact/table/VT,CT,MA,ME/PST045219) April 1, 2020 Census.

Because some states are very densely populated, while others are much more sparsely populated, I'll calculate confirmed Covid cases as a percentage of overall population. 

In order to do this, I'll divide by the appropriate state's total population (per the April 1, 2020 U.S. Census) and multiply by 100,000, which gives new cases per 100,000 residents (i.e. thousandths of a percent of the population). 

In [11]:
# Massachusetts population: 7,029,917 (April 1, 2020 Census).
# Dividing by population / 100,000 expresses cases per 100,000 residents.
df_ma3['new_case_percent_pop*'] = df_ma3['MA_Conf_Cases'] / (7_029_917 / 100_000)

In [12]:
# Connecticut population: 3,605,944 (April 1, 2020 Census).
# Dividing by population / 100,000 expresses cases per 100,000 residents.
df_ct3['new_case_percent_pop*'] = df_ct3['CT_Conf_Cases'] / (3_605_944 / 100_000)

In [13]:
# Maine population: 1,362,359 (April 1, 2020 Census).
# Dividing by population / 100,000 expresses cases per 100,000 residents.
df_me3['new_case_percent_pop*'] = df_me3['ME_Conf_Cases'] / (1_362_359 / 100_000)

In [14]:
# Vermont population: 643,077 (April 1, 2020 Census).
# Dividing by population / 100,000 expresses cases per 100,000 residents.
df_vt3['new_case_percent_pop*'] = df_vt3['VT_Conf_Cases'] / (643_077 / 100_000)

Try the model using new cases per 100,000 residents (thousandths of a percent of the population), rather than the raw count.

## Using the mean as a baseline prediction model

#### Massachusetts

In [15]:
# Features: every column except the raw case count, which is the target
X_ma = df_ma3.drop(columns='MA_Conf_Cases')
y_ma = df_ma3['MA_Conf_Cases']
# Hold out 20% of the rows for testing (fixed seed)
X_train_ma, X_test_ma, y_train_ma, y_test_ma = train_test_split(
    X_ma, y_ma, test_size=0.2, random_state=42
)
# Create and "train" a dummy regressor that always predicts the training mean
dummy_mean_ma = DummyRegressor(strategy='mean').fit(X_train_ma, y_train_ma)
# R-squared of the baseline on the held-out rows
score_mean_ma = dummy_mean_ma.score(X_test_ma, y_test_ma)
print("The R2 score of using the mean to predict Massachusetts' COVID19 cases is:", score_mean_ma)

The R2 score of using the mean to predict Massachusetts' COVID19 cases is: -0.053284810131488225


#### Connecticut

In [16]:
# Features: every column except the raw case count, which is the target
X_ct = df_ct3.drop(columns='CT_Conf_Cases')
y_ct = df_ct3['CT_Conf_Cases']
# Hold out 20% of the rows for testing (fixed seed)
X_train_ct, X_test_ct, y_train_ct, y_test_ct = train_test_split(
    X_ct, y_ct, test_size=0.2, random_state=42
)
# Create and "train" a dummy regressor that always predicts the training mean
dummy_mean_ct = DummyRegressor(strategy='mean').fit(X_train_ct, y_train_ct)
# R-squared of the baseline on the held-out rows
score_mean_ct = dummy_mean_ct.score(X_test_ct, y_test_ct)
print("The R2 score of using the mean to predict Connecticut's COVID19 cases is:", score_mean_ct)

The R2 score of using the mean to predict Connecticut's COVID19 cases is: -0.006702217631290974


#### Vermont

In [17]:
# Features: every column except the raw case count, which is the target
X_vt = df_vt3.drop(columns='VT_Conf_Cases')
y_vt = df_vt3['VT_Conf_Cases']
# Hold out 20% of the rows for testing (fixed seed)
X_train_vt, X_test_vt, y_train_vt, y_test_vt = train_test_split(
    X_vt, y_vt, test_size=0.2, random_state=42
)
# Create and "train" a dummy regressor that always predicts the training mean
dummy_mean_vt = DummyRegressor(strategy='mean').fit(X_train_vt, y_train_vt)
# R-squared of the baseline on the held-out rows
score_mean_vt = dummy_mean_vt.score(X_test_vt, y_test_vt)
print("The R2 score of using the mean to predict Vermont's COVID19 cases is:", score_mean_vt)

The R2 score of using the mean to predict Vermont's COVID19 cases is: -0.013915984644440993


#### Maine

In [18]:
# Features: every column except the raw case count, which is the target
X_me = df_me3.drop(columns='ME_Conf_Cases')
y_me = df_me3['ME_Conf_Cases']
# Hold out 20% of the rows for testing (fixed seed)
X_train_me, X_test_me, y_train_me, y_test_me = train_test_split(
    X_me, y_me, test_size=0.2, random_state=42
)
# Create and "train" a dummy regressor that always predicts the training mean
dummy_mean_me = DummyRegressor(strategy='mean').fit(X_train_me, y_train_me)
# R-squared of the baseline on the held-out rows
score_mean_me = dummy_mean_me.score(X_test_me, y_test_me)
print("The R2 score of using the mean to predict Maine's COVID19 cases is:", score_mean_me)

The R2 score of using the mean to predict Maine's COVID19 cases is: -0.10540068459291052


## Merge state dataframes back into one dataframe

In order to do this, first we'll make a separate column in each state's dataframe indicating state_id. With this new identifier column, we'll no longer need state-specific identifiers in the column names themselves, and we can update each state's column names to mirror each other in preparation for the merge. Before we do this, we can go ahead and drop the PRCP columns for each state, as precipitation has proved to have little correlation with (and even less predictive power of) COVID19 cases.

In [19]:
# Tag every row of each state's frame with a constant 'state_id' code
for state_frame, state_code in (
    (df_ma3, 'MA'),
    (df_ct3, 'CT'),
    (df_vt3, 'VT'),
    (df_me3, 'ME'),
):
    state_frame['state_id'] = state_code

In [20]:
# Prepare df_ma3 for the merge: remove precipitation and give the remaining
# state-prefixed columns their state-neutral names (both done in place).
ma_cols = dict([
    ('MA_Avg_Temp(F)', 'Avg_Temp(F)'),
    ('MA_Conf_Cases', 'Conf_Cases'),
])
df_ma3.drop(columns=['MA_PRCP(mm)'], inplace=True)
df_ma3.rename(columns=ma_cols, inplace=True)

In [21]:
# Prepare df_ct3 for the merge: remove precipitation and give the remaining
# state-prefixed columns their state-neutral names (both done in place).
ct_cols = dict([
    ('CT_Avg_Temp(F)', 'Avg_Temp(F)'),
    ('CT_Conf_Cases', 'Conf_Cases'),
])
df_ct3.drop(columns=['CT_PRCP(mm)'], inplace=True)
df_ct3.rename(columns=ct_cols, inplace=True)

In [22]:
# Prepare df_vt3 for the merge: remove precipitation and give the remaining
# state-prefixed columns their state-neutral names (both done in place).
vt_cols = dict([
    ('VT_Avg_Temp(F)', 'Avg_Temp(F)'),
    ('VT_Conf_Cases', 'Conf_Cases'),
])
df_vt3.drop(columns=['VT_PRCP(mm)'], inplace=True)
df_vt3.rename(columns=vt_cols, inplace=True)

In [23]:
# Prepare df_me3 for the merge: remove precipitation and give the remaining
# state-prefixed columns their state-neutral names (both done in place).
me_cols = dict([
    ('ME_Avg_Temp(F)', 'Avg_Temp(F)'),
    ('ME_Conf_Cases', 'Conf_Cases'),
])
df_me3.drop(columns=['ME_PRCP(mm)'], inplace=True)
df_me3.rename(columns=me_cols, inplace=True)

In [24]:
# Stack the four per-state frames row-wise, then round-trip 'date' through the
# index so the combined frame ends up with a fresh default row index.
stacked_states = pd.concat([df_ma3, df_me3, df_vt3, df_ct3], axis=0)
df = stacked_states.set_index('date').reset_index()

In [25]:
# Save the merged data to a new csv file (the row index is written as well)
df.to_csv(path_or_buf='COVID19_modeling.csv', index=True)

In [26]:
# Display the merged dataframe (pandas truncates the middle rows when rendering)
df

Unnamed: 0,date,Avg_Temp(F),Conf_Cases,day_of_week,day_of_year,Year,Month,Day,new_case_percent_pop*,state_id
0,2020-03-01,26.42,1.0,6,61,2020,3,1,0.014225,MA
1,2020-03-02,36.50,1.0,0,62,2020,3,2,0.014225,MA
2,2020-03-03,55.94,1.0,1,63,2020,3,3,0.014225,MA
3,2020-03-04,46.94,2.0,2,64,2020,3,4,0.028450,MA
4,2020-03-05,42.98,8.0,3,65,2020,3,5,0.113799,MA
...,...,...,...,...,...,...,...,...,...,...
2219,2021-09-08,70.16,935.0,2,251,2021,9,8,25.929410,CT
2220,2021-09-09,71.42,626.0,3,252,2021,9,9,17.360225,CT
2221,2021-09-10,66.92,625.0,4,253,2021,9,10,17.332493,CT
2222,2021-09-11,63.14,0.0,5,254,2021,9,11,0.000000,CT


## Moving average plots

In [None]:
# Disabled cell: the plotting loop below is wrapped in a string literal, so it is
# not executed. As written it would, for each state, plot 'new_case_percent_pop*'
# together with its rolling(7) mean using plotly express.
# NOTE(review): `subset` is a boolean-filtered slice of `df`; assigning a new column
# to it via .loc may raise SettingWithCopyWarning - consider `.copy()` if re-enabled.
'''
for i in df['state_id'].unique():
    subset = df[df['state_id'] == i]
    subset.loc[:,'moving_average'] = subset['new_case_percent_pop*'].rolling(7).mean()
    fig = px.line(subset, x="date", y=["new_case_percent_pop*","moving_average"], title = (i + ' weekly rolling average'), template = 'plotly_dark')
    fig.show()
    #plt.savefig(i)
'''

In [None]:
# Disabled cell: the plotting loop below is wrapped in a string literal, so it is
# not executed. As written it would, for each state, build a plotly figure of
# 'new_case_percent_pop*' together with its rolling(14) mean.
# NOTE(review): the window is 14 rows but the title still says 'weekly rolling
# average', and fig.show() is commented out so no figure would be displayed.
# NOTE(review): `subset` is a boolean-filtered slice of `df`; assigning a new column
# to it via .loc may raise SettingWithCopyWarning - consider `.copy()` if re-enabled.
'''
for i in df['state_id'].unique():
    subset = df[df['state_id'] == i]
    subset.loc[:,'moving_average'] = subset['new_case_percent_pop*'].rolling(14).mean()
    fig = px.line(subset, x="date", y=["new_case_percent_pop*","moving_average"], title = (i + ' weekly rolling average'), template = 'plotly_dark')
    #fig.show()
'''

## Create a baseline model for optimization

In [27]:
# Run a separate PyCaret regression experiment for every state and keep the
# top row of each model-comparison grid in `all_results`.
all_ts = df['state_id'].unique()

all_results = []

for i in all_ts:
    df_subset = df[df['state_id'] == i]

    # initialize setup from pycaret.regression
    s = setup(df_subset, target ='new_case_percent_pop*' , train_size = 0.8,
              data_split_shuffle = True, fold = 3,
              ignore_features = ['date', 'Conf_Cases'],
              numeric_features = ['day_of_year', 'Year', 'Avg_Temp(F)'],
              categorical_features = ['Month', 'day_of_week'],
              silent = True, verbose = False, session_id = 123, normalize=False)
    # compare all models and select the best one based on R2
    best_model = compare_models(sort = 'R2', verbose=False)

    # capture the top row of the compare result grid and store it in the list.
    # .copy() makes the one-row slice an independent frame, so adding the
    # 'time_series' column does not trigger pandas' SettingWithCopyWarning.
    p = pull().iloc[0:1].copy()
    p['time_series'] = str(i)
    all_results.append(p)

# NOTE: this call sits outside the loop, so it repeats the comparison for the
# last state only, this time with the results grid displayed.
best = compare_models(sort = 'R2')
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,5.4085,82.3826,8.9722,0.7989,0.5617,1.2713,0.2767
rf,Random Forest Regressor,5.4007,88.5067,9.2734,0.785,0.5154,0.8268,0.0833
gbr,Gradient Boosting Regressor,6.1456,94.8521,9.6102,0.7697,0.6479,1.8508,0.02
et,Extra Trees Regressor,5.6182,104.7353,10.1806,0.7377,0.4865,0.7529,0.0667
lightgbm,Light Gradient Boosting Machine,6.5687,117.5741,10.708,0.7138,0.6299,1.4978,0.09
xgboost,Extreme Gradient Boosting,6.1255,115.8177,10.748,0.7035,0.5549,1.5577,0.1967
dt,Decision Tree Regressor,6.399,138.5668,11.6107,0.6635,0.5451,0.6479,0.0067
lr,Linear Regression,8.1727,138.6516,11.6809,0.6586,0.8615,3.317,0.0067
br,Bayesian Ridge,8.1854,141.192,11.7737,0.6537,0.8714,3.283,0.0067
ridge,Ridge Regression,8.2038,142.5635,11.8263,0.6507,0.8718,3.2807,0.0067


<catboost.core.CatBoostRegressor at 0x7fde20dbc700>

In [28]:
# Stack the per-state top-model rows into one summary table and preview it
concat_results = pd.concat(all_results, axis='index')
concat_results.head(n=5)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),time_series
catboost,CatBoost Regressor,4.2306,48.0835,6.831,0.8989,0.4429,2.5155,0.3033,MA
catboost,CatBoost Regressor,3.1247,30.053,5.4149,0.7938,0.4447,0.9482,0.2967,ME
rf,Random Forest Regressor,2.4798,17.0064,4.1136,0.8164,0.3687,0.7144,0.0767,VT
catboost,CatBoost Regressor,5.4085,82.3826,8.9722,0.7989,0.5617,1.2713,0.3133,CT


***

## One-hot-encoding categorical column, 'state_id'

Most ML models can't handle categorical data stored as object types. We'll need to one-hot encode the only categorical variable in our dataset, `state_id`, before running our data through any ML models.

In [30]:
# One-hot encode 'state_id': it is replaced by one indicator column per state
state_dummy_cols = ['state_id']
one_hot_encoded_data = pd.get_dummies(data=df, columns=state_dummy_cols)

           date  Avg_Temp(F)  Conf_Cases  day_of_week  day_of_year  Year  \
0    2020-03-01        26.42         1.0            6           61  2020   
1    2020-03-02        36.50         1.0            0           62  2020   
2    2020-03-03        55.94         1.0            1           63  2020   
3    2020-03-04        46.94         2.0            2           64  2020   
4    2020-03-05        42.98         8.0            3           65  2020   
...         ...          ...         ...          ...          ...   ...   
2219 2021-09-08        70.16       935.0            2          251  2021   
2220 2021-09-09        71.42       626.0            3          252  2021   
2221 2021-09-10        66.92       625.0            4          253  2021   
2222 2021-09-11        63.14         0.0            5          254  2021   
2223 2021-09-12        69.44         0.0            6          255  2021   

      Month  Day  new_case_percent_pop*  state_id_CT  state_id_MA  \
0         3    1  

In [32]:
# Run a single PyCaret regression experiment on all states pooled together,
# using the one-hot-encoded state indicator columns as extra features.

# initialize setup from pycaret.regression
s = setup(one_hot_encoded_data, target ='new_case_percent_pop*', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Conf_Cases'],
            numeric_features = ['day_of_year', 'Year', 'Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week', 'state_id_MA', 'state_id_CT', 'state_id_VT', 'state_id_ME'],
            silent = True, verbose = False, session_id = 123, normalize=False)
# compare all models and select the best one based on R2
best_model = compare_models(sort = 'R2', verbose=False)

# capture the top row of the compare result grid.
# .copy() makes the one-row slice an independent frame, so adding the
# 'time_series' column does not trigger pandas' SettingWithCopyWarning.
p = pull().iloc[0:1].copy()
# Label the row as the pooled model. The previous `str(i)` relied on a loop
# variable left over from the per-state cell and mislabelled this result with
# the last state's id.
p['time_series'] = 'ALL'
# `all_results` is the list created by the per-state comparison cell.
all_results.append(p)
# Re-run the comparison with the results grid displayed
best = compare_models(sort = 'R2')
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,4.0767,51.4359,7.1543,0.828,0.5081,1.42,0.43
rf,Random Forest Regressor,3.9441,52.8238,7.2373,0.8233,0.4268,1.0654,0.13
et,Extra Trees Regressor,3.8059,54.6242,7.3398,0.8205,0.3953,0.7701,0.1067
lightgbm,Light Gradient Boosting Machine,4.2811,55.2234,7.4053,0.8149,0.5077,1.5072,0.2267
xgboost,Extreme Gradient Boosting,4.5415,63.319,7.9373,0.7888,0.5242,1.3935,0.22
gbr,Gradient Boosting Regressor,4.9521,67.3713,8.1814,0.776,0.6091,2.4909,0.0433
dt,Decision Tree Regressor,5.0958,98.2415,9.8295,0.6763,0.5202,0.7874,0.01
ridge,Ridge Regression,7.4452,116.5455,10.779,0.6101,0.8441,4.687,0.01
br,Bayesian Ridge,7.4453,116.5529,10.7793,0.6101,0.8439,4.6872,0.0133
lr,Linear Regression,7.4607,116.5554,10.7801,0.6099,0.8448,4.7224,0.0067


<catboost.core.CatBoostRegressor at 0x7fde51736e20>

In [37]:
# Save the one-hot-encoded data to a new csv file (the row index is written as well)
one_hot_encoded_data.to_csv(path_or_buf='ohe_data.csv', index=True)