# Pre-processing 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pycaret.regression import *
from statsmodels.tsa.stattools import adfuller
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pycaret
from IPython import display

In [2]:
# Load the combined weather/COVID table. The first CSV column is parsed as
# datetimes and the 'date' column becomes the index.
df = pd.read_csv('state_cov_weather2.csv', parse_dates=[0]).set_index('date')

In [3]:
# Preview the first rows to check the parsed date index and the per-state columns.
df.head()

Unnamed: 0_level_0,VT_Avg_Temp(F),VT_PRCP(mm),ME_Avg_Temp(F),ME_PRCP(mm),CT_Avg_Temp(F),CT_PRCP(mm),MA_Avg_Temp(F),MA_PRCP(mm),CT_Conf_Cases,VT_Conf_Cases,ME_Conf_Cases,MA_Conf_Cases,weekday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-03-01,19.58,0.3,14.18,0.3,26.24,0.0,26.42,0.0,0.0,0.0,0.0,1.0,6
2020-03-02,32.0,0.0,17.06,4.6,37.58,0.0,36.5,0.0,0.0,0.0,0.0,1.0,0
2020-03-03,44.06,2.8,39.56,0.0,51.26,3.8,55.94,0.0,0.0,0.0,0.0,1.0,1
2020-03-04,41.54,0.0,32.0,6.6,48.02,0.0,46.94,2.5,0.0,0.0,0.0,2.0,2
2020-03-05,37.04,0.0,30.02,0.0,43.16,0.0,42.98,0.0,0.0,0.0,0.0,8.0,3


In [4]:
# Vermont-only frame: drop every other state's columns plus 'weekday'.
df_vt = df.drop(columns=[prefix + field
                         for prefix in ('ME_', 'CT_', 'MA_')
                         for field in ('Avg_Temp(F)', 'PRCP(mm)', 'Conf_Cases')] + ['weekday'])

In [5]:
# Maine-only frame: drop every other state's columns plus 'weekday'.
df_me = df.drop(columns=[prefix + field
                         for prefix in ('VT_', 'CT_', 'MA_')
                         for field in ('Avg_Temp(F)', 'PRCP(mm)', 'Conf_Cases')] + ['weekday'])

In [6]:
# Massachusetts-only frame: drop every other state's columns plus 'weekday'.
df_ma = df.drop(columns=[prefix + field
                         for prefix in ('ME_', 'CT_', 'VT_')
                         for field in ('Avg_Temp(F)', 'PRCP(mm)', 'Conf_Cases')] + ['weekday'])

In [7]:
# Connecticut-only frame: drop every other state's columns plus 'weekday'.
df_ct = df.drop(columns=[prefix + field
                         for prefix in ('ME_', 'VT_', 'MA_')
                         for field in ('Avg_Temp(F)', 'PRCP(mm)', 'Conf_Cases')] + ['weekday'])

***

In [8]:
# Separate working frames for PyCaret: reset_index() returns a new frame in
# which 'date' is an ordinary column instead of the index.
df_vt3, df_ct3, df_me3, df_ma3 = (
    frame.reset_index() for frame in (df_vt, df_ct, df_me, df_ma)
)

In [9]:
# The four per-state working frames, collected so the same feature engineering
# can be applied to each of them in one loop.
df_list = [df_ct3, df_ma3, df_me3, df_vt3]

In [10]:
# Add calendar features derived from 'date' to every state dataframe:
# 'day_of_week' (Monday=0 ... Sunday=6), 'day_of_year', 'Year', 'Month', 'Day'.
# The loop variable is deliberately not named `df`, so the combined frame
# loaded at the top of the notebook is not rebound to the last state frame.
for state_df in df_list:
    dates = state_df['date'].dt  # vectorised datetime accessor (no per-row Python loop)
    state_df['day_of_week'] = dates.dayofweek
    state_df['day_of_year'] = dates.dayofyear
    state_df['Year'] = dates.year
    state_df['Month'] = dates.month
    state_df['Day'] = dates.day

### US Census Bureau state populations
* Massachusetts: 7,029,917 
* Vermont: 643,077	
* Connecticut: 3,605,944	
* Maine: 1,362,359

Source: [U.S. Census Bureau](https://www.census.gov/quickfacts/fact/table/VT,CT,MA,ME/PST045219) April 1, 2020 Census.

Because some states are very densely populated, while others are much more sparsely populated, I'll calculate confirmed Covid cases as a percentage of overall population. 

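In order to do this, I'll divide by the appropriate state's total population (as per the April 1, 2020 U.S. Census Bureau count) and multiply by 100,000, which gives new cases per 100,000 residents (equivalently, thousandths of a percent of the population). In the code this is done in one step by dividing the case count by population / 100,000 (e.g. 70.29917 for Massachusetts).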

In [11]:
# Massachusetts: new cases per 100,000 residents.
# Population 7,029,917 (April 1, 2020 Census); 7,029,917 / 100,000 == 70.29917.
df_ma3['new_case_percent_pop*'] = df_ma3['MA_Conf_Cases'].div(7_029_917 / 100_000)

In [12]:
# Connecticut: new cases per 100,000 residents.
# Population 3,605,944 (April 1, 2020 Census); 3,605,944 / 100,000 == 36.05944.
df_ct3['new_case_percent_pop*'] = df_ct3['CT_Conf_Cases'].div(3_605_944 / 100_000)

In [13]:
# Maine: new cases per 100,000 residents.
# Population 1,362,359 (April 1, 2020 Census); 1,362,359 / 100,000 == 13.62359.
df_me3['new_case_percent_pop*'] = df_me3['ME_Conf_Cases'].div(1_362_359 / 100_000)

In [14]:
# Vermont: new cases per 100,000 residents.
# Population 643,077 (April 1, 2020 Census); 643,077 / 100,000 == 6.43077.
df_vt3['new_case_percent_pop*'] = df_vt3['VT_Conf_Cases'].div(643_077 / 100_000)

Try the model using new cases per 100,000 residents (thousandths of a percent of the population), rather than the raw count.

## Using the mean as a baseline prediction model

#### Massachusetts

In [15]:
# Target is the raw daily case count; all remaining columns form the feature matrix
y_ma = df_ma3['MA_Conf_Cases']
X_ma = df_ma3.drop(columns='MA_Conf_Cases')
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train_ma, X_test_ma, y_train_ma, y_test_ma = train_test_split(
    X_ma, y_ma, test_size=0.2, random_state=42)
# Baseline that always predicts the mean of the training target
dummy_mean_ma = DummyRegressor(strategy='mean').fit(X_train_ma, y_train_ma)
# R-squared of that constant prediction on the held-out rows
score_mean_ma = r2_score(y_test_ma, dummy_mean_ma.predict(X_test_ma))
print("The R2 score of using the mean to predict Massachusetts' COVID19 cases is:", score_mean_ma)

The R2 score of using the mean to predict Massachusetts' COVID19 cases is: -0.053284810131488225


#### Connecticut

In [16]:
# Target is the raw daily case count; all remaining columns form the feature matrix
y_ct = df_ct3['CT_Conf_Cases']
X_ct = df_ct3.drop(columns='CT_Conf_Cases')
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train_ct, X_test_ct, y_train_ct, y_test_ct = train_test_split(
    X_ct, y_ct, test_size=0.2, random_state=42)
# Baseline that always predicts the mean of the training target
dummy_mean_ct = DummyRegressor(strategy='mean').fit(X_train_ct, y_train_ct)
# R-squared of that constant prediction on the held-out rows
score_mean_ct = r2_score(y_test_ct, dummy_mean_ct.predict(X_test_ct))
print("The R2 score of using the mean to predict Connecticut's COVID19 cases is:", score_mean_ct)

The R2 score of using the mean to predict Connecticut's COVID19 cases is: -0.006702217631290974


#### Vermont

In [17]:
# Target is the raw daily case count; all remaining columns form the feature matrix
y_vt = df_vt3['VT_Conf_Cases']
X_vt = df_vt3.drop(columns='VT_Conf_Cases')
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train_vt, X_test_vt, y_train_vt, y_test_vt = train_test_split(
    X_vt, y_vt, test_size=0.2, random_state=42)
# Baseline that always predicts the mean of the training target
dummy_mean_vt = DummyRegressor(strategy='mean').fit(X_train_vt, y_train_vt)
# R-squared of that constant prediction on the held-out rows
score_mean_vt = r2_score(y_test_vt, dummy_mean_vt.predict(X_test_vt))
print("The R2 score of using the mean to predict Vermont's COVID19 cases is:", score_mean_vt)

The R2 score of using the mean to predict Vermont's COVID19 cases is: -0.013915984644440993


#### Maine

In [18]:
# Target is the raw daily case count; all remaining columns form the feature matrix
y_me = df_me3['ME_Conf_Cases']
X_me = df_me3.drop(columns='ME_Conf_Cases')
# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
X_train_me, X_test_me, y_train_me, y_test_me = train_test_split(
    X_me, y_me, test_size=0.2, random_state=42)
# Baseline that always predicts the mean of the training target
dummy_mean_me = DummyRegressor(strategy='mean').fit(X_train_me, y_train_me)
# R-squared of that constant prediction on the held-out rows
score_mean_me = r2_score(y_test_me, dummy_mean_me.predict(X_test_me))
print("The R2 score of using the mean to predict Maine's COVID19 cases is:", score_mean_me)

The R2 score of using the mean to predict Maine's COVID19 cases is: -0.10540068459291052


## Correlation between COVID cases and lagged temperature values

We know that there is a period of incubation between the moment of COVID-19 infection and the manifestation of symptoms, so it might be reasonable to expect a higher correlation between average temperatures and some lagged value of COVID-19 cases. If COVID-19 infection rates truly are correlated with temperature, cases that got tested *because* they manifested symptoms, would likely demonstrate some lag with the temperature on the day they were infected. On the other hand, those cases that get tested regularly (for work, medical reasons, etc.), may not demonstrate much of a lag, if any at all, with the temperature on the day they were infected. It would be worthwhile to explore lagged values of COVID cases to determine if there isn't a higher correlation with a lagged value of cases than with the current value of COVID-19 cases. 

#### Massachusetts temperature lag cross-correlation

In [None]:
# NOTE: plain assignment binds a second name to the SAME DataFrame object; it
# does not copy. Columns added to df_ma4 in place also appear on df_ma3.
df_ma4=df_ma3

In [None]:
# Lag label -> shift in days. A negative shift moves later observations up, so
# row t holds the case count reported |shift| days after t.
temp_cov_lags = {'1_wk': -7, '1.5_wks': -10, '2_wks': -14, '3_wks': -21, '4_wks': -28}
for key, value in temp_cov_lags.items():
    # Added in place on purpose: a later cell removes these lag columns from
    # the state frame again.
    df_ma4[key] = df_ma4['MA_Conf_Cases'].shift(value, axis=0)
# Keep only temperature and case columns. 'date' is dropped explicitly so that
# corr() receives numeric columns only, instead of relying on pandas silently
# discarding the datetime column. Rows without a full set of lags are removed.
df_ma4 = df_ma4.drop(['date', 'MA_PRCP(mm)', 'day_of_week', 'day_of_year', 'Year', 'Month', 'Day'], axis =1).dropna()
ma_corr = df_ma4.corr()
ma_corr

In [None]:
# Reduce the correlation matrix to the case/lag columns and relabel the
# unshifted case column as the zero-week lag.
ma_corr = (ma_corr
           .drop(columns=['MA_Avg_Temp(F)', 'new_case_percent_pop*'])
           .rename(columns={'MA_Conf_Cases': '0_wks'}))
# The temperature row holds temperature's correlation with each lag.
ax = ma_corr.loc['MA_Avg_Temp(F)'].plot()
ax.set_title('Correlation of temp vs lagged COV cases: MA')
ax.set_xlabel('Lag time')
ax.set_ylabel('Correlation with average temperature')
plt.show()

#### Connecticut temperature lag cross-correlation

In [None]:
# NOTE: plain assignment binds a second name to the SAME DataFrame object; it
# does not copy. Columns added to df_ct4 in place also appear on df_ct3.
df_ct4=df_ct3

In [None]:
# Add one shifted copy of the case counts per entry of temp_cov_lags (defined
# in the Massachusetts cell). Added in place on purpose: a later cell removes
# these lag columns from the state frame again.
for key, value in temp_cov_lags.items():
    df_ct4[key] = df_ct4['CT_Conf_Cases'].shift(value, axis=0)
# Keep only temperature and case columns. 'date' is dropped explicitly so that
# corr() receives numeric columns only, instead of relying on pandas silently
# discarding the datetime column. Rows without a full set of lags are removed.
df_ct4 = df_ct4.drop(['date', 'CT_PRCP(mm)', 'day_of_week', 'day_of_year', 'Year', 'Month', 'Day'], axis =1).dropna()
ct_corr = df_ct4.corr()
ct_corr

In [None]:
# Reduce the correlation matrix to the case/lag columns and relabel the
# unshifted case column as the zero-week lag.
ct_corr.drop(columns=['CT_Avg_Temp(F)', 'new_case_percent_pop*'], inplace=True)
ct_corr.rename(columns={'CT_Conf_Cases': '0_wks'}, inplace=True)
# The temperature row holds temperature's correlation with each lag.
ct_corr.loc['CT_Avg_Temp(F)', :].plot()
plt.title('Correlation of temp vs lagged COV cases: CT')
plt.xlabel('Lag time')
plt.ylabel('Correlation with average temperature')
# plt.show() renders the figure like the other state cells; the previous
# plt.plot() call drew nothing and only echoed an empty list as cell output.
plt.show()

#### Vermont temperature lag cross-correlation

In [None]:
# NOTE: plain assignment binds a second name to the SAME DataFrame object; it
# does not copy. Columns added to df_vt4 in place also appear on df_vt3.
df_vt4=df_vt3

In [None]:
# Add one shifted copy of the case counts per entry of temp_cov_lags (defined
# in the Massachusetts cell). Added in place on purpose: a later cell removes
# these lag columns from the state frame again.
for key, value in temp_cov_lags.items():
    df_vt4[key] = df_vt4['VT_Conf_Cases'].shift(value, axis=0)
# Keep only temperature and case columns. 'date' is dropped explicitly so that
# corr() receives numeric columns only, instead of relying on pandas silently
# discarding the datetime column. Rows without a full set of lags are removed.
df_vt4 = df_vt4.drop(['date', 'VT_PRCP(mm)', 'day_of_week', 'day_of_year', 'Year', 'Month', 'Day'], axis =1).dropna()
vt_corr = df_vt4.corr()
vt_corr

In [None]:
# Reduce the correlation matrix to the case/lag columns and relabel the
# unshifted case column as the zero-week lag.
vt_corr = (vt_corr
           .drop(columns=['VT_Avg_Temp(F)', 'new_case_percent_pop*'])
           .rename(columns={'VT_Conf_Cases': '0_wks'}))
# The temperature row holds temperature's correlation with each lag.
ax = vt_corr.loc['VT_Avg_Temp(F)'].plot()
ax.set_title('Correlation of temp vs lagged COV cases: VT')
ax.set_xlabel('Lag time')
ax.set_ylabel('Correlation with average temperature')
plt.show()

#### Maine temperature lag cross-correlation

In [None]:
# NOTE: plain assignment binds a second name to the SAME DataFrame object; it
# does not copy. Columns added to df_me4 in place also appear on df_me3.
df_me4=df_me3

In [None]:
# Add one shifted copy of the case counts per entry of temp_cov_lags (defined
# in the Massachusetts cell). Added in place on purpose: a later cell removes
# these lag columns from the state frame again.
for key, value in temp_cov_lags.items():
    df_me4[key] = df_me4['ME_Conf_Cases'].shift(value, axis=0)
# Keep only temperature and case columns. 'date' is dropped explicitly so that
# corr() receives numeric columns only, instead of relying on pandas silently
# discarding the datetime column. Rows without a full set of lags are removed.
df_me4 = df_me4.drop(['date', 'ME_PRCP(mm)', 'day_of_week', 'day_of_year', 'Year', 'Month', 'Day'], axis =1).dropna()
me_corr = df_me4.corr()
me_corr

In [None]:
# Reduce the correlation matrix to the case/lag columns and relabel the
# unshifted case column as the zero-week lag.
me_corr = (me_corr
           .drop(columns=['ME_Avg_Temp(F)', 'new_case_percent_pop*'])
           .rename(columns={'ME_Conf_Cases': '0_wks'}))
# The temperature row holds temperature's correlation with each lag.
ax = me_corr.loc['ME_Avg_Temp(F)'].plot()
ax.set_title('Correlation of temp vs lagged COV cases: ME')
ax.set_xlabel('Lag time')
ax.set_ylabel('Correlation with average temperature')
plt.show()

Maine seems to have the strongest negative correlation between temperature and a 1.5 week lag of COVID-19 cases. Vermont seems to have the strongest negative correlations with current cases, as well as those lagged by two weeks. However, Massachusetts and Connecticut both very clearly have the strongest correlation between temperature and lag 0 (or, current) case values. While it seems that for some states there may be a strong correlation with lagged COVID case values, since 50-75% of the data don't show any increased correlation using lags (and the remaining 25-50% of data disagree on an optimal lagged value), let's stick with our original lag 0 COVID-19 case values.  

In [None]:
# Names of the temporary lag columns (the same labels used as keys of
# temp_cov_lags) and the state frames they are removed from in the next cell.
lag_cols = ['1_wk', '1.5_wks', '2_wks', '3_wks', '4_wks']
df_list = [df_ma3, df_ct3, df_me3, df_vt3]

In [None]:
# Remove the temporary lag columns from each state dataframe.
# errors='ignore' keeps this cell safe to run when the lag columns are absent
# (for example if the lag-correlation cells were skipped, or on a re-run).
# The loop variable is not named `df`, so the main frame is not rebound.
for state_df in df_list:
    state_df.drop(columns=lag_cols, inplace=True, errors='ignore')

## Merge state dataframes back into one dataframe

In order to do this, first we'll make a separate column in each state's dataframe indicating state_id. With this new identifier column, we'll no longer need state-specific identifiers in the column names themselves, and we can update each state's column names to mirror each other in preparation for the merge. Although precipitation has proved to have little correlation with (and even less predictive power of) COVID19 cases, this version of the notebook keeps the PRCP columns and renames them along with the others rather than dropping them.

In [19]:
# Tag every row with its two-letter state code so the frames can be stacked
# into a single table and still be told apart.
for state_code, state_frame in (('MA', df_ma3), ('CT', df_ct3), ('VT', df_vt3), ('ME', df_me3)):
    state_frame['state_id'] = state_code

In [20]:
# Prepare df_ma3 for the merge: strip the 'MA_' prefix so its column names
# match the other states (the precipitation column is kept and renamed too).
ma_cols = {'MA_' + name: name for name in ('Avg_Temp(F)', 'Conf_Cases', 'PRCP(mm)')}
df_ma3.rename(columns=ma_cols, inplace=True)

In [21]:
# Prepare df_ct3 for the merge: strip the 'CT_' prefix so its column names
# match the other states (the precipitation column is kept and renamed too).
ct_cols = {'CT_' + name: name for name in ('Avg_Temp(F)', 'Conf_Cases', 'PRCP(mm)')}
df_ct3.rename(columns=ct_cols, inplace=True)

In [22]:
# Prepare df_vt3 for the merge: strip the 'VT_' prefix so its column names
# match the other states (the precipitation column is kept and renamed too).
vt_cols = {'VT_' + name: name for name in ('Avg_Temp(F)', 'Conf_Cases', 'PRCP(mm)')}
df_vt3.rename(columns=vt_cols, inplace=True)

In [23]:
# Prepare df_me3 for the merge: strip the 'ME_' prefix so its column names
# match the other states (the precipitation column is kept and renamed too).
me_cols = {'ME_' + name: name for name in ('Avg_Temp(F)', 'Conf_Cases', 'PRCP(mm)')}
df_me3.rename(columns=me_cols, inplace=True)

In [24]:
# Stack the four state frames row-wise. ignore_index=True gives the result a
# fresh 0..N-1 row index; 'date' is already the first column of every frame.
df = pd.concat([df_ma3, df_me3, df_vt3, df_ct3], axis=0, ignore_index=True)

In [25]:
# Save the combined, state-tagged data to a new CSV file.
# NOTE(review): no index= argument is passed, so pandas' default applies and the
# row index is presumably written as an extra unnamed first column. Confirm
# that downstream readers expect it.
df.to_csv('COVID19_modeling_PRCP.csv')

In [26]:
# Display the combined dataframe (all four states stacked, one row per state and date).
df

Unnamed: 0,date,Avg_Temp(F),PRCP(mm),Conf_Cases,day_of_week,day_of_year,Year,Month,Day,new_case_percent_pop*,state_id
0,2020-03-01,26.42,0.0,1.0,6,61,2020,3,1,0.014225,MA
1,2020-03-02,36.50,0.0,1.0,0,62,2020,3,2,0.014225,MA
2,2020-03-03,55.94,0.0,1.0,1,63,2020,3,3,0.014225,MA
3,2020-03-04,46.94,2.5,2.0,2,64,2020,3,4,0.028450,MA
4,2020-03-05,42.98,0.0,8.0,3,65,2020,3,5,0.113799,MA
...,...,...,...,...,...,...,...,...,...,...,...
2219,2021-09-08,70.16,0.0,935.0,2,251,2021,9,8,25.929410,CT
2220,2021-09-09,71.42,3.8,626.0,3,252,2021,9,9,17.360225,CT
2221,2021-09-10,66.92,0.0,625.0,4,253,2021,9,10,17.332493,CT
2222,2021-09-11,63.14,0.0,0.0,5,254,2021,9,11,0.000000,CT


## Moving average plots

In [None]:
# Disabled code kept as a string literal for reference: for each state it adds
# a 7-day rolling mean of 'new_case_percent_pop*' and draws both series with
# plotly. Because it is a string, nothing here is executed.
'''
for i in df['state_id'].unique():
    subset = df[df['state_id'] == i]
    subset.loc[:,'moving_average'] = subset['new_case_percent_pop*'].rolling(7).mean()
    fig = px.line(subset, x="date", y=["new_case_percent_pop*","moving_average"], title = (i + ' weekly rolling average'), template = 'plotly_dark')
    fig.show()
    #plt.savefig(i)
    #plt.show()
'''

In [None]:
# Render the static image file for Massachusetts (presumably an export of the
# weekly rolling-average plot; confirm).
display.Image("MA_weekly_rolling_average_cases.png")

In [None]:
# Render the static image file for Maine (presumably an export of the weekly
# rolling-average plot; confirm).
display.Image("ME_weekly_rolling_average_cases.png")

In [None]:
# Render the static image file for Vermont (presumably an export of the weekly
# rolling-average plot; confirm).
display.Image("VT_weekly_rolling_average_cases.png")

In [None]:
# Render the static image file for Connecticut (presumably an export of the
# weekly rolling-average plot; confirm).
display.Image("CT_weekly_rolling_average_cases.png")

In [None]:
# Disabled code kept as a string literal for reference: 14-day rolling-mean
# variant of the per-state plotly charts. The chart title says "two-week" so
# that it matches the 14-day window. Because it is a string, nothing here is
# executed.
'''
for i in df['state_id'].unique():
    subset = df[df['state_id'] == i]
    subset.loc[:,'moving_average'] = subset['new_case_percent_pop*'].rolling(14).mean()
    fig = px.line(subset, x="date", y=["new_case_percent_pop*","moving_average"], title = (i + ' two-week rolling average'), template = 'plotly_dark')
    fig.show()
'''

## Create a baseline model for optimization

In [27]:
# Fit one PyCaret baseline comparison per state and record each state's
# top-ranked model (ranked by R2).
all_ts = df['state_id'].unique()

all_results = []

# NOTE: the loop variable `i` is read again by later cells, so its name is kept.
for i in all_ts:
    df_subset = df[df['state_id'] == i]

    # initialize setup from pycaret.regression
    s = setup(df_subset, target ='new_case_percent_pop*' , train_size = 0.8,
              data_split_shuffle = True, fold = 3,
              ignore_features = ['date', 'Conf_Cases'],
              numeric_features = ['day_of_year', 'Year', 'Avg_Temp(F)', 'PRCP(mm)'],
              categorical_features = ['Month', 'day_of_week'],
              silent = True, verbose = False, session_id = 123, normalize=False)
    # compare all models and select the best one based on R2
    best_model = compare_models(sort = 'R2', verbose=False)

    # capture the top row of the comparison grid; .copy() makes it an
    # independent frame so adding the label column does not trigger pandas'
    # SettingWithCopyWarning
    p = pull().iloc[0:1].copy()
    p['time_series'] = str(i)
    all_results.append(p)
# Run the comparison once more with output enabled to display the full grid;
# the active setup() at this point is the one for the last state in all_ts.
best = compare_models(sort = 'R2')
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,5.5603,86.5185,9.209,0.7878,0.5745,1.4644,0.2867
rf,Random Forest Regressor,5.3958,87.9785,9.2383,0.7867,0.5187,0.83,0.0733
gbr,Gradient Boosting Regressor,6.1777,97.8087,9.7578,0.7627,0.6435,1.8704,0.0267
et,Extra Trees Regressor,5.7417,113.3245,10.5381,0.7207,0.497,0.7767,0.0633
xgboost,Extreme Gradient Boosting,6.0259,110.454,10.4877,0.7174,0.5672,1.5433,0.2233
lightgbm,Light Gradient Boosting Machine,6.6946,121.6435,10.8744,0.7047,0.6252,1.5284,0.0867
lr,Linear Regression,8.1348,138.595,11.6781,0.6588,0.857,3.3243,0.01
br,Bayesian Ridge,8.1589,141.3282,11.7775,0.6535,0.869,3.2871,0.0067
ridge,Ridge Regression,8.1756,142.4923,11.822,0.651,0.8684,3.2857,0.0067
dt,Decision Tree Regressor,6.6025,147.1341,12.0002,0.6408,0.5876,0.6789,0.01


<catboost.core.CatBoostRegressor at 0x7fd9d0246a30>

In [28]:
# Stack the per-state winning rows into one table and preview it
concat_results = pd.concat(all_results)
concat_results.head()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec),time_series
catboost,CatBoost Regressor,4.4432,54.3323,7.2719,0.8858,0.4553,2.0923,0.3767,MA
catboost,CatBoost Regressor,3.317,32.6158,5.6439,0.7759,0.4651,1.0744,0.2933,ME
rf,Random Forest Regressor,2.4548,16.7262,4.0801,0.8195,0.3689,0.7073,0.0733,VT
catboost,CatBoost Regressor,5.5603,86.5185,9.209,0.7878,0.5745,1.4644,0.3,CT


***

## One-hot-encoding categorical column, 'state_id'

Most ML models can't handle categorical data stored as object (string) types. We'll need to one-hot encode `state_id`, the only string-valued column in our dataset, before running our data through any ML models.

In [29]:
# Replace 'state_id' with one indicator column per state
one_hot_encoded_data = pd.get_dummies(data=df, columns=['state_id'])

In [30]:
# Display the encoded frame to check the new state_id_* indicator columns.
one_hot_encoded_data

Unnamed: 0,date,Avg_Temp(F),PRCP(mm),Conf_Cases,day_of_week,day_of_year,Year,Month,Day,new_case_percent_pop*,state_id_CT,state_id_MA,state_id_ME,state_id_VT
0,2020-03-01,26.42,0.0,1.0,6,61,2020,3,1,0.014225,0,1,0,0
1,2020-03-02,36.50,0.0,1.0,0,62,2020,3,2,0.014225,0,1,0,0
2,2020-03-03,55.94,0.0,1.0,1,63,2020,3,3,0.014225,0,1,0,0
3,2020-03-04,46.94,2.5,2.0,2,64,2020,3,4,0.028450,0,1,0,0
4,2020-03-05,42.98,0.0,8.0,3,65,2020,3,5,0.113799,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2219,2021-09-08,70.16,0.0,935.0,2,251,2021,9,8,25.929410,1,0,0,0
2220,2021-09-09,71.42,3.8,626.0,3,252,2021,9,9,17.360225,1,0,0,0
2221,2021-09-10,66.92,0.0,625.0,4,253,2021,9,10,17.332493,1,0,0,0
2222,2021-09-11,63.14,0.0,0.0,5,254,2021,9,11,0.000000,1,0,0,0


In [31]:
# Pooled model: all four states in one dataset, with state one-hot encoded.
s = setup(one_hot_encoded_data, target ='new_case_percent_pop*', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Conf_Cases'],
            numeric_features = ['day_of_year', 'Year', 'Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week', 'state_id_MA', 'state_id_CT', 'state_id_VT', 'state_id_ME'],
            silent = True, verbose = False, session_id = 123, normalize=False)
# Compare all models once, ranked by R2. The grid is displayed by this call, so
# a separate silent run beforehand is not needed.
best = compare_models(sort = 'R2')
best_model = best

# Capture the top row of the comparison grid. .copy() avoids assigning into a
# slice, and the row is labelled as the pooled run instead of reusing the loop
# variable left over from the per-state cell (which named a single state).
p = pull().iloc[0:1].copy()
p['time_series'] = 'all_states'
all_results.append(p)
best

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,4.1276,53.1296,7.2686,0.8217,0.5139,1.5909,0.3867
et,Extra Trees Regressor,3.8163,54.3522,7.3203,0.8215,0.3946,0.7792,0.1
rf,Random Forest Regressor,4.0274,55.8256,7.4439,0.8128,0.4317,1.0925,0.1167
lightgbm,Light Gradient Boosting Machine,4.4054,57.538,7.5667,0.8066,0.5269,1.6681,0.4333
xgboost,Extreme Gradient Boosting,4.6518,66.2809,8.1212,0.779,0.5351,1.5308,0.24
gbr,Gradient Boosting Regressor,5.0253,68.5794,8.2497,0.7726,0.6128,2.5169,0.0467
dt,Decision Tree Regressor,5.341,107.4004,10.3083,0.6446,0.5347,0.8756,0.01
ridge,Ridge Regression,7.4483,116.5412,10.779,0.6101,0.8436,4.6809,0.01
br,Bayesian Ridge,7.448,116.5545,10.7795,0.6101,0.8434,4.6798,0.0067
lr,Linear Regression,7.463,116.5451,10.7799,0.6099,0.8435,4.7164,0.01


<catboost.core.CatBoostRegressor at 0x7fd9f98f5a00>

In [32]:
# Save the state-encoded data to a new CSV file.
# NOTE(review): no index= argument is passed, so the row index is presumably
# written as an extra unnamed first column. Confirm downstream readers expect it.
one_hot_encoded_data.to_csv('ohe_data_PRCP.csv')

## One-hot-encode all categorical data

In [None]:
# Replace 'state_id', 'Month' and 'day_of_week' with indicator columns
one_hot_encoded_data3 = pd.get_dummies(data=df, columns=['state_id', 'Month', 'day_of_week'])

In [None]:
# List the column names produced by the encoding (used to fill in
# categorical_features in the next cell).
one_hot_encoded_data3.columns

In [None]:
# Pooled model: all four states, with state, month and weekday one-hot encoded.
s = setup(one_hot_encoded_data3, target ='new_case_percent_pop*', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Conf_Cases'],
            numeric_features = ['day_of_year', 'Year', 'Avg_Temp(F)'],
            categorical_features = ['state_id_CT', 'state_id_MA', 'state_id_ME',
                                       'state_id_VT', 'Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5',
                                       'Month_6', 'Month_7', 'Month_8', 'Month_9', 'Month_10', 'Month_11',
                                       'Month_12', 'day_of_week_0', 'day_of_week_1', 'day_of_week_2',
                                       'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6'],
            silent = True, verbose = False, session_id = 123, normalize=False)
# Compare all models once, ranked by R2. The grid is displayed by this call, so
# a separate silent run beforehand is not needed.
best = compare_models(sort = 'R2')
best_model = best

# Capture the top row of the comparison grid. .copy() avoids assigning into a
# slice, and the row is labelled as this pooled run instead of reusing the loop
# variable left over from the per-state cell (which named a single state).
p = pull().iloc[0:1].copy()
p['time_series'] = 'all_states_full_ohe'
all_results.append(p)
best

In [None]:
# Save the fully one-hot-encoded data to a new CSV file.
# NOTE(review): no index= argument is passed, so the row index is presumably
# written as an extra unnamed first column. Confirm downstream readers expect it.
one_hot_encoded_data3.to_csv('ohe_data2.csv')

*****

In [None]:
# Pooled model on the state-encoded data again, this time with normalize=True.
s = setup(one_hot_encoded_data, target ='new_case_percent_pop*', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Conf_Cases'],
            numeric_features = ['day_of_year', 'Year', 'Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week', 'state_id_MA', 'state_id_CT', 'state_id_VT', 'state_id_ME'],
            silent = True, verbose = False, session_id = 123, normalize=True)
# Compare all models once, ranked by R2. The grid is displayed by this call, so
# a separate silent run beforehand is not needed.
best = compare_models(sort = 'R2')
best_model = best

# Capture the top row of the comparison grid. .copy() avoids assigning into a
# slice, and the row is labelled as this pooled run instead of reusing the loop
# variable left over from the per-state cell (which named a single state).
p = pull().iloc[0:1].copy()
p['time_series'] = 'all_states_normalized'
all_results.append(p)
best