# Extra project code

In [None]:
df_me3['pop_per_100sqmi'] = 1362359/30843
df_ma3['pop_per_100sqmi'] = 7029917/7800
df_ct3['pop_per_100sqmi'] = 3605944/4842
df_vt3['pop_per_100sqmi'] = 643077/9217

In [None]:
from pycaret.regression import *
all_results=[]
df_subset = df_ma3

# initialize setup from pycaret.regression
s = setup(df_subset, target = 'new_case_percent_pop*', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'MA_Conf_Cases', 'MA_PRCP(mm)'],
            numeric_features = ['day_of_year', 'Year', 'MA_Avg_Temp(F)', 'pop_per_100sqmi'],
            categorical_features = ['Month', 'day_of_week'],
            silent = True, verbose = False, session_id = 123,
            normalize=False)
    
# compare all models and select best one based on MAE
best_model = compare_models(sort = 'R2', verbose=False)
    
# capture the compare result grid and store best model in list
p = pull().iloc[0:1]
p['time_series'] = str("MA")
all_results.append(p)
    
# finalize model i.e. fit on entire data including test set
f = finalize_model(best_model)
best_model

From the above data, it looks like [Massachusetts](https://www.mass.gov/info-details/covid-19-response-reporting) and [Connecticut](https://www.mass.gov/info-details/covid-19-response-reporting) have similar Monday-Friday reporting schedules. It also seems that [Maine](https://www.maine.gov/dhhs/mecdc/infectious-disease/epi/airborne/coronavirus/data.shtml) reports cases Tuesday-Saturday and Vermont may report cases 7 days a week.

From the data above, it would seem Vermont reports cases every day, but according to this [Associated Press article](https://apnews.com/article/health-coronavirus-pandemic-vermont-c781aa063d30e8f665500deaf8902ab9), Vermont only began reporting cases daily as of 2021-08-23, due to a large surge in cases. Prior to 2021-08-23, [Vermont](https://www.healthvermont.gov/covid-19) was also reporting cases Monday-Friday (like Connecticut and Massachusetts). Let's do a quick check and plot our data to make sure our data reflects this.

In [None]:
(https://www.mass.gov/info-details/covid-19-response-reporting)
(https://www.mass.gov/info-details/covid-19-response-reporting)
(https://www.maine.gov/dhhs/mecdc/infectious-disease/epi/airborne/coronavirus/data.shtml) 

We can see that the last time Vermont reported 0 cases was 2021-08-16. The above zero values all correspond to weekend days (and one Monday holiday), so we can confirmed that Vermont had the same reporting schedule as Massachusetts and Connecticut prior to 2021-08-23 (after which Vermont reported cases 7 days a week).

Each respective state government webpage also indicates that on the day following a two-day weekend period (whether Saturday-Sunday or Sunday-Monday) the value reported is the cumulative sum of weekend cases and the the following weekday's cases (in other words, three days' worth of cases). Please see hyperlinks above for more details.

We could leave all zero case counts as is, but this would skew the data. We could also drop all weekend values, but this isn't ideal either. If we drop all Saturday-Sunday periods, we're losing Maine's Saturday data and Vermont's Saturday-Sunday data from 2021-08-23 onwards. Additionally, Maine will still have missing values every Monday. There are three different reporting schedules, so we would be losing a lot of useful data if we dropped an entire row each time a state had a "non-reporting" day. Because the Monday (or Tuesday, in the case of Maine) case count is a cumulative sum of Saturday-Monday (or, in the case of Maine, Sunday-Tuesday), those cumulative Monday (or, Tuesday) counts will also skew the data.

A much better alternative would be to take the Monday (or Tuesday, in the case of Maine) cumulative counts, divide by three, and replace weekend and Monday (or Tuesday) values with one third of the original cumulative count. We can also remove dates at the beginning of the pandemic before any of these states had seen their first COVID case.

In [None]:
# Check for 0 values in VT_Conf_Cases from just prior to 2021-08-23, onwards:
df6[(df6['VT_Conf_Cases'] == 0) & (df6.index >= '2021-08-01')]

It looks like [Massachusetts](https://www.mass.gov/info-details/covid-19-response-reporting) and [Connecticut](https://www.mass.gov/info-details/covid-19-response-reporting) have similar Monday-Friday reporting schedules. It also seems that [Maine](https://www.maine.gov/dhhs/mecdc/infectious-disease/epi/airborne/coronavirus/data.shtml) reports cases Tuesday-Saturday. See hyperlinks to each state governments' web page with more detailed COVID19 reporting schedule information. 

From the data above, it would seem Vermont reports cases every day, but according to this [Associated Press article](https://apnews.com/article/health-coronavirus-pandemic-vermont-c781aa063d30e8f665500deaf8902ab9), Vermont only began reporting cases daily as of 2021-08-23, due to a large surge in cases. Prior to 2021-08-23, [Vermont](https://www.healthvermont.gov/covid-19) was also reporting cases Monday-Friday (like Connecticut and Massachusetts). Let's do a quick check and plot our data to make sure our data reflects this.

In [1]:
concat_results = pd.concat(all_results,axis=0)
concat_results.head()

NameError: name 'pd' is not defined

[Census Bureau state areas in sq mi](https://www.census.gov/geographies/reference-files/2010/geo/state-area.html)

In [None]:
for i in ME_index_list:
    (df9.loc[i]['ME_Conf_Cases'])=((df9.loc[i]['ME_Conf_Cases'])/3)

In [None]:
ma_counter =0
for row in df5[df5['MA_Conf_Cases']<0]:
    ma_counter+=1
print("MA has", ma_counter/len(df5.columns), "negative Conf_Cases value.")

In [None]:
vt_counter =0
for row in df5[df5['VT_Conf_Cases']<0]:
    vt_counter+=1
print("VT has", vt_counter/len(df5.columns), "negative Conf_Cases value.")

In [None]:
ct_counter =0
for row in df5[df5['CT_Conf_Cases']<0]:
    ct_counter+=1
print("CT has", ct_counter/len(df5.columns), "negative Conf_Cases value")

In [None]:
me_counter =0
for row in df5[df5['ME_Conf_Cases']<0]:
    me_counter+=1
print("ME has", me_counter/len(df5.columns), "negative Conf_Cases value")

In [None]:

for row in df5[df5['CT_Conf_Cases']<0]:
    print(row)

In [None]:
# split data into train-test set
#train_me = df_me2[(df_me2['Year'] < 2021) | ((df_me2['Year']==2021) & (df_me2['Month']<=7))]
#test_me = df_me2[(df_me2['Year']==2021) & (df_me2['Month']>7)]

In [None]:
#train_me.tail()

In [None]:
# import the regression module
#from pycaret.regression import *
# initialize setup
#s = setup(data = train_me, test_data = test_me, target = 'MA_Conf_Cases', fold_strategy = 'timeseries', numeric_features = ['Month', 'Year', 'Series'], fold = 3, transform_target = True, session_id = 123)

In [None]:
#best = compare_models(sort = 'MAE')

In [None]:
# split data into train-test set
train_ma = df_ma[df_ma.index < '2021-06-01']
test_ma = df_ma[df_ma.index >= '2021-06-01']
# check shape
train_ma.shape, test_ma.shape

In [None]:
# initialize setup
s = setup(data = train_ma, test_data = test_ma, target = 'MA_Conf_Cases', fold_strategy = 'timeseries', numeric_features = ['Year', 'Series'], fold = 3, transform_target = True, session_id = 123)

In [None]:
# initialize setup
s = setup(data = train_ma, test_data = test_ma, target = 'MA_Conf_Cases', fold_strategy = 'timeseries', numeric_features = ['Year', 'Series'], fold = 3, transform_target = True, session_id = 123)

In [None]:
# import the regression module
from pycaret.regression import *
# initialize setup
s = setup(data = train_ma, test_data = test_ma, target = 'MA_Conf_Cases', fold_strategy = 'timeseries', numeric_features = df_ma.index, fold = 3, transform_target = True, session_id = 123)



In [None]:
ADF test for confirmation
(test from second row on, as .diff() creates a NaN in the first row):

In [None]:
adfuller(df_vt['VT_Conf_Cases_stationary']['2020-01-23':])

In [None]:
##################
# extract day, month, and year from dates
df_me2['Year'] =[i.year for i in df_me2['date']]
df_me2['Month'] = [i.month for i in df_me2['date']]
df_me2['Day'] = [i.day for i in df_me2['date']]
# drop unnecessary columns and re-arrange
#df_me2.drop(['date'], axis=1, inplace=True)
df_me2 = df_me2[['date', 'Series', 'Year', 'Month', 'Day', 'MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases']]

In [None]:
#df_vt2 = df_vt2[['date', 'Year', 'Month', 'Day', 'day_of_week', 'day_of_year', 'VT_Avg_Temp(F)', 'VT_PRCP(mm)', 'VT_Conf_Cases']]
#df_ct2 = df_ct2[['date', 'Year', 'Month', 'Day', 'day_of_week', 'day_of_year', 'CT_Avg_Temp(F)', 'CT_PRCP(mm)', 'CT_Conf_Cases']]
#df_me2 = df_me2[['date', 'Year', 'Month', 'Day', 'day_of_week', 'day_of_year', 'ME_Avg_Temp(F)', 'ME_PRCP(mm)', 'ME_Conf_Cases']]
#df_ma2 = df_ma2[['date', 'Year', 'Month', 'Day', 'day_of_week', 'day_of_year', 'MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases']]

In [None]:
df_vt3 = df_vt.reset_index()
df_ct3 = df_ct.reset_index()
df_me3 = df_me.reset_index()
df_ma3 = df_ma.reset_index()

In [None]:
from pycaret.regression import *
df_subset = df_me2
all_results=[]

# initialize setup from pycaret.regression
s = setup(df_subset, target = 'MA_Conf_Cases', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Series', 'MA_PRCP(mm)'],
            numeric_features = ['day_of_year', 'Year', 'MA_Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week'],
            silent = True, verbose = False, session_id = 123,
            normalize=True)
    
# compare all models and select best one based on MAE
best_model = compare_models(sort = 'MAE', verbose=False)
    
# capture the compare result grid and store best model in list
p = pull().iloc[0:1]
p['time_series'] = str("ME")
all_results.append(p)
    
# finalize model i.e. fit on entire data including test set
f = finalize_model(best_model)
best_model

In [None]:
concat_results = pd.concat(all_results,axis=0)
concat_results.head()

In [None]:
all_results=[]
df_subset = df_me2


# initialize setup from pycaret.regression
s = setup(df_subset, target = 'ME_Conf_Cases', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'ME_PRCP(mm)'],
            numeric_features = ['day_of_year', 'Year', 'ME_Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week'],
            silent = True, verbose = False, session_id = 123,
            normalize=True)
    
# compare all models and select best one based on MAE
best_model = compare_models(sort = 'MAE', verbose=False)
    
# capture the compare result grid and store best model in list
p = pull().iloc[0:1]
p['time_series'] = str("ME")
all_results.append(p)
    
# finalize model i.e. fit on entire data including test set
f = finalize_model(best_model)
best_model

In [None]:
concat_results = pd.concat([pd.concat(all_results, axis=0), concat_results], axis=0)
concat_results

In [None]:
print(all_results)

In [None]:
print(concat_results)

In [None]:
pd.concat([pd.DataFrame(all_results), concat_results], axis=0)

In [None]:
type(all_results)

In [None]:
results_df = pd.concat(all_results, axis=0)

In [None]:
results_df.head()

In [None]:
concat_results = pd.concat([results_df, results_df], axis=0)

In [None]:
concat_

In [None]:
# Filter out all rows where at least one state reports a COVID case:
filter = (df6.CT_Conf_Cases + df6.VT_Conf_Cases + df6.ME_Conf_Cases + df6.MA_Conf_Cases) > 0

In [None]:
# Filter the dataframe so that any rows where every state has 0 COVID cases becomes NaN and drop NaNs
x=(df6.where(filter).dropna())
# Find earliest date where there is at least one COVID case reported
min(x.index)

In [None]:
# Drop rows before 2020-01-29 
df6 = df6.loc['2020-01-29':]

Let's take a look at Maine first 
According to [Maine.gov](https://www.maine.gov/covid19/timeline), Maine's first COVID19 case was recorded on March 12th, 2020. We don't need to fill in any zero values before that date. 

In [None]:
df7[(df7['weekday']==6)&(df7['ME_Conf_Cases']==0)&(df7.index>'2020-11-01')]

In [None]:
df7.ME_Conf_Cases[(df7['weekday']==6)&(df7.index>'2020-04-01')]

In [None]:
& (df7.index > '2020-03-12')

In [None]:
df6[(df6['VT_Conf_Cases'] == 0) & (df.index >= '2021-08-01')]

In [None]:
# Replace cases negative values with np.nan
df5.MA_Conf_Cases['2020-09-03'] = np.nan
df5.CT_Conf_Cases[['2020-05-27', '2020-08-18']] = np.nan
df5.VT_Conf_Cases[['2020-05-11', '2020-06-17']] = np.nan
df5.ME_Conf_Cases[['2020-03-15','2020-07-22', '2020-09-09', '2021-08-09']]= np.nan

In [None]:
for row in df7['ME_Conf_Cases']:
    if df7['weekday'] == 5:
        df7['ME_Third'] = (df7['ME_Conf_Cases']//3)

In [None]:
# Loop through NaNs and fill with average of previous and following cell values 
state_col_list=['CT_Conf_Cases', 'VT_Conf_Cases', 'ME_Conf_Cases', 'MA_Conf_Cases']
for col in state_col_list:
    df5[col] = df5[col].fillna((df5[col].shift() + df5[col].shift(-1))/2)

In [None]:
df6= df5
df6["weekday"] = df5.index.dayofweek

In [None]:
df6.loc[(df6["weekday" == 0]) & (df6['ME_Conf_Cases']==0) & (df6['MA_Conf_Cases']==0) & (df6['VT_Conf_Cases']==0) & (df6['CT_Conf_Cases']==0)]

In [None]:
df6[(df6['weekday'] == 0) & (df6['MA_Conf_Cases']==0)& (df6['ME_Conf_Cases']==0) & (df6['VT_Conf_Cases']==0) & (df6['CT_Conf_Cases']==0)]

In [None]:
train_me

In [None]:
pd.DataFrame((df6[(df6['weekday'] == 0) & ((df6['MA_Conf_Cases']==0))]).index)

In [None]:
df6[(df6['weekday'] == 0) & ((df6['ME_Conf_Cases']==0))]

In [None]:
df6.loc['2021-09-06']

In [None]:
df6= df5
df6["weekday"] = df5.index.dayofweek

It looks like, for every five days of values for `Conf_Cases`, there are two days of zero values. A quick calendar check confirms that the dates corresponding to zero values are weekend dates (besides 09/06/21, which was Labor Day- a bank holiday in the US). Because there is only data for weekdays, I'll remove weekend data, as it can only skew our summary statistics and plots.

In [None]:
#Create column to designate day of week:
df6= df5
df6["weekday"] = df6.index.dayofweek

In [None]:
#Filter DataFrame to only include weekdays (days 0-4)
df6 = df6[(df6.weekday != 5) & (df6.weekday != 6)]

In [None]:
def merge(list1, list2):
    merged_list = [(p1, p2) for idx1, p1 in enumerate(list1) 
    for idx2, p2 in enumerate(list2) if idx1 == idx2]
    return merged_list

In [None]:
idk = merge(ME_case_list, ME_case_list_new)

In [None]:
idk2 =dict(zip(ME_index_list, idk))
idk2

In [None]:
for key, value in idk2.items():
    (df9.loc[key]['ME_Conf_Cases']).replace(value[0], value[1])

In [None]:
rep_vals_dict=dict(zip(ME_case_list, ME_case_list_new))
rep_vals_dict

In [None]:
for index in ME_index_list:
    df9.loc[index].ME_Conf_Cases.replace(rep_vals_dict)

In [None]:
type(ME_index_list[1])

In [None]:
for index in indexes:
    to_modify[indexes[index]] = replacements[index]

In [None]:
for index in ME_index_list:
    print(df9['ME_Conf_Cases'].index)

In [None]:
for i in ME_index_list:
    df9.loc[i].ME_Conf_Cases = (df9.loc[i].ME_Conf_Cases/3)

In [None]:
for i in ME_index_list:
    (df9.loc[i]['ME_Conf_Cases'])=((df9.loc[i]['ME_Conf_Cases'])/3)

In [None]:
for i in ME_index_list:
    (df9.loc[i]['ME_Conf_Cases'])=((df9.loc[i]['ME_Conf_Cases'])/3)

In [None]:
df9.loc['2021-07-13']

In [None]:
df9.loc['2021-07-13']

In [None]:
for index, row in df9.iterrows():
    if index >= pd.Timestamp('2021-07-01'):
        if row['weekday'] == 1:
            if row['ME_Conf_Cases'] != 0:
                df9.index['ME_Conf_Cases'] /= 3

In [None]:
(pd.DataFrame(df9[['CT_Conf_Cases', 'MA_Conf_Cases', 'ME_Conf_Cases', 
                            'VT_Conf_Cases', 'weekday']].tail(25)).style.applymap(highlight_zero))

In [None]:
ct_counter =0
for row in df5[df5['CT_Conf_Cases']<0]:
    ct_counter+=1
print("CT has", ct_counter/len(df5.columns), "negative Conf_Cases value")

In [None]:
for row in df9:
    if df9.weekday == 1:
        print('yes')

In [None]:
for row in df9:
    if row.index == '2021-07-01':
        print('yes')

In [None]:
import pandas as pd

df = pd.DataFrame({'c1': [10, 11, 12], 'c2': [100, 110, 120]})

for index, row in df.iterrows():
    print(row['c1'], row['c2'])

I'll replace all zero values with `NaN`s so we can replace them using the `.fillna()` method. Once we've filled all appropriate `NaN`s, any remaining `NaN`s can be reset to zero as actual zero case counts.

In [None]:
# Iterate over indices, rows and if row corresponds to a Tuesday after 2021-07-01 with a non-zero value for 
# ME_Conf_Cases, append index, case value, to corresponding lists
ME_tuesday_index_list = []
ME_tuesday_case_list =[]
for index, row in df9.iterrows():
    if (index >= pd.Timestamp('2021-07-01'))&(row['weekday'] == 1)&(row['ME_Conf_Cases'] != 0):
        ME_tuesday_index_list.append(index)
        ME_tuesday_case_list.append(row['ME_Conf_Cases'])

In [None]:
# Divide each Tuesday non-zero ME case count by three and round to nearest integer; append to new list
ME_case_list_new=[]
for i in ME_tuesday_case_list:
    ME_case_list_new.append(round(i/3))

In [None]:
# Create dictionary of indices and new case count value 
ME_tuesday_case_dict= dict(zip(ME_tuesday_index_list, ME_case_list_new))
ME_tuesday_case_dict

In [None]:
# Iterate through dictionary keys, values, and replace each ME_Conf_Case at index/key, with corresponding value
for key, value in ME_tuesday_case_dict.items():
    df9['ME_Conf_Cases'].loc[key] = value

In [None]:
df10=df9

In [None]:
ME_monday_index_list = []
ME_monday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday'] == 0):
            if row['ME_Conf_Cases']==0:
                ME_monday_index_list.append(index)
                ME_monday_case_list.append(row['ME_Conf_Cases'])

In [None]:
ME_monday_case_dict= dict(zip(ME_monday_index_list, ME_case_list_new))
ME_monday_case_dict

In [None]:
# Iterate through dictionary keys, values, and replace each ME_Conf_Case at index/key, with corresponding value
for key, value in ME_monday_case_dict.items():
    df10['ME_Conf_Cases'].loc[key] = value

In [None]:
ME_sunday_index_list = []
ME_sunday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday'] == 6):
            if row['ME_Conf_Cases']==0:
                ME_sunday_index_list.append(index)
                ME_sunday_case_list.append(row['ME_Conf_Cases'])

In [None]:
ME_sunday_index_list = []
ME_sunday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday'] == 6):
            if row['ME_Conf_Cases']==0:
                ME_sunday_index_list.append(index)
                ME_sunday_case_list.append(row['ME_Conf_Cases'])

In [None]:
ME_sunday_case_dict= dict(zip(ME_sunday_index_list, ME_case_list_new))
len(ME_sunday_case_dict)

In [None]:
# Iterate through dictionary keys, values, and replace each ME_Conf_Case at index/key, with corresponding value
for key, value in ME_sunday_case_dict.items():
    df10['ME_Conf_Cases'].loc[key] = value

#### Analyzing time series data checklist:
$\times$ 1) Convert index to datetime object \
$\times$ 2) Plot the data \
$\times$ 3) Run Augmented Dickey Fuller Test to see whether the data is a random walk \
$\times$ 4) Take first differences of the data to transform it into a stationary series \
5) Compute ACF (Autocorrelation Function) and PACF (Partial Autocorrelation Funcion) \
6) Using that as a guide, fit a few AR, MA, and ARMA models to the data \
7) Use information criterion to choose the best model \
8) Forecast 

can you please try XGBOOST

for this dataset

separately

and not use pycaret


Raghunandan, 11:10 PM
one last observation, please combine data from all 4 states and use pycaret and let me know the results

Good night!!

## Connecticut

In [None]:
# Iterate over indices, rows and if row corresponds to a Monday after 2020-07-01 with a non-zero value for 
# CT_Conf_Cases, append index, case value, to corresponding lists
CT_monday_index_list = []
CT_monday_case_list =[]
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2020-07-01'))&(row['weekday'] == 0)&(row['CT_Conf_Cases'] != 0):
        CT_monday_index_list.append(index)
        CT_monday_case_list.append(row['CT_Conf_Cases'])

In [None]:
# Divide each Tuesday non-zero ME case count by three and round to nearest integer; append to new list
CT_case_list_new=[]
for i in CT_monday_case_list:
    CT_case_list_new.append(round(i/3))
len(CT_case_list_new)

In [None]:
# Divide each Tuesday non-zero ME case count by three and round to nearest integer; append to new list
CT_case_list_new=[]
for i in CT_monday_case_list:
    CT_case_list_new.append(round(i/3))
len(CT_case_list_new)

In [None]:
# Create dictionary of indices and new case count value 
CT_monday_case_dict= dict(zip(CT_monday_index_list, CT_case_list_new))
len(CT_monday_case_dict)

In [None]:
# Iterate through dictionary keys, values, and replace each CT_Conf_Case at index/key, with corresponding value
for key, value in CT_monday_case_dict.items():
    df10['CT_Conf_Cases'].loc[key] = value

In [None]:
CT_sunday_index_list = []
CT_sunday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2020-07-01')):
        if (row['weekday'] == 6):
            if row['CT_Conf_Cases']== 0:
                CT_sunday_index_list.append(index)
                CT_sunday_case_list.append(row['CT_Conf_Cases'])

In [None]:
CT_sunday_case_dict= dict(zip(CT_sunday_index_list, CT_case_list_new))

In [None]:
# Iterate through dictionary keys, values, and replace each CT_Conf_Case at index/key, with corresponding value
for key, value in CT_sunday_case_dict.items():
    df10['CT_Conf_Cases'].loc[key] = value

In [None]:
CT_saturday_index_list = []
CT_saturday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2020-07-01')):
        if (row['weekday'] == 5):
            if row['CT_Conf_Cases']==0:
                CT_saturday_index_list.append(index)
                CT_saturday_case_list.append(row['CT_Conf_Cases'])

In [None]:
CT_saturday_case_dict= dict(zip(CT_saturday_index_list, CT_case_list_new))

In [None]:
# Iterate through dictionary keys, values, and replace each ME_Conf_Case at index/key, with corresponding value
for key, value in CT_saturday_case_dict.items():
    df10['CT_Conf_Cases'].loc[key] = value

In [None]:
# Iterate over indices, rows and if row corresponds to a Monday after 2021-07-01 with a non-zero value for 
# MA_Conf_Cases, append index, case value, to corresponding lists
MA_monday_index_list = []
MA_monday_case_list =[]
for index, row in df9.iterrows():
    if (index >= pd.Timestamp('2021-07-01'))&(row['weekday'] == 0)&(row['MA_Conf_Cases'] != 0):
        MA_monday_index_list.append(index)
        MA_monday_case_list.append(row['MA_Conf_Cases'])

In [None]:
# Divide each Tuesday non-zero ME case count by three and round to nearest integer; append to new list
MA_case_list_new=[]
for i in MA_monday_case_list:
    MA_case_list_new.append(round(i/3))

In [None]:
# Create dictionary of indices and new case count value 
MA_monday_case_dict= dict(zip(MA_monday_index_list, MA_case_list_new))

In [None]:
# Iterate through dictionary keys, values, and replace each CT_Conf_Case at index/key, with corresponding value
for key, value in MA_monday_case_dict.items():
    df10['MA_Conf_Cases'].loc[key] = value

In [None]:
MA_sunday_index_list = []
MA_sunday_case_list =[]
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday'] == 6):
            if row['MA_Conf_Cases']== 0:
                MA_sunday_index_list.append(index)
                MA_sunday_case_list.append(row['MA_Conf_Cases'])

In [None]:
MA_sunday_case_dict= dict(zip(MA_sunday_index_list, MA_case_list_new))

In [None]:
# Iterate through dictionary keys, values, and replace each CT_Conf_Case at index/key, with corresponding value
for key, value in MA_sunday_case_dict.items():
    df10['MA_Conf_Cases'].loc[key] = value

In [None]:
MA_saturday_index_list = []
MA_saturday_case_list =[]
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday'] == 5):
            if row['MA_Conf_Cases']==0:
                MA_saturday_index_list.append(index)
                MA_saturday_case_list.append(row['ME_Conf_Cases'])

In [None]:
MA_saturday_case_dict= dict(zip(MA_saturday_index_list, MA_case_list_new))

In [None]:
# Iterate through dictionary keys, values, and replace each ME_Conf_Case at index/key, with corresponding value
for key, value in MA_saturday_case_dict.items():
    df10['MA_Conf_Cases'].loc[key] = value

In [None]:
# Loop through NaNs and fill with average of previous and following cell values 
state_col_list=['CT_Conf_Cases', 'VT_Conf_Cases', 'ME_Conf_Cases', 'MA_Conf_Cases']
for col in state_col_list:
    df5[col] = df5[col].fillna((df5[col].shift() + df5[col].shift(-1))/2)

In [None]:
# Replace cases negative values with np.nan
df5.MA_Conf_Cases['2020-09-03'] = np.nan
df5.CT_Conf_Cases[['2020-05-27', '2020-08-18']] = np.nan
df5.VT_Conf_Cases[['2020-05-11', '2020-06-17']] = np.nan
df5.ME_Conf_Cases[['2020-03-15','2020-07-22', '2020-09-09', '2021-08-09']]= np.nan

In [None]:
df9.ME_Conf_Cases.shift(-1) = df9.ME_Conf_Cases / 3
            df9.ME_Conf_Cases.shift(-2) = df9.ME_Conf_Cases / 3

In [None]:
# Loop through NaNs and fill with average of previous and following cell values 
state_col_list=['CT_Conf_Cases', 'VT_Conf_Cases', 'ME_Conf_Cases', 'MA_Conf_Cases']
for col in state_col_list:
    df5[col] = df5[col].fillna((df5[col].shift() + df5[col].shift(-1))/2)

In [None]:
df9.ME_Conf_Cases.shift(-1).replace(0,1)
            df9.ME_Conf_Cases.shift(-2).replace(0,1)

In [None]:
for row in df9['2021-07-01':]:
    if df9[df9['weekday']] == 2: # Tuesday 
        if df9.ME_Conf_Cases != 0: # Don't impute missing values if this is a bank holiday 
            df9.ME_Conf_Cases /= 3
            #df9.ME_Conf_Casas.fillna(how='bfill')

In [None]:
state_col_list=['CT_Conf_Cases', 'VT_Conf_Cases', 'ME_Conf_Cases', 'MA_Conf_Cases']
for col in state_col_list:
    df11[col] = df5[col].fillna((df5[col].shift() + df5[col].shift(-1))/2)

In [None]:
for index, row in df11.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday']==0) | (row['weekday'] ==6):
            df11['ME_Conf_Cases'].replace(0, np.nan, inplace=True)  

In [None]:
#df9.ME_Conf_Cases.fillna(method='bfill', inplace=True)

In [None]:
# Replace zero values after 2021-07-01 with np.nan
#for index, row in df9.iterrows():
    #if (index>=pd.Timestamp('2021-07-01'))&((row['weekday']==6)|(row['weekday']== 6))&(row['ME_Conf_Cases']==0):
        #df9['ME_Conf_Cases'].replace(0, np.nan, inplace=True)   

In [None]:
#df9['ME_Conf_Cases'].replace(0, np.nan, inplace=True)

In [None]:
sns.set(rc={'figure.figsize':(18,8)})
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    plt.plot(df10.index, df10['ME_Conf_Cases'], color = 'blue')
    ax.set_ylabel('Maine COVID cases')
    plt.xticks(rotation=45)
    for i in sunday_ind_me:    
        ax.axvline(x=i, color='red', alpha=0.7)
    for i in monday_ind_me:    
        ax.axvline(x=i, color='orange', alpha=0.7)
    for i in other_ind_me:    
        ax.axvline(x=i, color='green', alpha=0.7)
    plt.title('Zero values by weekday of Maine COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

In [None]:
sunday_ind_me=(df10[(df10.weekday == 6)&(df10.ME_Conf_Cases==0)& (df10.index >'2020-01-29')]).index # Sunday zero values
monday_ind_me=(df10[(df10.weekday == 0)&(df10.ME_Conf_Cases==0)&(df10.index>'2020-01-29')]).index # Monday zero values
# Zero values that are neither Sunday nor Monday
other_ind_me=(df10[(df10.weekday != 0)& (df10.weekday != 6)&(df10.ME_Conf_Cases==0)& (df10.index >'2020-01-29')]).index 

In [None]:
ME_sunday_index_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-07-01')):
        if (row['weekday'] == 6):
            if row['ME_Conf_Cases']==0:
                ME_sunday_index_list.append(index)

In [None]:
for index in ME_monday_index_list:
    if df10['weekday'].loc[index] == 0:
        df10['ME_Conf_Cases'] = df10['ME_Conf_Cases'].fillna(df10['ME_Conf_Cases'].shift())

In [None]:
#df10.replace({'ME_Conf_Cases':{0:np.nan}})

In [None]:
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    plt.plot(df10.index, df10['VT_Conf_Cases'], color = 'blue')
    ax.set_ylabel('Vermont COVID cases')
    plt.xticks(rotation=45)
    for i in sunday_ind_vt:    
        ax.axvline(x=i, color='red', alpha=0.7)
    for i in saturday_ind_vt:    
        ax.axvline(x=i, color='orange', alpha=0.7)
    for i in other_ind_vt:    
        ax.axvline(x=i, color='green', alpha=0.7)
    plt.title('Zero values by weekday of Vermont COV cases', fontsize=16)
    sns.despine(left=True, bottom=True)
    plt.show()

In [None]:
sunday_ind_vt=(df10[(df10.weekday == 6)&(df10.VT_Conf_Cases==0)]).index # Sunday zero values
saturday_ind_vt=(df10[(df10.weekday == 5)&(df10.VT_Conf_Cases==0)]).index # Saturday zero values
# Zero values that are neither Sunday nor Monday
other_ind_vt=(df10[(df10.weekday != 5)& (df10.weekday != 6)&(df10.VT_Conf_Cases==0)]).index 

In [None]:
# Iterate through dictionary keys, values, and replace each ME_Conf_Case at index/key, with corresponding value
for key, value in VT_saturday_case_dict.items():
    df10['VT_Conf_Cases'].loc[key] = value

In [None]:
VT_saturday_case_dict= dict(zip(VT_saturday_index_list, VT_case_list_new))

In [None]:
VT_saturday_index_list = []
VT_saturday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-06-01')):
        if (row['weekday'] == 5):
            if row['VT_Conf_Cases']==0:
                VT_saturday_index_list.append(index)
                VT_saturday_case_list.append(row['VT_Conf_Cases'])

In [None]:
# Iterate through dictionary keys, values, and replace each CT_Conf_Case at index/key, with corresponding value
for key, value in VT_sunday_case_dict.items():
    df10['VT_Conf_Cases'].loc[key] = value

In [None]:
VT_sunday_case_dict= dict(zip(VT_sunday_index_list, VT_case_list_new))

In [None]:
VT_sunday_index_list = []
VT_sunday_case_list = []
for index, row in df10.iterrows():
    if (index >= pd.Timestamp('2021-06-01')):
        if (row['weekday'] == 6):
            if row['VT_Conf_Cases']== 0:
                VT_sunday_index_list.append(index)
                VT_sunday_case_list.append(row['VT_Conf_Cases'])

In [None]:
# Iterate through dictionary keys, values, and replace each CT_Conf_Case at index/key, with corresponding value
for key, value in VT_monday_case_dict.items():
    df10['VT_Conf_Cases'].loc[key] = value

In [None]:
# Create dictionary of indices and new case count value 
VT_monday_case_dict= dict(zip(VT_monday_index_list, VT_case_list_new))

In [None]:
# Divide each Tuesday non-zero ME case count by three and round to nearest integer; append to new list
VT_case_list_new=[]
for i in VT_monday_case_list:
    VT_case_list_new.append(round(i/3))

In [None]:
VT_monday_case_list =[]
VT_monday_case_list.append(row['ME_Conf_Cases'])

In [None]:
# Iterate over indices, rows and if row corresponds to a Monday after 2021-07-01 with a non-zero value for 
# VT_Conf_Cases, append index, case value, to corresponding lists
VT_monday_index_list = []
VT_monday_case_list =[]
for index, row in df9.iterrows():
    if (index >= pd.Timestamp('2021-06-01'))&(index <= pd.Timestamp('2021-08-24'))&(row['weekday'] == 0)&(row['VT_Conf_Cases'] != 0):
        VT_monday_index_list.append(index)
        VT_monday_case_list.append(row['ME_Conf_Cases'])

In [None]:
#list_states = [MA, ME, CT, VT]

In [None]:
#interpolated_features = pd.DataFrame()
#cols = [ , , , , ]
#shift_dict = {'2w_ks': 14, '4wks':28, '6wks':42}
#for col in cols:
    #for key in shift_dict.keys():
        #periods = shift_dict[key]
        #temp[col+'_'+key] = temp[col].shift(periods=periods).copy().fillna(method='bfill')
        #temp = temp.resample('W').mean()
        #interpolated_features = pd.concat([site_interpolated, temp])

In [None]:
MA2 = MA 

In [None]:
#df['SPX_Ret'] = df['SPX_Prices'].pct_change()

In [None]:
MA2['temp_change'] =MA2['MA_Avg_Temp(F)'].diff()

In [None]:
MA2=MA2.iloc[1:,:]

In [None]:
sns.heatmap(MA2.corr(), square=True, annot=True)
plt.yticks(rotation =45)

AUTOCORRELATION: Correlation of a series with a lagged copy of itself (usually we mean lag 1)
* For daily data, lag 1 would be the series lagged by one day

__Note:__ Not all bank holidays listed in the above resources appear in my `US_bank_holidays` list. I used the resources above as a general guide of dates that might not contain any data, and then checked each of the dates individually to make sure I wasn't deleting any significant data from the set. I found that data __was__ recorded for some of the bank holidays listed in the resources; whenever this was the case, I did __not__ drop that row of data. The dates listed in `US_bank_holidays` reflect only those bank holidays listed in these resources that I was __also__ able to confirm did not have any significant data.

In [None]:
#Create a list of US bank holidays in 2020-2021:
#Please refer to note below, explaining how this list of dates was put together.
#US_bank_holidays = ['2020-02-17', '2020-12-25', '2020-11-26', '2020-12-25', '2021-01-01', '2021-05-31', '2021-09-06']

In [None]:
#Check rows have been dropped:
#print(len(df6.index))
#print(len(df7.index))

***
***
***

# CORRECT CODE

In [None]:
df_me2 = df_ma

In [None]:
df_me2.reset_index(inplace=True)

In [None]:
df_me2.head()

In [None]:
df_me2.dtypes

In [None]:
# create a sequence of numbers
df_me2['Series'] = np.arange(1,len(df_me2)+1)
# extract day, month, and year from dates
df_me2['Year'] =[i.year for i in df_me2['date']]
df_me2['Month'] = [i.month for i in df_me2['date']]
df_me2['Day'] = [i.day for i in df_me2['date']]
# drop unnecessary columns and re-arrange
#df_me2.drop(['date'], axis=1, inplace=True)
df_me2 = df_me2[['date', 'Series', 'Year', 'Month', 'Day', 'MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases']]

In [None]:
df_me2.head()

In [None]:
df_me2['day_of_week'] = [i.dayofweek for i in df_me2['date']]
df_me2['day_of_year'] = [i.dayofyear for i in df_me2['date']]

In [None]:
df_me2.head()

In [None]:
from pycaret.regression import *
df_subset = df_me2
all_results=[]

# initialize setup from pycaret.regression
s = setup(df_subset, target = 'MA_Conf_Cases', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Series', 'MA_PRCP(mm)'],
            numeric_features = ['day_of_year', 'Year', 'MA_Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week'],
            silent = True, verbose = False, session_id = 123,
            normalize=True)
    
# compare all models and select best one based on MAE
best_model = compare_models(sort = 'MAE', verbose=False)
    
# capture the compare result grid and store best model in list
p = pull().iloc[0:1]
p['time_series'] = str("ME")
all_results.append(p)
    
# finalize model i.e. fit on entire data including test set
f = finalize_model(best_model)
best_model

In [None]:
concat_results = pd.concat(all_results,axis=0)
concat_results.head()

In [None]:
from pycaret.regression import *
all_results=[]
for df in df_list:
    county_df = df
    s = setup(county_df, target = 'MA_Conf_Cases', train_size = 0.8,
            data_split_shuffle = True, fold = 3,
            ignore_features = ['date', 'Series', 'MA_PRCP(mm)'],
            numeric_features = ['day_of_year', 'Year', 'MA_Avg_Temp(F)'],
            categorical_features = ['Month', 'day_of_week'],
            silent = True, verbose = False, session_id = 123,
            normalize=True)