# Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np 
import json
import requests
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import ppscore
from scipy import stats

In [None]:
# Read the frame produced in the data-wrangling phase; the `date` column is
# parsed as datetimes and becomes the index.
df = pd.read_csv(
    'cleaned_cov_weather4.csv',
    parse_dates=['date'],
    index_col='date',
)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Display every row whose date index repeats an earlier row's date
is_repeat = df.index.duplicated()
df[is_repeat]

In [None]:
# Keep only the first row for each date, then repeat the duplicate check;
# it should now display an empty frame.
first_occurrence = ~df.index.duplicated(keep='first')
df = df[first_occurrence]
df[df.index.duplicated()]

In [None]:
# Double-check the dtype of every column before any conversion
df.dtypes

In [None]:
# Cast the listed columns to float ahead of the calculations below
int_columns = dict.fromkeys(
    ["CT_conf_cases", "ME_conf_cases", "MA_conf_cases", "VT_conf_cases",
     "vt_avg_temp", "vt_prcp"],
    'float',
)
df = df.astype(int_columns)

In [None]:
# Display the dtypes again to confirm the cast to float
df.dtypes

In [None]:
# Display every row that contains at least one missing value
rows_with_nan = df.isnull().any(axis=1)
df[rows_with_nan]

No missing values.

However, the units of TAVG and PRCP are clearly off. 

According to [NOAA documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt), temperatures are reported in __*tenths*__ of a degree Celsius. I'll convert to degrees Fahrenheit instead (since this is US data here).

In [None]:
# Temperatures are in tenths of a degree Celsius, so
# F = (tenths_C / 10) * 9 / 5 + 32 = tenths_C * 9 / 50 + 32.
# Work on an independent copy: a plain `df2 = df` only aliases `df`, so the
# conversion would overwrite the loaded data and be applied a second time if
# this cell were re-run.
df2 = df.copy()
for temp_col in ['vt_avg_temp', 'me_avg_temp', 'ct_avg_temp', 'ma_avg_temp']:
    df2[temp_col] = (df[temp_col] * 9) / 50 + 32

The [same documentation](https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/readme.txt) cited above also explains that PRCP data is reported in __*tenths*__ of a mm. [Wikipedia](https://en.wikipedia.org/wiki/Precipitation) reports that generally PRCP data is reported in millimeters, so I'll convert to millimeters instead.

In [None]:
# Precipitation is in tenths of a millimetre; divide by 10 to get millimetres.
# Copy first: `df3 = df2` would alias `df2`, so re-running this cell would
# divide the already-converted values by 10 again.
df3 = df2.copy()
for prcp_col in ['vt_prcp', 'me_prcp', 'ct_prcp', 'ma_prcp']:
    df3[prcp_col] = df2[prcp_col] / 10

In [None]:
# Preview the frame after the unit conversions
df3.head()

For the sake of clarity, I'll update column names to reflect this unit change.

In [None]:
# Rename the converted columns so that each name states its unit
units_dict = {
    'vt_avg_temp': 'VT_Avg_Temp(F)',
    'vt_prcp': 'VT_PRCP(mm)',
    'me_avg_temp': 'ME_Avg_Temp(F)',
    'me_prcp': 'ME_PRCP(mm)',
    'ct_avg_temp': 'CT_Avg_Temp(F)',
    'ct_prcp': 'CT_PRCP(mm)',
    'ma_avg_temp': 'MA_Avg_Temp(F)',
    'ma_prcp': 'MA_PRCP(mm)',
}
df3.rename(columns=units_dict, inplace=True)
df3.head()

In [None]:
# Report the first and last dates in the index and the span between them
first_date = df3.index.min()
last_date = df3.index.max()
print('Beginning date: ', first_date)
print('End date: ', last_date)
print('Total timespan: ', (df3.index - first_date).max())

In [None]:
# Make an actual copy of the frame: plain assignment (`df4 = df3`) would only
# bind a second name to the same object, so the columns added to df4 next
# would also appear on df3.
df4 = df3.copy()

The confirmed-cases columns hold the *cumulative* number of confirmed COVID cases as of each date. To explore new infection *rates* over time, rather than cumulative COVID cases, we'll have to take the first differences of the cumulative counts. Differencing leaves a `NaN` in the first row (which has no previous value to subtract), so we'll chain the `.fillna()` method to the end of our first differences calls.

In [None]:
# Turn each state's cumulative count into a daily count via first differences;
# the first row has no predecessor, so its NaN is filled with 0.
for state in ('CT', 'VT', 'ME', 'MA'):
    df4[f"{state}_Conf_Cases"] = df4[f'{state}_conf_cases'].diff().fillna(0)

In [None]:
# The cumulative columns are no longer needed once the daily counts exist
cumulative_cols = ['CT_conf_cases', 'VT_conf_cases', 'ME_conf_cases', 'MA_conf_cases']
df5 = df4.drop(columns=cumulative_cols)

In [None]:
# Take a look at some summary statistics of the data:
# First, we'll define a function to highlight any negative values (as we can't have negative new COVID case values)
def highlight_neg(cell):
    """Return a yellow-background CSS rule for a negative cell value, else None."""
    return 'background: yellow' if cell < 0 else None
summary_stats = df5.describe()
summary_stats.style.applymap(highlight_neg)

The negative value in `ME_Avg_Temp(F)` column is fine, however it looks like some of the states have `Conf_Cases` values that are less than zero. That can't be right; this needs to be looked into. Let's first see how many negative values are in each state's confirmed case counts and how large or small the values are. These will be considerations when we decide how to best address them.

In [None]:
# Rows where Connecticut's daily count is below zero
CT_neg = df5.loc[df5['CT_Conf_Cases'] < 0]
print("CT has", len(CT_neg), "negative case value(s):")
print(CT_neg['CT_Conf_Cases'])

In [None]:
# Rows where Vermont's daily count is below zero
VT_neg = df5.loc[df5['VT_Conf_Cases'] < 0]
print("VT has", len(VT_neg), "negative case value(s):")
print(VT_neg['VT_Conf_Cases'])

In [None]:
# Rows where Maine's daily count is below zero
ME_neg = df5.loc[df5['ME_Conf_Cases'] < 0]
print("ME has", len(ME_neg), "negative case value(s):")
print(ME_neg['ME_Conf_Cases'])

In [None]:
# Rows where Massachusetts' daily count is below zero
MA_neg = df5.loc[df5['MA_Conf_Cases'] < 0]
print("MA has", len(MA_neg), "negative case value(s):")
print(MA_neg['MA_Conf_Cases'])

Connecticut has two negative case counts, Vermont also has two negative case counts, Maine has four negative case counts, and Massachusetts has one negative case count (for a total of 9 negative case counts). Thankfully we don't have *a lot* of these erroneous counts, but before blindly filling them all with zeroes, let's check to see what the case counts around these dates look like. If they're very low, we may be able to fill the negative values with zeroes without much impact. If these values occur during a surge, however, we may need to look a little deeper. 

In [None]:
# Show each negative CT value together with the day before and the day after
ct_windows = [('2020-05-26', '2020-05-28'), ('2020-08-17', '2020-08-19')]
ct_neg_df = pd.concat(
    [df5.loc[start:end, 'CT_Conf_Cases'] for start, end in ct_windows]
).to_frame()
ct_neg_df.style.applymap(highlight_neg)

In [None]:
# Show each negative VT value together with the day before and the day after
vt_windows = [('2020-05-10', '2020-05-12'), ('2020-06-16', '2020-06-18')]
vt_neg_df = pd.concat(
    [df5.loc[start:end, 'VT_Conf_Cases'] for start, end in vt_windows]
).to_frame()
vt_neg_df.style.applymap(highlight_neg)

In [None]:
# Show each negative ME value together with the day before and the day after
me_windows = [
    ('2020-03-14', '2020-03-16'),
    ('2020-07-21', '2020-07-23'),
    ('2020-09-08', '2020-09-10'),
    ('2021-08-08', '2021-08-10'),
]
me_neg_df = pd.concat(
    [df5.loc[start:end, 'ME_Conf_Cases'] for start, end in me_windows]
).to_frame()
me_neg_df.style.applymap(highlight_neg)

In [None]:
# Show the negative MA value together with the day before and the day after
ma_window = df5.loc['2020-09-02':'2020-09-04', 'MA_Conf_Cases']
ma_neg_df = pd.concat([ma_window]).to_frame()
ma_neg_df.style.applymap(highlight_neg)

If Vermont was the only state with negative case counts, it may be alright to fill negative case counts with zeroes, but all other states would likely be adversely affected by this. For example, Massachusetts has a case count value of -280, between dates with case counts of positive 404 and 212. Maine, too, has a value of -1 right before a value of 361 cases (which is a large case count for this less-populous state). Therefore, let's replace these negative case count values with the average of the case counts of the date before and after them. To do this, we'll first convert the negative values to `NaN`s, and then we'll use the `.fillna()` method. 

In [None]:
# Replace the negative daily counts with np.nan so they can be re-filled from
# their neighbours. A single .loc[rows, column] assignment is used because the
# chained form (df5.COL[dates] = ...) writes to an intermediate Series and is
# not guaranteed to reach df5 (SettingWithCopyWarning; no effect under
# copy-on-write).
negative_case_dates = {
    'MA_Conf_Cases': ['2020-09-03'],
    'CT_Conf_Cases': ['2020-05-27', '2020-08-18'],
    'VT_Conf_Cases': ['2020-05-11', '2020-06-17'],
    'ME_Conf_Cases': ['2020-03-15', '2020-07-22', '2020-09-09', '2021-08-09'],
}
for case_col, dates in negative_case_dates.items():
    df5.loc[pd.to_datetime(dates), case_col] = np.nan

In [None]:
# Fill each NaN with the mean of the values on the previous and following rows
state_col_list = ['CT_Conf_Cases', 'VT_Conf_Cases', 'ME_Conf_Cases', 'MA_Conf_Cases']
for case_col in state_col_list:
    cases = df5[case_col]
    neighbour_mean = (cases.shift(1) + cases.shift(-1)) / 2
    df5[case_col] = cases.fillna(neighbour_mean)

Now all of the negative case values should have been replaced with the average of the value preceding and following it. Let's double check the summary statistics of the full DataFrame just to make sure everything looks good.

In [None]:
 df5.describe().style.applymap(highlight_neg)  # summary statistics again, with negative cells highlighted

Now there are no more negative COVID case counts in the DataFrame; the only negative values belong to an average temperature column (which makes sense- Maine gets pretty cold in the winter!).

A quick look at the tail of the DataFrame reveals another interesting detail, however.

In [None]:
# Styler helper: flags zero values, which may be indicative of missing values
def highlight_zero(cell):
    """Return a yellow-background CSS rule when the cell equals zero, else None."""
    return 'background: yellow' if cell == 0 else None

In [None]:
# Last 25 Connecticut daily counts, with zero cells highlighted
df5['CT_Conf_Cases'].tail(25).to_frame().style.applymap(highlight_zero)

Zeroes are sometimes, but not always, indicative of missing values. We would probably expect a certain proportion of zero values for confirmed cases in a dataset like ours, but the above highlighted values follow a pretty clear pattern (which is suspicious). For every five days of values for `Conf_Cases`, there are two days of zero values. It seems that values have not been recorded for weekend dates in Connecticut, and a quick calendar check confirms the above highlighted zero values all correspond to weekend dates. But what about Monday 2021-09-06? The calendar shows that this was [Labor Day](https://www.officeholidays.com/countries/usa/2021), a bank holiday in the US.


Let's check if the other states follow Connecticut's case reporting schedule, or some other schedule entirely. Before we do that, though, let's add a column, `weekday`, indicating the day of the week for each row of data.

In [None]:
# NOTE: plain assignment makes `df6` another name for the same object as
# `df5` (no copy), so the new column is visible through both names; the
# comparison cell that follows reads `weekday` through `df5`.
df6 = df5
# Day of week as an integer (0 = Monday ... 6 = Sunday)
df6["weekday"] = df6.index.dayofweek

In [None]:
# Compare the zero-value patterns of all four states over the last 25 rows
comparison_cols = ['CT_Conf_Cases', 'MA_Conf_Cases', 'ME_Conf_Cases',
                   'VT_Conf_Cases', 'weekday']
df5[comparison_cols].tail(25).style.applymap(highlight_zero)

From the above data, it looks like Massachusetts and Connecticut may have similar Monday-Friday reporting schedules. It also seems that Maine may report cases Tuesday-Saturday and Vermont may report cases seven days a week (at least for these last 25 rows). Three out of four of the states also reported zero values for 2021-09-06 (a Monday) which we've already determined was a US bank holiday. Notably, Vermont did report cases on this date however.

It's clear that each state's reporting schedule varies, and also that there may be some unusual exceptions to these schedules (like bank holidays) for some states. Clearly we'll need to investigate each state's individual reporting schedule more closely. We may get a better idea of the overall distribution and pattern of these zero values by representing them visually. 

Below I plot each state's daily new confirmed case counts. In the plots for Massachusetts, Connecticut, and Vermont, there is an orange vertical line for every zero value that falls on a Saturday, a red vertical line for every zero value that falls on a Sunday, and a green vertical line for every zero value that falls on any day of the week other than Saturday or Sunday.

Because Maine seems to follow a different schedule, I've placed a red vertical line for every zero count that falls on *Sunday*, an orange vertical line for every zero value that falls on a *Monday*, and a green vertical line for every zero value that falls on any day of the week other than Sunday or Monday.

By plotting the orange and red lines, we'll be able to check our hypothesis regarding each state's reporting schedule. By plotting the green lines, we'll be able to double check for any unusual cases of zero values that do not fit within our hypothesized reporting schedules.

In [None]:
df7 = df6
# Rows after 2020-01-29 where Maine shows zero new cases, split by weekday
# (0 = Monday ... 6 = Sunday)
me_zero_days = df7[(df7['ME_Conf_Cases'] == 0) & (df7.index > '2020-01-29')]
sunday_ind_me = me_zero_days[me_zero_days['weekday'] == 6].index  # Sunday zero values
monday_ind_me = me_zero_days[me_zero_days['weekday'] == 0].index  # Monday zero values
# Zero values that are neither Sunday nor Monday
other_ind_me = me_zero_days[~me_zero_days['weekday'].isin([0, 6])].index

In [None]:
sns.set(rc={'figure.figsize':(18,8)})
# Maine daily cases with a vertical line on every zero-count date:
# red = Sunday, orange = Monday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df7.index, df7['ME_Conf_Cases'], color='blue')
    ax.set_ylabel('Maine COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_me, 'red'),
                                   (monday_ind_me, 'orange'),
                                   (other_ind_me, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Maine COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

It seems that (with a few individual exceptions), Maine actually reported new case counts every day of the week until around July of 2021. This means that **we'll only need to address Sunday-Monday zero case counts for Maine from July of 2021 onwards.** We can also look into the few exceptions (green lines) that occur prior to July, 2021, and check for bank holidays. We can also see that Maine did not report its first COVID case until March of 2020.

In [None]:
# Rows after 2020-01-29 where Massachusetts shows zero new cases, split by
# weekday (5 = Saturday, 6 = Sunday)
ma_zero_days = df7[(df7['MA_Conf_Cases'] == 0) & (df7.index > '2020-01-29')]
sunday_ind_ma = ma_zero_days[ma_zero_days['weekday'] == 6].index  # Sunday zero values
saturday_ind_ma = ma_zero_days[ma_zero_days['weekday'] == 5].index  # Saturday zero values
# Zero values that are neither Saturday nor Sunday
other_ind_ma = ma_zero_days[~ma_zero_days['weekday'].isin([5, 6])].index

In [None]:
# Massachusetts daily cases with a vertical line on every zero-count date:
# red = Sunday, orange = Saturday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df7.index, df7['MA_Conf_Cases'], color='blue')
    ax.set_ylabel('Massachusetts COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_ma, 'red'),
                                   (saturday_ind_ma, 'orange'),
                                   (other_ind_ma, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Massachusetts COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

It looks like Massachusetts was also reporting case counts daily (seven days a week) until around July of 2021 (when it switched to a Monday-Friday reporting schedule). We can also examine the few exceptions (green lines) for Massachusetts, but it looks like **we'll only need to examine Saturday-Sunday zero case values for Massachusetts from July, 2021 onwards** also. We can see that Massachusetts hasn't reported any significant case values until around March of 2020 either. We may want to further slice our data to begin in March, 2020 to avoid skewing our data with these zero values, but let's first check the Connecticut and Vermont case counts to make sure we aren't dropping any useful data.

In [None]:
# Rows after 2020-01-29 where Connecticut shows zero new cases, split by
# weekday (5 = Saturday, 6 = Sunday)
ct_zero_days = df7[(df7['CT_Conf_Cases'] == 0) & (df7.index > '2020-01-29')]
sunday_ind_ct = ct_zero_days[ct_zero_days['weekday'] == 6].index  # Sunday zero values
saturday_ind_ct = ct_zero_days[ct_zero_days['weekday'] == 5].index  # Saturday zero values
# Zero values that are neither Saturday nor Sunday
other_ind_ct = ct_zero_days[~ct_zero_days['weekday'].isin([5, 6])].index

In [None]:
# Connecticut daily cases with a vertical line on every zero-count date:
# red = Sunday, orange = Saturday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df7.index, df7['CT_Conf_Cases'], color='blue')
    ax.set_ylabel('Connecticut COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_ct, 'red'),
                                   (saturday_ind_ct, 'orange'),
                                   (other_ind_ct, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Connecticut COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

Connecticut clearly has a very different reporting schedule than Massachusetts and Maine. It looks like Connecticut only reported case counts seven days a week until around July of *2020*. **We'll need to address Saturday-Sunday zero value counts for Connecticut from July, *2020* onwards.** Connecticut also seems to have reported its first COVID case sometime in March of 2020.

In [None]:
# Rows after 2020-01-29 where Vermont shows zero new cases, split by weekday
# (5 = Saturday, 6 = Sunday)
vt_zero_days = df7[(df7['VT_Conf_Cases'] == 0) & (df7.index > '2020-01-29')]
sunday_ind_vt = vt_zero_days[vt_zero_days['weekday'] == 6].index  # Sunday zero values
saturday_ind_vt = vt_zero_days[vt_zero_days['weekday'] == 5].index  # Saturday zero values
# Zero values that are neither Saturday nor Sunday
other_ind_vt = vt_zero_days[~vt_zero_days['weekday'].isin([5, 6])].index

In [None]:
# Vermont daily cases with a vertical line on every zero-count date:
# red = Sunday, orange = Saturday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df7.index, df7['VT_Conf_Cases'], color='blue')
    ax.set_ylabel('Vermont COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_vt, 'red'),
                                   (saturday_ind_vt, 'orange'),
                                   (other_ind_vt, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Vermont COV cases', fontsize=16)
    sns.despine(left=True, bottom=True)
    plt.show()

Vermont seems only to have followed a Monday-Friday reporting schedule for about 2-3 months, from around June of 2021 until August 2021, before reverting back to daily reporting. **We'll need to address Saturday-Sunday zero case values for Vermont from June 2021- August 2021.** Vermont also didn't report its first COVID case until sometime in March of 2020.

It seems pretty clear that we can cut all data prior to March 2020, as we only have one non-zero case count (from Massachusetts) prior to March, and so the repetitive zero values from this period can only serve to skew our data. 

In [None]:
# Cut data from before March of 2020.
# .copy() makes df8 an independent frame, so later column assignments on this
# data do not target a slice of df7 (which triggers SettingWithCopyWarning).
df8 = df7.loc['2020-03-01':].copy()

Let's check out how one of the plots looks after the slicing:

In [None]:
# Same Vermont zero-count split as before, now computed on the sliced frame
# (5 = Saturday, 6 = Sunday)
vt_zero_days = df8[(df8['VT_Conf_Cases'] == 0) & (df8.index > '2020-01-29')]
sunday_ind_vt = vt_zero_days[vt_zero_days['weekday'] == 6].index  # Sunday zero values
saturday_ind_vt = vt_zero_days[vt_zero_days['weekday'] == 5].index  # Saturday zero values
# Zero values that are neither Saturday nor Sunday
other_ind_vt = vt_zero_days[~vt_zero_days['weekday'].isin([5, 6])].index

In [None]:
# Vermont daily cases on the sliced frame, with a vertical line on every
# zero-count date: red = Sunday, orange = Saturday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df8.index, df8['VT_Conf_Cases'], color='blue')
    ax.set_ylabel('Vermont COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_vt, 'red'),
                                   (saturday_ind_vt, 'orange'),
                                   (other_ind_vt, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Vermont COV cases', fontsize=16)
    sns.despine(left=True, bottom=True)
    plt.show()

This looks a lot better! 

Now we'll need to address each state's unique reporting schedule. 
By examining each state governments' webpages (see here: [Maine](https://www.maine.gov/dhhs/mecdc/infectious-disease/epi/airborne/coronavirus/data.shtml), [Massachusetts](https://www.mass.gov/info-details/covid-19-response-reporting), [Vermont](https://www.healthvermont.gov/covid-19), [Connecticut](https://portal.ct.gov/coronavirus/covid-19-data-tracker)), we can confirm the apparent trends in the plots above. From [this Associated Press article](https://apnews.com/article/health-coronavirus-pandemic-vermont-c781aa063d30e8f665500deaf8902ab9) we can also confirm that Vermont resumed daily case reporting on 2021-08-23 (due to a surge in cases).

More simply, each state reported cases daily, besides the following periods (and a few select bank holidays):

* **Maine** reported cases **Tuesday-Saturday** from **2021-07-01 onwards**.
* **Massachusetts** reported cases **Monday-Friday** from **2021-07-01 onwards**.
* **Connecticut** reported cases **Monday-Friday** from **2020-07-01 onwards**.
* **Vermont** reported cases **Monday-Friday** from **2021-06-01 to 2021-08-23**.

The above hyperlinked resources also describe that each state reports a cumulative total of new cases on the day following a two-day break in reporting. For Massachusetts, Connecticut, and Vermont, this would be Monday. For Maine this would be Tuesday. The resources also indicate that if the day following a two-day break in reporting (either Monday or Tuesday) is a bank holiday, then a cumulative case count for the *three* prior days will be reported on the day following the bank holiday (for Massachusetts, Connecticut, and Vermont, this would be Tuesday, and for Maine, this would be Wednesday).

Now that we've more clearly defined each state's reporting schedules, let's address the zero values. Because we know that each state reports a *cumulative total* the day following a break in reporting, we can divide this cumulative count by three and replace each of the three days' values with a third of the cumulative value for those days. Before doing so, we'll have to check to make sure none of these days fall on a bank holiday, however.

Let's quickly plot the distribution of cases per weekday before we make any changes, so we can compare with the distributions after we've updated the dataframe.

In [None]:
# Box plots of daily case counts grouped by weekday, one panel per state
weekdays = ['Mon', 'Tue', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
weekday_nums = list(range(7))
# (grid position, panel title, column plotted)
panels = [
    ((0, 0), 'Massachusetts', 'MA_Conf_Cases'),
    ((0, 1), 'Connecticut', 'CT_Conf_Cases'),
    ((1, 0), 'Vermont', 'VT_Conf_Cases'),
    ((1, 1), 'Maine', 'ME_Conf_Cases'),
]
with sns.axes_style("white"):
    fig, axes = plt.subplots(2, 2, sharex=True, figsize=(16, 12))
    fig.suptitle('Distribution of case counts per weekday', fontsize=20)
    for position, state_name, case_col in panels:
        panel_ax = axes[position]
        panel_ax.set_title(state_name, fontsize=16)
        sns.boxplot(ax=panel_ax, data=df8, x='weekday', y=case_col, palette='Blues')
        # plt.xticks acts on the current axes, not on panel_ax; the panels
        # were created with a shared x axis
        plt.xticks(weekday_nums, weekdays)
    plt.show()

## Maine

For Maine, we'll first check each confirmed case count occurring on or after 2021-07-01, on a Tuesday, that does not have a zero case count (so must not be a bank holiday on which cases were not reported), and create a column `ME_third` with a third of that value. We'll replace the original cumulative Tuesday value with one third of itself. Then we'll add 3 more columns representing lag-1, lag-2, and lag-3 of `ME_third`. We'll replace Monday zero values with lag-1 of `ME_third`, Sunday zero values with lag-2 of `ME_third`, and so on. Once this is complete we *should* only be left with missing values for non-reporting days that were part of a long weekend (bank holiday Tuesday, in the case of Maine). We'll then need to calculate using the returning Wednesday's value. 

In [None]:
# Work on an independent copy: `df9 = df8` would only alias df8 (itself a
# slice of df7), so the helper columns and in-place edits below would be
# written to that slice and trigger SettingWithCopyWarning.
df9 = df8.copy()

In [None]:
#df9.at['2021-09-12','ME_Conf_Cases'] = x
#df9.loc[:, 'ME_Conf_Cases'][index] = df9.loc[:, 'ME_third'][index]

In [None]:
# One third of each day's Maine count, rounded to a whole number
df9['ME_third'] = (df9['ME_Conf_Cases'] / 3).round()

In [None]:
# Negative shifts pull values from later rows: me_lag_k holds the ME_third
# value found k rows further down the frame.
me_lags = {f'me_lag_{k}': -k for k in (1, 2, 3)}
for lag_col, offset in me_lags.items():
    df9[lag_col] = df9['ME_third'].shift(periods=offset)

In [None]:
# From 2021-07-01 on, replace every non-zero Tuesday (weekday 1) count with
# its own third
me_tuesday_totals = (
    (df9.index >= pd.Timestamp('2021-07-01'))
    & (df9['weekday'] == 1)
    & (df9['ME_Conf_Cases'] != 0)
)
df9.loc[me_tuesday_totals, 'ME_Conf_Cases'] = df9.loc[me_tuesday_totals, 'ME_third']

In [None]:
# From 2021-07-01 on, fill every zero Monday (weekday 0) with me_lag_1
me_zero_mondays = (
    (df9.index >= pd.Timestamp('2021-07-01'))
    & (df9['weekday'] == 0)
    & (df9['ME_Conf_Cases'] == 0)
)
df9.loc[me_zero_mondays, 'ME_Conf_Cases'] = df9.loc[me_zero_mondays, 'me_lag_1']

In [None]:
# From 2021-07-01 on, fill every zero Sunday (weekday 6) with me_lag_2
me_zero_sundays = (
    (df9.index >= pd.Timestamp('2021-07-01'))
    & (df9['weekday'] == 6)
    & (df9['ME_Conf_Cases'] == 0)
)
df9.loc[me_zero_sundays, 'ME_Conf_Cases'] = df9.loc[me_zero_sundays, 'me_lag_2']

Now let's address any long weekends where Tuesday was a bank holiday by taking a third of that week's Wednesday case count and replacing Monday, Tuesday, and Wednesday's count with that value instead. 

In [None]:
# The helper columns (ME_third, me_lag_1..3) are still read by the three
# long-weekend cells below, so they must not be dropped here: doing so makes
# those cells fail with a KeyError. They are removed at the start of the
# Connecticut section instead.
assert {'ME_third', 'me_lag_1', 'me_lag_2', 'me_lag_3'}.issubset(df9.columns)

In [None]:
# Long weekends: only a Wednesday (weekday 2) that directly follows a
# zero-count Tuesday carries an accumulated total. Without the previous-day
# check, every non-zero Wednesday from 2021-07-01 on would be cut to a third.
# Assumes one row per calendar day, so shift(1) is the preceding Tuesday.
previous_day_cases = df9['ME_Conf_Cases'].shift(1)
me_post_holiday_wed = (
    (df9.index >= pd.Timestamp('2021-07-01'))
    & (df9['weekday'] == 2)
    & (df9['ME_Conf_Cases'] != 0)
    & (previous_day_cases == 0)
)
df9.loc[me_post_holiday_wed, 'ME_Conf_Cases'] = df9.loc[me_post_holiday_wed, 'ME_third']

In [None]:
# From 2021-07-01 on, fill every Tuesday (weekday 1) that is still zero with
# me_lag_1
me_zero_tuesdays = (
    (df9.index >= pd.Timestamp('2021-07-01'))
    & (df9['weekday'] == 1)
    & (df9['ME_Conf_Cases'] == 0)
)
df9.loc[me_zero_tuesdays, 'ME_Conf_Cases'] = df9.loc[me_zero_tuesdays, 'me_lag_1']

In [None]:
# From 2021-07-01 on, fill every Monday (weekday 0) that is still zero with
# me_lag_2
me_still_zero_mondays = (
    (df9.index >= pd.Timestamp('2021-07-01'))
    & (df9['weekday'] == 0)
    & (df9['ME_Conf_Cases'] == 0)
)
df9.loc[me_still_zero_mondays, 'ME_Conf_Cases'] = df9.loc[me_still_zero_mondays, 'me_lag_2']

Now let's check out a plot of Maine's updated case counts:

In [None]:
# Remaining zero-count Maine rows after the updates, split by weekday
# (0 = Monday, 6 = Sunday)
me_zero_days = df9[df9['ME_Conf_Cases'] == 0]
sunday_ind_me = me_zero_days[me_zero_days['weekday'] == 6].index  # Sunday zero values
monday_ind_me = me_zero_days[me_zero_days['weekday'] == 0].index  # Monday zero values
# Zero values that are neither Sunday nor Monday
other_ind_me = me_zero_days[~me_zero_days['weekday'].isin([0, 6])].index

In [None]:
sns.set(rc={'figure.figsize':(18,8)})
# Updated Maine daily cases with a vertical line on every remaining zero:
# red = Sunday, orange = Monday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df9.index, df9['ME_Conf_Cases'], color='blue')
    ax.set_ylabel('Maine COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_me, 'red'),
                                   (monday_ind_me, 'orange'),
                                   (other_ind_me, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Maine COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

## Connecticut

In [None]:
# Get rid of Maine-specific columns.
# errors='ignore' keeps this cell from raising KeyError when the helper
# columns are already gone (for example when the cell is re-run).
df9 = df9.drop(columns=['ME_third', 'me_lag_1', 'me_lag_2', 'me_lag_3'], errors='ignore')

In [None]:
# Connecticut helper column: one third of each day's count, rounded to a
# whole number
df9['CT_third'] = (df9['CT_Conf_Cases'] / 3).round()

In [None]:
# Negative shifts pull values from later rows: ct_lag_k holds the CT_third
# value found k rows further down the frame.
ct_lags = {f'ct_lag_{k}': -k for k in (1, 2, 3)}
for lag_col, offset in ct_lags.items():
    df9[lag_col] = df9['CT_third'].shift(periods=offset)

In [None]:
# From 2020-07-01 on, replace every non-zero Monday (weekday 0) count with
# its own third
ct_monday_totals = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 0)
    & (df9['CT_Conf_Cases'] != 0)
)
df9.loc[ct_monday_totals, 'CT_Conf_Cases'] = df9.loc[ct_monday_totals, 'CT_third']

In [None]:
# From 2020-07-01 on, fill every zero Sunday (weekday 6) with ct_lag_1
ct_zero_sundays = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 6)
    & (df9['CT_Conf_Cases'] == 0)
)
df9.loc[ct_zero_sundays, 'CT_Conf_Cases'] = df9.loc[ct_zero_sundays, 'ct_lag_1']

In [None]:
# From 2020-07-01 on, fill every zero Saturday (weekday 5) with ct_lag_2
ct_zero_saturdays = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 5)
    & (df9['CT_Conf_Cases'] == 0)
)
df9.loc[ct_zero_saturdays, 'CT_Conf_Cases'] = df9.loc[ct_zero_saturdays, 'ct_lag_2']

Now let's address any long weekends where Monday was a bank holiday by taking a third of that week's Tuesday case count and replacing Saturday, Sunday, and Monday count with that value instead

In [None]:
# Long weekends: only a Tuesday (weekday 1) that directly follows a
# zero-count Monday carries an accumulated total. Without the previous-day
# check, every non-zero Tuesday from 2020-07-01 on would be cut to a third.
# Assumes one row per calendar day, so shift(1) is the preceding Monday.
previous_day_cases = df9['CT_Conf_Cases'].shift(1)
ct_post_holiday_tue = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 1)
    & (df9['CT_Conf_Cases'] != 0)
    & (previous_day_cases == 0)
)
df9.loc[ct_post_holiday_tue, 'CT_Conf_Cases'] = df9.loc[ct_post_holiday_tue, 'CT_third']

In [None]:
# From 2020-07-01 on, fill every Monday (weekday 0) that is still zero with
# ct_lag_1
ct_zero_mondays = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 0)
    & (df9['CT_Conf_Cases'] == 0)
)
df9.loc[ct_zero_mondays, 'CT_Conf_Cases'] = df9.loc[ct_zero_mondays, 'ct_lag_1']

In [None]:
# From 2020-07-01 on, fill every Sunday (weekday 6) that is still zero with
# ct_lag_2
ct_still_zero_sundays = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 6)
    & (df9['CT_Conf_Cases'] == 0)
)
df9.loc[ct_still_zero_sundays, 'CT_Conf_Cases'] = df9.loc[ct_still_zero_sundays, 'ct_lag_2']

In [None]:
# From 2020-07-01 on, fill every Saturday (weekday 5) that is still zero with
# ct_lag_3
ct_still_zero_saturdays = (
    (df9.index >= pd.Timestamp('2020-07-01'))
    & (df9['weekday'] == 5)
    & (df9['CT_Conf_Cases'] == 0)
)
df9.loc[ct_still_zero_saturdays, 'CT_Conf_Cases'] = df9.loc[ct_still_zero_saturdays, 'ct_lag_3']

In [None]:
# Remaining zero-count Connecticut rows after the updates, split by weekday
# (5 = Saturday, 6 = Sunday)
ct_zero_days = df9[df9['CT_Conf_Cases'] == 0]
sunday_ind_ct = ct_zero_days[ct_zero_days['weekday'] == 6].index  # Sunday zero values
saturday_ind_ct = ct_zero_days[ct_zero_days['weekday'] == 5].index  # Saturday zero values
# Zero values that are neither Saturday nor Sunday
other_ind_ct = ct_zero_days[~ct_zero_days['weekday'].isin([5, 6])].index

In [None]:
# Updated Connecticut daily cases with a vertical line on every remaining
# zero: red = Sunday, orange = Saturday, green = any other weekday
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df9.index, df9['CT_Conf_Cases'], color='blue')
    ax.set_ylabel('Connecticut COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ((sunday_ind_ct, 'red'),
                                   (saturday_ind_ct, 'orange'),
                                   (other_ind_ct, 'green')):
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Connecticut COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

## Vermont

In [None]:
# Remove the Connecticut helper columns now that the CT counts are updated
ct_helper_cols = ['CT_third', 'ct_lag_1', 'ct_lag_2', 'ct_lag_3']
df9 = df9.drop(columns=ct_helper_cols)

In [None]:
# Vermont helper column: one third of each day's count, rounded to a whole
# number
df9['VT_third'] = (df9['VT_Conf_Cases'] / 3).round()

In [None]:
# Negative shifts pull values from later rows: vt_lag_k holds the VT_third
# value found k rows further down the frame.
vt_lags = {f'vt_lag_{k}': -k for k in (1, 2, 3)}
for lag_col, offset in vt_lags.items():
    df9[lag_col] = df9['VT_third'].shift(periods=offset)

In [None]:
# Between 2021-06-01 and 2021-08-24 (inclusive), replace every non-zero
# Monday (weekday 0) count with its own third
vt_monday_totals = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 0)
    & (df9['VT_Conf_Cases'] != 0)
)
df9.loc[vt_monday_totals, 'VT_Conf_Cases'] = df9.loc[vt_monday_totals, 'VT_third']

In [None]:
# Between 2021-06-01 and 2021-08-24 (inclusive), fill every zero Sunday
# (weekday 6) with vt_lag_1
vt_zero_sundays = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 6)
    & (df9['VT_Conf_Cases'] == 0)
)
df9.loc[vt_zero_sundays, 'VT_Conf_Cases'] = df9.loc[vt_zero_sundays, 'vt_lag_1']

In [None]:
# Between 2021-06-01 and 2021-08-24 (inclusive), fill every zero Saturday
# (weekday 5) with vt_lag_2
vt_zero_saturdays = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 5)
    & (df9['VT_Conf_Cases'] == 0)
)
df9.loc[vt_zero_saturdays, 'VT_Conf_Cases'] = df9.loc[vt_zero_saturdays, 'vt_lag_2']

Now let's address any long weekends where Monday was a bank holiday by taking a third of that week's Tuesday case count and replacing Saturday, Sunday, and Monday count with that value instead

In [None]:
# Long weekends: only a Tuesday (weekday 1) that directly follows a
# zero-count Monday carries an accumulated total. Without the previous-day
# check, every non-zero Tuesday in the window would be cut to a third.
# Assumes one row per calendar day, so shift(1) is the preceding Monday.
previous_day_cases = df9['VT_Conf_Cases'].shift(1)
vt_post_holiday_tue = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 1)
    & (df9['VT_Conf_Cases'] != 0)
    & (previous_day_cases == 0)
)
df9.loc[vt_post_holiday_tue, 'VT_Conf_Cases'] = df9.loc[vt_post_holiday_tue, 'VT_third']

In [None]:
# Between 2021-06-01 and 2021-08-24 (inclusive), fill every Monday
# (weekday 0) that is still zero with vt_lag_1
vt_zero_mondays = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 0)
    & (df9['VT_Conf_Cases'] == 0)
)
df9.loc[vt_zero_mondays, 'VT_Conf_Cases'] = df9.loc[vt_zero_mondays, 'vt_lag_1']

In [None]:
# Between 2021-06-01 and 2021-08-24 (inclusive), fill every Sunday
# (weekday 6) that is still zero with vt_lag_2
vt_still_zero_sundays = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 6)
    & (df9['VT_Conf_Cases'] == 0)
)
df9.loc[vt_still_zero_sundays, 'VT_Conf_Cases'] = df9.loc[vt_still_zero_sundays, 'vt_lag_2']

In [None]:
# Between 2021-06-01 and 2021-08-24 (inclusive), fill every Saturday
# (weekday 5) that is still zero with vt_lag_3
vt_still_zero_saturdays = (
    (df9.index >= pd.Timestamp('2021-06-01'))
    & (df9.index <= pd.Timestamp('2021-08-24'))
    & (df9['weekday'] == 5)
    & (df9['VT_Conf_Cases'] == 0)
)
df9.loc[vt_still_zero_saturdays, 'VT_Conf_Cases'] = df9.loc[vt_still_zero_saturdays, 'vt_lag_3']

In [None]:
# Remaining zero-count Vermont rows after the updates, split by weekday
# (5 = Saturday, 6 = Sunday)
vt_zero_days = df9[df9['VT_Conf_Cases'] == 0]
sunday_ind_vt = vt_zero_days[vt_zero_days['weekday'] == 6].index  # Sunday zero values
saturday_ind_vt = vt_zero_days[vt_zero_days['weekday'] == 5].index  # Saturday zero values
# Zero values that are neither Saturday nor Sunday
other_ind_vt = vt_zero_days[~vt_zero_days['weekday'].isin([5, 6])].index

In [None]:
# Vermont case counts with a vertical line on every remaining zero-case date:
# red = Sunday, orange = Saturday, green = any other weekday.
vt_zero_markers = [
    (sunday_ind_vt, 'red'),
    (saturday_ind_vt, 'orange'),
    (other_ind_vt, 'green'),
]
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df9.index, df9['VT_Conf_Cases'], color='blue')
    ax.set_ylabel('Vermont COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in vt_zero_markers:
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Vermont COV cases', fontsize=16)
    sns.despine(left=True, bottom=True)
    plt.show()

## Massachusetts

In [None]:
# The Vermont helper columns are no longer needed once its counts are imputed
vt_helper_columns = ['VT_third', 'vt_lag_1', 'vt_lag_2', 'vt_lag_3']
df9 = df9.drop(columns=vt_helper_columns)

In [None]:
# Create Massachusetts-specific helper columns
# MA_third: one third of each day's case count, rounded to a whole number
df9['MA_third'] = (df9['MA_Conf_Cases'] / 3).round()

In [None]:
# ma_lags: ma_lag_k holds the MA_third value found k rows further down
# (a negative shift pulls later rows upwards)
ma_lags = {'ma_lag_1': -1, 'ma_lag_2': -2, 'ma_lag_3': -3}
ma_third = df9['MA_third']
for lag_column, periods in ma_lags.items():
    df9[lag_column] = ma_third.shift(periods=periods)

In [None]:
# From 2021-07-01 onwards, every Monday with a non-zero Massachusetts count
# is replaced by a third of that count (MA_third).
# A single boolean mask replaces the row-by-row iterrows() loop.
ma_nonzero_monday = (
    (df9['weekday'] == 0)
    & (df9['MA_Conf_Cases'] != 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_nonzero_monday, 'MA_Conf_Cases'] = df9.loc[ma_nonzero_monday, 'MA_third']

In [None]:
# From 2021-07-01 onwards, Sundays reporting zero Massachusetts cases take
# ma_lag_1, i.e. the MA_third value of the following row.
# A single boolean mask replaces the row-by-row iterrows() loop.
ma_zero_sunday = (
    (df9['weekday'] == 6)
    & (df9['MA_Conf_Cases'] == 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_zero_sunday, 'MA_Conf_Cases'] = df9.loc[ma_zero_sunday, 'ma_lag_1']

In [None]:
# From 2021-07-01 onwards, Saturdays reporting zero Massachusetts cases take
# ma_lag_2, i.e. the MA_third value two rows further down.
# A single boolean mask replaces the row-by-row iterrows() loop.
ma_zero_saturday = (
    (df9['weekday'] == 5)
    & (df9['MA_Conf_Cases'] == 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_zero_saturday, 'MA_Conf_Cases'] = df9.loc[ma_zero_saturday, 'ma_lag_2']

Now let's address any long weekends where Monday was a bank holiday by taking a third of that week's Tuesday case count and replacing the Saturday, Sunday, and Monday counts with that value instead.

In [None]:
# Long weekends only: a Tuesday's count is replaced by its third (MA_third)
# when the day before it reported zero cases. Without the "previous day is
# zero" guard every non-zero Tuesday from 2021-07-01 onwards was cut to a
# third, even after an ordinary weekend.
# NOTE(review): shift(1) assumes df9 has one row per consecutive calendar day
# at this point — confirm.
ma_cases = df9['MA_Conf_Cases']
ma_long_weekend_tuesday = (
    (df9['weekday'] == 1)
    & (ma_cases != 0)
    & (ma_cases.shift(1) == 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_long_weekend_tuesday, 'MA_Conf_Cases'] = df9.loc[ma_long_weekend_tuesday, 'MA_third']

In [None]:
# From 2021-07-01 onwards, Mondays reporting zero Massachusetts cases take
# ma_lag_1, i.e. the MA_third value of the following row.
# A single boolean mask replaces the row-by-row iterrows() loop.
ma_zero_monday = (
    (df9['weekday'] == 0)
    & (df9['MA_Conf_Cases'] == 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_zero_monday, 'MA_Conf_Cases'] = df9.loc[ma_zero_monday, 'ma_lag_1']

In [None]:
# Sundays (from 2021-07-01) that are still zero after the earlier passes take
# ma_lag_2, i.e. the MA_third value two rows further down.
# A single boolean mask replaces the row-by-row iterrows() loop.
ma_zero_sunday_long = (
    (df9['weekday'] == 6)
    & (df9['MA_Conf_Cases'] == 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_zero_sunday_long, 'MA_Conf_Cases'] = df9.loc[ma_zero_sunday_long, 'ma_lag_2']

In [None]:
# Saturdays (from 2021-07-01) that are still zero after the earlier passes
# take ma_lag_3, i.e. the MA_third value three rows further down.
# A single boolean mask replaces the row-by-row iterrows() loop.
ma_zero_saturday_long = (
    (df9['weekday'] == 5)
    & (df9['MA_Conf_Cases'] == 0)
    & (df9.index >= pd.Timestamp('2021-07-01'))
)
df9.loc[ma_zero_saturday_long, 'MA_Conf_Cases'] = df9.loc[ma_zero_saturday_long, 'ma_lag_3']

In [None]:
# Dates on which Massachusetts still reports zero cases, grouped by day of the week
ma_is_zero = df9['MA_Conf_Cases'] == 0
ma_is_saturday = df9['weekday'] == 5
ma_is_sunday = df9['weekday'] == 6
sunday_ind_ma = df9.loc[ma_is_sunday & ma_is_zero].index  # Sunday zero values
saturday_ind_ma = df9.loc[ma_is_saturday & ma_is_zero].index  # Saturday zero values
# Zero values that fall on neither Saturday nor Sunday
other_ind_ma = df9.loc[~ma_is_saturday & ~ma_is_sunday & ma_is_zero].index

In [None]:
# Massachusetts case counts with a vertical line on every remaining zero-case
# date: red = Sunday, orange = Saturday, green = any other weekday.
ma_zero_markers = [
    (sunday_ind_ma, 'red'),
    (saturday_ind_ma, 'orange'),
    (other_ind_ma, 'green'),
]
with sns.axes_style("ticks"):
    fig, ax = plt.subplots()
    ax.plot(df9.index, df9['MA_Conf_Cases'], color='blue')
    ax.set_ylabel('Massachusetts COVID cases')
    plt.xticks(rotation=45)
    for zero_dates, line_color in ma_zero_markers:
        for zero_date in zero_dates:
            ax.axvline(x=zero_date, color=line_color, alpha=0.7)
    ax.set_title('Zero values by weekday of Massachusetts COV cases', fontsize=20)
    sns.despine(left=True, bottom=True)
    plt.show()

In [None]:
# Get rid of the Massachusetts-specific helper columns
ma_helper_columns = ['MA_third', 'ma_lag_1', 'ma_lag_2', 'ma_lag_3']
df9 = df9.drop(columns=ma_helper_columns)

## Bank Holidays

Now that the missing weekend values have been addressed, I'll identify US bank holidays that correspond to missing case counts across most or all of the states. After cross-checking with the US bank holidays listed in the hyperlinks below, I discovered 5 bank holidays on which at least 50% of the states did not report any case values.

Full list of 2020 US Holidays [here](https://www.officeholidays.com/countries/usa/2020).

Full list of 2021 US Holidays [here](https://www.officeholidays.com/countries/usa/2021).

In [None]:
# US bank holidays in 2020-2021 whose rows are removed from the data
US_bank_holidays = [
    '2020-12-25',  # Christmas Day
    '2020-11-26',  # Thanksgiving
    '2021-01-01',  # New Year's Day
    '2021-05-31',  # Memorial Day
    '2021-09-06',  # Labor Day
]

In [None]:
# Drop every row whose date appears in the US_bank_holidays list
holiday_rows = df9.index[df9.index.isin(US_bank_holidays)]
df9 = df9.drop(index=holiday_rows)

In [None]:
#df9.loc['2021-09-06']

In [None]:
# Dates on which each state still reports zero cases. Every state is checked
# independently, so a date on which several states are zero is recorded for
# each of them; the earlier if/elif chain stopped at the first matching state
# and silently skipped the others.
remaining_missing_MA = df9.loc[df9['MA_Conf_Cases'] == 0].index.tolist()
remaining_missing_ME = df9.loc[df9['ME_Conf_Cases'] == 0].index.tolist()
remaining_missing_CT = df9.loc[df9['CT_Conf_Cases'] == 0].index.tolist()
remaining_missing_VT = df9.loc[df9['VT_Conf_Cases'] == 0].index.tolist()

In [None]:
# Number of dates collected in remaining_missing_MA (Massachusetts zero-case dates)
len(remaining_missing_MA)

In [None]:
# Number of dates collected in remaining_missing_ME (Maine zero-case dates)
len(remaining_missing_ME)

In [None]:
# Number of dates collected in remaining_missing_VT (Vermont zero-case dates)
len(remaining_missing_VT)

In [None]:
# Number of dates collected in remaining_missing_CT (Connecticut zero-case dates)
len(remaining_missing_CT)

In [None]:
# Replace the remaining zero case counts with np.nan so they can be imputed.
# .loc writes into df9 directly; the chained form `df9.col[rows] = value`
# is not guaranteed to update the frame and triggers SettingWithCopyWarning.
df9.loc[remaining_missing_MA, 'MA_Conf_Cases'] = np.nan
df9.loc[remaining_missing_CT, 'CT_Conf_Cases'] = np.nan
df9.loc[remaining_missing_VT, 'VT_Conf_Cases'] = np.nan
df9.loc[remaining_missing_ME, 'ME_Conf_Cases'] = np.nan

In [None]:
# Fill each NaN with the mean of the values directly before and after it.
# The mean is itself NaN when either neighbour is missing, so such cells stay NaN.
state_col_list = ['CT_Conf_Cases', 'VT_Conf_Cases', 'ME_Conf_Cases', 'MA_Conf_Cases']
for col in state_col_list:
    cases = df9[col]
    neighbour_mean = (cases.shift(1) + cases.shift(-1)) / 2
    df9[col] = cases.fillna(neighbour_mean)

Let's double check for any remaining `NaN`s, as many machine learning models cannot work with them. We should expect some `NaN`s to remain: towards the start of the pandemic there may have been multiple days in a row with zero cases, and all of those zeroes were converted to `NaN`. Because we filled each `NaN` with the average of the values immediately before and after it, a `NaN` whose neighbour is also `NaN` (or that sits at the very start or end of the series) has an undefined average, which leaves the `NaN` in place. Let's see how many `NaN`s remain and, as long as there aren't too many, convert them back to zeroes.

In [None]:
# How many NaN values remain in each column?
df9.isna().sum()

Great! We only have 14 `NaN`s left. Let's fill these with zeroes before saving the file.

In [None]:
# Fill the remaining NaNs with zero, modifying df9 in place
df9.fillna(0, inplace=True)

In [None]:
# Double check all NaNs are gone: every per-column count should now be 0
df9.isna().sum()

In [None]:
#df9.loc['ME_Conf_Cases','2021-07-01']

In [None]:
# Recheck summary statistics now that the US_bank_holidays rows are dropped
# and the remaining NaNs have been filled
df9.describe()

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy), so
# in-place changes made through df10 are also visible through df9. The later
# df10.reset_index(inplace=True) / df9.set_index('date', inplace=True) pair
# relies on that.
df10 = df9

In [None]:
# Boxplots of the distribution of daily case counts per weekday, one panel per
# state. The panels are drawn in a loop on explicit axes so that each title is
# guaranteed to match the column plotted underneath it.
weekday_panels = [
    ('Massachusetts', 'MA_Conf_Cases'),
    ('Connecticut', 'CT_Conf_Cases'),
    ('Vermont', 'VT_Conf_Cases'),
    ('Maine', 'ME_Conf_Cases'),
]
with sns.axes_style("white"):
    fig, axes = plt.subplots(2, 2, sharex=True, figsize=(16, 12))
    fig.suptitle('Distribution of case counts per weekday', fontsize=20)
    # axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
    for ax, (state_name, cases_col) in zip(axes.flat, weekday_panels):
        sns.boxplot(ax=ax, data=df10, x='weekday', y=cases_col, palette='Blues')
        ax.set_title(state_name, fontsize=16)
        # Label the ticks on this panel explicitly instead of relying on
        # plt.xticks, which only addresses the current axes
        ax.set_xticks(weekday_nums)
        ax.set_xticklabels(weekdays)
    plt.show()

In [None]:
# Move the index back into a regular column (in place) so the month can be
# read from df10['date'] in the plots below. df10 is the same object as df9,
# so df9 loses its date index here as well.
df10.reset_index(inplace=True)

In [None]:
#months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
#month_nums = np.arange(1,13)

In [None]:
# Boxplots of the distribution of daily case counts per calendar month, one
# panel per state. The month of every row is computed once with the
# vectorized .dt accessor instead of rebuilding a Python list for each panel.
month_of_row = df10['date'].dt.month
month_panels = [
    ('Massachusetts', 'MA_Conf_Cases'),
    ('Connecticut', 'CT_Conf_Cases'),
    ('Vermont', 'VT_Conf_Cases'),
    ('Maine', 'ME_Conf_Cases'),
]
with sns.axes_style("white"):
    fig, axes = plt.subplots(2, 2, sharex=False, figsize=(16, 12))
    fig.suptitle('Distribution of case counts per month', fontsize=20)
    # axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
    for ax, (state_name, cases_col) in zip(axes.flat, month_panels):
        sns.boxplot(ax=ax, data=df10, x=month_of_row, y=cases_col, palette='flare')
        # Title and x label are set after plotting so they always win over
        # any label seaborn derives from the x variable
        ax.set_title(state_name, fontsize=16)
        ax.set_xlabel('Month')
    plt.show()

In [None]:
# Let's check out how each state compares with each other: 
#with sns.axes_style("white"):
    #fig, ax = plt.subplots(figsize=(12,8))
    #ax = sns.boxplot(data=df10[['MA_Conf_Cases', 'CT_Conf_Cases','ME_Conf_Cases', 'VT_Conf_Cases']], palette="flare")
    #plt.title('Distribution of daily COV counts by state', fontsize=16)
    #ax.set_ylabel('Daily COVID counts', fontsize=12)


In [None]:
# Restore 'date' as the DateTimeIndex (in place). The 'date' column exists on
# df9 because df10.reset_index(inplace=True) acted on this same object.
df9.set_index('date', inplace=True)

In [None]:
# Save the cleaned data (including the date index) to a new csv file
df9.to_csv('state_cov_weather2.csv')

## Plots:

With so many twin axes to plot, I'll define a function to help plot all of the twin-axes time series data:

In [None]:
def plot_timeseries(axes, x, y, color, xlabel, ylabel):
    """Draw one series on ``axes`` and colour its y-axis ticks like the line.

    Parameters
    ----------
    axes : object providing ``plot``, ``set_xlabel``, ``set_ylabel`` and
        ``tick_params`` (e.g. a matplotlib Axes); drawn on in place.
    x, y : values forwarded unchanged to ``axes.plot``.
    color : colour used for both the line and the y-axis ticks.
    xlabel, ylabel : text for the x and y axis labels.

    Returns
    -------
    None
    """
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.plot(x, y, color=color)
    # Matching tick colour makes it clear which y-axis belongs to which line
    # when two axes share one figure.
    axes.tick_params(axis='y', colors=color)

## Connecticut

In [None]:
#sns.set(rc={'figure.figsize':(11.7,8.27)})
# Use a wide 18x8 default figure size for the time-series plots that follow
sns.set(rc={'figure.figsize':(18,8)})

In [None]:
# Connecticut: confirmed cases (left axis, blue) against average temperature
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['CT_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
temp_ax = cases_ax.twinx()
plot_timeseries(temp_ax, df9.index, df9['CT_Avg_Temp(F)'], 'red', 'Time', 'Average Temperature (F)')
temp_ax.set_title('Connecticut temp vs conf cases over time')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Connecticut: confirmed cases (left axis, blue) against precipitation
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['CT_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
prcp_ax = cases_ax.twinx()
plot_timeseries(prcp_ax, df9.index, df9['CT_PRCP(mm)'], 'red', 'Time', 'CT_PRCP(mm)')
prcp_ax.set_title('Connecticut prcp vs confirmed cases over time')
plt.xticks(rotation=90)
plt.show()

***

## Maine

In [None]:
# Maine: confirmed cases (left axis, blue) against average temperature
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['ME_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
temp_ax = cases_ax.twinx()
plot_timeseries(temp_ax, df9.index, df9['ME_Avg_Temp(F)'], 'red', 'Time', 'Average Temperature (F)')
temp_ax.set_title('Maine Temp vs Conf Cases')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Maine: confirmed cases (left axis, blue) against precipitation
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['ME_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
prcp_ax = cases_ax.twinx()
plot_timeseries(prcp_ax, df9.index, df9['ME_PRCP(mm)'], 'red', 'Time', 'ME_PRCP(mm)')
prcp_ax.set_title('Maine prcp vs confirmed cases over time')
plt.xticks(rotation=90)
plt.show()

## Massachusetts

In [None]:
# Massachusetts: confirmed cases (left axis, blue) against average temperature
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['MA_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
temp_ax = cases_ax.twinx()
plot_timeseries(temp_ax, df9.index, df9['MA_Avg_Temp(F)'], 'red', 'Time', 'MA_Avg_Temp(F)')
temp_ax.set_title('Massachusetts TAVG vs confirmed cases over time')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Massachusetts: confirmed cases (left axis, blue) against precipitation
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['MA_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
prcp_ax = cases_ax.twinx()
plot_timeseries(prcp_ax, df9.index, df9['MA_PRCP(mm)'], 'red', 'Time', 'MA_PRCP(mm)')
prcp_ax.set_title('Massachusetts prcp vs confirmed cases over time')
plt.xticks(rotation=90)
plt.show()

## Vermont

In [None]:
# Vermont: confirmed cases (left axis, blue) against average temperature
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['VT_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
temp_ax = cases_ax.twinx()
plot_timeseries(temp_ax, df9.index, df9['VT_Avg_Temp(F)'], 'red', 'Time', 'VT_Avg_Temp(F)')
temp_ax.set_title('Vermont TAVG vs confirmed cases over time')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Vermont: confirmed cases (left axis, blue) against precipitation
# (right axis, red) over time
fig, cases_ax = plt.subplots()
plot_timeseries(cases_ax, df9.index, df9['VT_Conf_Cases'], 'blue', 'Time', 'Confirmed Cases')
plt.xticks(rotation=90)
# Second y-axis drawn over the same dates
prcp_ax = cases_ax.twinx()
plot_timeseries(prcp_ax, df9.index, df9['VT_PRCP(mm)'], 'red', 'Time', 'VT_PRCP(mm)')
prcp_ax.set_title('Vermont prcp vs confirmed cases over time')
plt.xticks(rotation=90)
plt.show()

## Perform Shapiro-Wilk test for normality:

In [None]:
# Convert the dataframe to a NumPy array to perform the Shapiro-Wilk test.
# NOTE(review): to_numpy() returns a single 2-D array holding every column of
# df9, so columns measured in different units end up in the same array —
# confirm that one pooled test (rather than one test per column) is intended.
np_cov_weather9 = df9.to_numpy()

In [None]:
# Apply the Shapiro-Wilk test to the array built from the whole dataframe
shapiro_test = stats.shapiro(np_cov_weather9)

In [None]:
# Print the test statistic and the p-value of the Shapiro-Wilk result
print("Shapiro test statistic is: ", shapiro_test.statistic)
print("Shapiro p-value is: ", shapiro_test.pvalue)

## Predictive Power Score:

In [None]:
# Calculate the predictive power score matrix for the columns of df9
pps = ppscore.matrix(df9)

In [None]:
# Show the score table as plain text
print(pps)

In [None]:
# List the dtype of every column in df9
print(df9.dtypes)

In [None]:
# Pivot the score table already computed in `pps` into a y-by-x grid for the
# heatmap, instead of running the expensive ppscore.matrix(df9) a second time.
matrix_df = pps.pivot(columns='x', index='y', values='ppscore')

In [None]:
# Heatmap of the pivoted predictive power scores (rows: y, columns: x),
# with each cell annotated with its value
sns.set(rc={'figure.figsize':(14.5,11.5)})
sns.heatmap(matrix_df, annot= True)

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of df9
sns.set(rc={'figure.figsize': (15, 12)})
corr_all = df9.corr()
sns.heatmap(corr_all, annot=True, square=True)

In [None]:
# Use hierarchical clustering with heatmap: clustermap
# clustermap creates its own figure and ignores the rc 'figure.figsize'
# setting, so the intended 15x12 size is passed to it directly.
sns.set(rc={'figure.figsize':(15,12)})
sns.clustermap(df9.corr(), figsize=(15, 12))

The clustermap is much easier to interpret than the unclustered heatmap, and visually points us immediately to the correlation between average temperatures and confirmed case counts. The clustermap has also grouped Massachusetts values most closely with Connecticut values for each observation, as well as Vermont values most closely with Maine values. This is not surprising, as this is also the geographical grouping of the states.

In [None]:
# Connecticut-only view: temperature, precipitation and confirmed cases
ct_columns = ['CT_Avg_Temp(F)', 'CT_PRCP(mm)', 'CT_Conf_Cases']
CT = df9[ct_columns]

In [None]:
# Annotated correlation heatmap for the three Connecticut columns
sns.set(rc={'figure.figsize': (9, 7)})
ct_corr = CT.corr()
sns.heatmap(ct_corr, annot=True, cmap='rocket', square=True)

In [None]:
# Massachusetts-only view: temperature, precipitation and confirmed cases
ma_columns = ['MA_Avg_Temp(F)', 'MA_PRCP(mm)', 'MA_Conf_Cases']
MA = df9[ma_columns]

In [None]:
# Annotated correlation heatmap for the three Massachusetts columns
ma_corr = MA.corr()
sns.heatmap(ma_corr, annot=True, cmap='rocket', square=True)

In [None]:
# Vermont-only view: temperature, precipitation and confirmed cases
vt_columns = ['VT_Avg_Temp(F)', 'VT_PRCP(mm)', 'VT_Conf_Cases']
VT = df9[vt_columns]

In [None]:
# Annotated correlation heatmap for the three Vermont columns
vt_corr = VT.corr()
sns.heatmap(vt_corr, annot=True, cmap='rocket', square=True)

In [None]:
# Maine-only view: temperature, precipitation and confirmed cases
me_columns = ['ME_Avg_Temp(F)', 'ME_PRCP(mm)', 'ME_Conf_Cases']
ME = df9[me_columns]

In [None]:
# Annotated correlation heatmap for the three Maine columns
me_corr = ME.corr()
sns.heatmap(me_corr, annot=True, cmap='rocket', square=True)