In [2]:
%reset -f
%config InlineBackend.figure_format = 'svg'

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import pandas as pd
import seaborn as sns
import datetime
import re
from IPython.display import clear_output

# import classes
import Coviddataclass as cd

# Shared font sizes for plot text, largest to smallest, so figures can use
# consistent text sizes by referring to these names.
BIG_TEXT   = 18
MED_TEXT   = 14
SMALL_TEXT = 10

In [3]:
# Load and clean data.
# df  -- case data, fetched over the network from data.world (used as `case_df` below)
# df2 -- policy updates, read from a CSV expected in the working directory
#        (used as `policy_df` below)
df  = pd.read_csv('https://query.data.world/s/jbgdegbanosfmgly7etz2gxqsbhflk')
df2 = pd.read_csv('state_policy_updates_20201114_0719.csv')
data_manager = cd.Coviddataclass()

# clean_data returns the cleaned versions of both frames; df and df2 are rebound,
# so the raw frames are no longer available after this cell.
# NOTE(review): running this cell prints pandas "set on a copy of a slice" warnings
# for chained assignments to 'new_cases_7day' / 'new_deaths_7day' -- presumably inside
# Coviddataclass.clean_data; confirm and switch those to df.loc[mask, col] = ...
df, df2 = data_manager.clean_data(df, df2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['new_cases_7day'][df['date'] < pd.to_datetime('2020-01-30', format='%Y-%m-%d')] =\
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['new_deaths_7day'][df['date'] < pd.to_datetime('2020-01-30', format='%Y-%m-%d')] =\


Correlate COVID-19 policy responses at the state and local (county) level with the change in the number of new cases and deaths after a configurable waiting period (14 days by default). The 7-day average on each date is used instead of the raw daily count to reduce the influence of outliers.

In [8]:
def _rate_on(case_frame, date, column):
    """Return `column` on `date` as a NumPy float, or NaN if `case_frame` has no row for that date.

    `case_frame` must be indexed by date. If several rows share the date, the first one is used.
    """
    matches = case_frame.loc[case_frame.index == date, column].values
    return np.float64(matches[0]) if matches.size else np.nan


def calculate_deltas(measure_period=14, filtered_policies=None, case_df=df, policy_df=df2):
    """For every state- and county-level policy, measure how case and death rates changed afterwards.

    inputs:
    measure_period    -- number of days to wait after the policy date before measuring (default 14)
    filtered_policies -- optional collection of `policy_type` values; when given, only policies of
                         those types are processed and returned
    case_df           -- case data handed to `data_manager.get_cases`
    policy_df         -- policy data; the columns read here are `policy_type`, `policy_level`,
                         `state_id`, `county` and `date`

    Note: the `case_df` / `policy_df` defaults are bound when this cell is executed, so re-run the
    cell (or pass the frames explicitly) after reloading `df` / `df2`.

    returns:
    A copy of the (optionally filtered) policy dataframe with 4 appended columns:
    `case_{measure_period}_day_delta`, `case_{measure_period}_day_accel`,
    `death_{measure_period}_day_delta` and `death_{measure_period}_day_accel`.
    "delta" is the value at (policy date + measure_period) minus the value on the policy date.
    "accel" is the one-day change at the end minus the one-day change at the start, divided by
    measure_period. Both are computed from the `new_cases_7day_1e6` / `new_deaths_7day_1e6` columns.
    Entries stay NaN when the measurement date is too recent or a needed date has no case data.
    """

    # initialize wait period before measurement
    wait_period = datetime.timedelta(days=measure_period)  # time in days to watch change in case / death number
    day_1 = datetime.timedelta(days=1)

    # filter policies if needed; iterate over the selection and write results into an explicit
    # copy, so the caller's frame is never modified and pandas has no ambiguous slice to warn about
    if filtered_policies is not None:
        all_policies = policy_df.loc[policy_df['policy_type'].isin(filtered_policies)]
    else:
        all_policies = policy_df
    policy_deltas = all_policies.copy()

    # initially fill the result columns with nan
    for label in ("case", "death"):
        for kind in ("delta", "accel"):
            policy_deltas[f"{label}_{measure_period}_day_{kind}"] = np.nan

    # load all state-aggregated datasets into a dictionary
    print("aggregating state data")
    state_cases_dict = {
        state: data_manager.get_cases(df=case_df, level="state", state=state)
        for state in policy_df['state_id'].unique()
    }
    print("aggregating state data complete")

    # county frames are built on first use and reused for later policies in the same county
    # (assumes get_cases returns the same data for the same arguments -- TODO confirm)
    county_cases_dict = dict()

    # measurements newer than this are skipped (gives the case data time to update)
    latest_measurable = pd.Timestamp.today() - datetime.timedelta(days=3)
    n_policies = len(all_policies.index)

    # loop through all policies in the policy dataset
    for i, (index, data) in enumerate(all_policies.iterrows(), start=1):

        # output status updates since this loop takes a long time
        if i % 100 == 0:
            print(f"record {i}/{n_policies}")

        policy_date = pd.to_datetime(data['date'])
        measure_date = policy_date + wait_period

        # pass if the measure date is within 3 days of today; the row keeps its NaN values
        if measure_date > latest_measurable:
            continue

        # handle state info
        if data.policy_level == "state":
            policy_case_df = state_cases_dict[data.state_id]

        # handle county info
        else:
            county_key = (data.state_id, data.county)
            if county_key not in county_cases_dict:
                county_cases_dict[county_key] = data_manager.get_cases(
                    df=case_df, level="county", county=data.county, state=data.state_id
                )
            policy_case_df = county_cases_dict[county_key]

        for label, column in (("case", "new_cases_7day_1e6"), ("death", "new_deaths_7day_1e6")):
            start      = _rate_on(policy_case_df, policy_date, column)
            start_next = _rate_on(policy_case_df, policy_date + day_1, column)
            end        = _rate_on(policy_case_df, measure_date, column)
            end_next   = _rate_on(policy_case_df, measure_date + day_1, column)

            # "velocity": change in the 7-day rate over the measure period
            policy_deltas.at[index, f"{label}_{measure_period}_day_delta"] = end - start

            # "acceleration":
            # (velocity at end of measure period - velocity at start) / measure period,
            # where each velocity is the one-day forward difference (next day - that day)
            velocity_start = start_next - start
            velocity_end = end_next - end
            policy_deltas.at[index, f"{label}_{measure_period}_day_accel"] = (
                (velocity_end - velocity_start) / measure_period
            )

    return policy_deltas

In [None]:
# Run with the defaults: 14-day measurement window, no policy-type filter, and the
# cleaned df / df2 frames. Progress is printed every 100 policies.
policy_deltas = calculate_deltas()

aggregating state data
aggregating state data complete
record 100/3415
record 200/3415
record 300/3415
record 400/3415
record 500/3415
record 600/3415
record 700/3415
record 800/3415
record 900/3415
record 1000/3415
record 1100/3415
record 1200/3415
record 1300/3415
record 1400/3415
record 1500/3415
record 1600/3415


In [None]:
# Preview the first rows of the policy table (df2 as returned by clean_data).
df2.head()