Create a regression model to estimate the number of new cases / deaths given a set of policy implementations. 

# 0. Imports

In [1]:
import pandas as pd
from datetime import datetime as dt
from datetime import timedelta
import numpy as np
from covid_project.data_utils import (clean_covid_data, clean_policy_data)
                                      #calculate_deltas, generate_state_case_dict,
                                      #calc_delta_stats)
#from covid_project.plotting_funcs import plot_delta_stats
from tqdm.notebook import tqdm
import time

# 1. Planning / explore datasets

After several iterations on data preprocessing, I settled on this schema:

Generate a model for every unique policy. The dataset for each policy will have this schema:

| info<br>state |  <br>county  | <br>date | <br>num_new_cases | policy name<br>0-2| <br>3-5 | <br>6-999|
| ------------- | ------------ | -------  | ----------------- | ------ | ------ | ------ |
| state   | county  | date - 1 | # of new cases | 0 | 0 | 0 |
| state   | county  | policy enacted today | # of new cases | 1 | 0 | 0 |
| state   | county  | date + 1 | # of new cases | 1 | 0 | 0 |
| state   | county  | date + 2 | # of new cases | 1 | 0 | 0 |
| state   | county  | date + 3 | # of new cases | 0 | 1 | 0 |
| state   | county  | date + 4 | # of new cases | 0 | 1 | 0 |
| state   | county  | date + 5 | # of new cases | 0 | 1 | 0 |
| state   | county  | date + 6 | # of new cases | 0 | 0 | 1 |
| state   | county  | date + 7 | # of new cases | 0 | 0 | 1 |
| state   | county  | date + 8 | # of new cases | 0 | 0 | 1 |
|    |   | ... |  |  |  | |
| state   | county  | today | # of new cases | 0 | 0 | 1 |


In [2]:
# Load the case and policy tables through the project's cleaning helpers
# (imported from covid_project.data_utils; their internals are not shown here).
case_data = clean_covid_data()
policy_data = clean_policy_data()

In [3]:
# Maps each raw `policy_type` value to the coarser category used for modelling.
# Several raw types deliberately share one category (e.g. all mask policies
# become 'mask mandate'), so the category spellings must match exactly.
policy_dict = {
    'aca special enrollment period' : 'medical',
    'agriculture' : 'agriculture',
    'allow audio only telehealth' : 'medical',
    'alternative care facilities' : 'medical',
    'bars' : 'entertainment',
    'childcare (k-12)' : 'childcare',
    'colleges & universities' : 'education',
    'construction' : 'construction',
    'day camps/overnight camps' : 'camps',
    'day care' : 'childcare',
    'education' : 'education',
    'election' : 'election',
    'entertainment' : 'entertainment',
    'executive order' : 'executive order',
    'expand medicaid telehealth coverage' : 'medical',
    'food and drink' : 'entertainment',
    'froze mortgage payments' : 'housing',
    'gatherings' : 'gatherings',
    'grace period / security deposit for rent' : 'housing',
    'graduation' : 'graduation',
    'graduation ceremony guidelines' : 'graduation',
    'gyms' : 'gyms',
    'health risk status' : 'medical',
    'higher education' : 'education',
    'houses of worship' : 'houses of worship',
    'libraries' : 'education',
    'mandate face masks in businesses' : 'mask mandate',
    'mandate face masks in public spaces' : 'mask mandate',
    'manufacturing' : 'manufacturing',
    'mask requirement' : 'mask mandate',
    'medical' : 'medical',
    'modify medicaid requirements' : 'medical',
    'museums' : 'education',
    'non-essential businesses' : 'non-essential businesses',
    'nursing home visitations' : 'nursing homes',
    'nursing homes' : 'nursing homes',
    'order freezing utility shut offs' : 'housing',
    'outdoor and recreation' : 'outdoor and recreation',
    'personal care' : 'personal care',
    'public gatherings' : 'gatherings',
    'public health advisory system' : 'medical',
    'quarantine' : 'shelter in place',
    'residential overnight camps' : 'camps',
    'resumed elective medical procedures' : 'medical',
    'shelter in place' : 'shelter in place',
    'state of emergency' : 'state of emergency',
    'state of emergency/funds' : 'state of emergency',
    'stop enforcement of evictions' : 'housing',
    'stop initiation of evictions' : 'housing',
    'suspend elective dental procedures' : 'medical',
    'training jobs' : 'training jobs',
    'travel' : 'travel',
    'traveler from out of state' : 'travel',
    'updated guidelines' : 'updated guidelines',
    'wholesale trade' : 'wholesale trade',
}

In [4]:
def prep_policy_data(policy_data,
                     policy_dict,
                     min_samples=3):
    """Rename/aggregate policy types and drop rarely implemented policies.

    Parameters
    ----------
    policy_data : DataFrame
        Policy table. Must contain the columns 'policy_type', 'start_stop'
        and 'policy_level'. It is not modified.
    policy_dict : dict
        Maps raw 'policy_type' values to the category they should be renamed
        to. Values that are not keys of the dict are kept unchanged.
    min_samples : int, default 3
        Policies whose 'full_policy' label occurs fewer than `min_samples`
        times are removed.

    Returns
    -------
    DataFrame
        A new frame with 'policy_type' remapped and an added 'full_policy'
        column of the form "<policy_type> - <start_stop> - <policy_level>".
    """

    proc_policy_data = policy_data.copy()

    # Remap policy types by assigning the result back to the column. The
    # previous column-level `replace(..., inplace=True)` is chained assignment
    # and does not update the frame under pandas Copy-on-Write. Each value is
    # looked up exactly once, so renames never chain.
    proc_policy_data['policy_type'] = proc_policy_data['policy_type'].map(
        lambda policy_type: policy_dict.get(policy_type, policy_type)
    )

    # Define a new field that includes policy_type, start_stop, and policy_level information
    proc_policy_data['full_policy'] = (proc_policy_data['policy_type'] + " - "
                                       + proc_policy_data['start_stop'] + " - "
                                       + proc_policy_data['policy_level'])

    # Get number of times each policy was implemented.
    num_samples = proc_policy_data['full_policy'].value_counts()
    rare_policies = num_samples[num_samples < min_samples].index

    # Filter with a boolean mask rather than dropping by index label, so a
    # frame with repeated index labels does not lose unrelated rows.
    keep = ~proc_policy_data['full_policy'].isin(rare_policies)
    return proc_policy_data[keep]

# Apply the category mapping, keeping the default min_samples=3 cut-off.
policy_data_prepped = prep_policy_data(policy_data, policy_dict)

In [5]:
# Preview the first rows of the case table to check its columns.
case_data.head()

Unnamed: 0,uid,location_type,fips_code,county,state,date,total_population,cumulative_cases,cumulative_cases_1e6,cumulative_deaths,cumulative_deaths_1e6,new_cases,new_deaths,new_cases_1e6,new_deaths_1e6,new_cases_7day,new_deaths_7day,full_loc_name,new_cases_7day_1e6,new_deaths_7day_1e6
69440,84001001,county,1001,autauga,Alabama,2020-01-22,55200,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,"autauga, Alabama",0.0,0.0
69441,84001001,county,1001,autauga,Alabama,2020-01-23,55200,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,"autauga, Alabama",0.0,0.0
69442,84001001,county,1001,autauga,Alabama,2020-01-24,55200,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,"autauga, Alabama",0.0,0.0
69443,84001001,county,1001,autauga,Alabama,2020-01-25,55200,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,"autauga, Alabama",0.0,0.0
69444,84001001,county,1001,autauga,Alabama,2020-01-26,55200,0,0.0,0,0.0,0,0,0.0,0.0,0.0,0.0,"autauga, Alabama",0.0,0.0


In [6]:
# Preview the prepared policy table, including the derived 'full_policy' column.
policy_data_prepped.head()

Unnamed: 0,state,policy_level,date,policy_type,start_stop,county,fips_code,full_policy
1,Mississippi,county,2020-07-20,outdoor and recreation,stop,sunflower,28133,outdoor and recreation - stop - county
3,Missouri,state,2020-06-15,non-essential businesses,stop,statewide,29,non-essential businesses - stop - state
5,Georgia,county,2020-04-30,childcare,stop,fulton,13121,childcare - stop - county
6,Missouri,county,2020-05-31,entertainment,stop,jackson,29095,entertainment - stop - county
7,Delaware,state,2020-06-01,gyms,stop,statewide,10,gyms - stop - state


In [7]:
# Show one example 'full_policy' label (the value in the first row).
list(policy_data_prepped['full_policy'].values)[0]

'outdoor and recreation - stop - county'

In [8]:
# Display the case table; the notebook renders only its first and last rows.
case_data

Unnamed: 0,uid,location_type,fips_code,county,state,date,total_population,cumulative_cases,cumulative_cases_1e6,cumulative_deaths,cumulative_deaths_1e6,new_cases,new_deaths,new_cases_1e6,new_deaths_1e6,new_cases_7day,new_deaths_7day,full_loc_name,new_cases_7day_1e6,new_deaths_7day_1e6
69440,84001001,county,1001,autauga,Alabama,2020-01-22,55200,0,0.00,0,0.00,0,0,0.00,0.0,0.00,0.0,"autauga, Alabama",0.000000,0.0
69441,84001001,county,1001,autauga,Alabama,2020-01-23,55200,0,0.00,0,0.00,0,0,0.00,0.0,0.00,0.0,"autauga, Alabama",0.000000,0.0
69442,84001001,county,1001,autauga,Alabama,2020-01-24,55200,0,0.00,0,0.00,0,0,0.00,0.0,0.00,0.0,"autauga, Alabama",0.000000,0.0
69443,84001001,county,1001,autauga,Alabama,2020-01-25,55200,0,0.00,0,0.00,0,0,0.00,0.0,0.00,0.0,"autauga, Alabama",0.000000,0.0
69444,84001001,county,1001,autauga,Alabama,2020-01-26,55200,0,0.00,0,0.00,0,0,0.00,0.0,0.00,0.0,"autauga, Alabama",0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2798268,84056045,county,56045,weston,Wyoming,2021-12-26,7100,1246,17549.30,14,197.18,0,0,0.00,0.0,1.43,0.0,"weston, Wyoming",20.140845,0.0
2798269,84056045,county,56045,weston,Wyoming,2021-12-27,7100,1247,17563.38,14,197.18,1,0,14.08,0.0,0.86,0.0,"weston, Wyoming",12.112676,0.0
2798270,84056045,county,56045,weston,Wyoming,2021-12-28,7100,1249,17591.55,14,197.18,2,0,28.17,0.0,0.86,0.0,"weston, Wyoming",12.112676,0.0
2798271,84056045,county,56045,weston,Wyoming,2021-12-29,7100,1252,17633.80,14,197.18,3,0,42.25,0.0,1.29,0.0,"weston, Wyoming",18.169014,0.0


In [9]:
# Check the element type of the 'date' column's underlying array.
type(case_data['date'].values[0])

numpy.datetime64

## 1.1 Generate dataset for a single policy

In [36]:
# Example inputs: one 'full_policy' label and a list of (start, stop) day-offset
# bins counted from the policy date.
# NOTE(review): the prepare_data call further down passes its own literal
# values rather than these two variables.
policy_name='outdoor and recreation - stop - county'
bins_list = [(0, 5), (6, 999)]

import os

def prepare_new_df(case_data):
    """Build the base frame holding the 'info' columns of the case data.

    Copies location_type, state, county, date and new_cases_1e6 out of
    `case_data` into a new DataFrame whose columns are a two-level
    MultiIndex of the form ('info', <field>).
    """

    # Single source of truth for the fields carried over from case_data.
    fields = ['location_type', 'state', 'county', 'date', 'new_cases_1e6']
    info_keys = [('info', field) for field in fields]

    # Start from an empty frame that already has the MultiIndex columns,
    # then fill every column in one assignment.
    new_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(info_keys))
    new_df[info_keys] = case_data[fields]

    return new_df

def prepare_data(case_data,
                 policy_data_prepped,
                 policy_name,
                 bins_list,
                 save_path = "./data/single_policy_bins/",
                 save_data = True,
                 force_rerun = False,
                 pbar = True,
                 new_df = None):
    """Build (or reload from disk) the indicator dataset for one policy.

    For every row of `policy_data_prepped` whose 'full_policy' equals
    `policy_name`, and for every bin in `bins_list`, the matching rows of the
    case frame get a 1 in the column (policy_name, "<start>-<stop>"). All
    other entries of those columns are 0.

    Parameters
    ----------
    case_data : DataFrame
        Case table; only used to build the base frame when `new_df` is None.
    policy_data_prepped : DataFrame
        Policy table with the columns 'full_policy', 'date', 'state',
        'county' and 'policy_level'.
    policy_name : str
        The 'full_policy' label to generate the dataset for.
    bins_list : list of (int, int)
        Inclusive (start, stop) day offsets relative to the policy date.
    save_path : str
        Directory the CSV is cached in.
    save_data : bool
        Write the result to `save_path` when True.
    force_rerun : bool
        When False and a cached CSV exists, it is loaded and returned instead
        of recomputing.
    pbar : bool
        Show a tqdm progress bar over the policy rows.
    new_df : DataFrame, optional
        Pre-built base frame (as returned by `prepare_new_df`). It is copied,
        never modified, so the same frame can be reused across policies.

    Returns
    -------
    DataFrame
        The 'info' columns plus one 0/1 column per bin.
    """

    def bin_label(date_bin):
        """Second-level column label of a bin, e.g. (0, 5) -> "0-5"."""
        return str(date_bin[0]) + "-" + str(date_bin[1])

    def get_date_range(date, start_move=0, stop_move=7):
        """Get the date range from date+start_move to date+stop_move"""

        return pd.date_range(start=date+timedelta(days=start_move),
                             end=date+timedelta(days=stop_move))

    ### reload the dataframe from file if applicable
    filename = (policy_name.replace(" - ", "_")
                + "-bins=" + "_".join(bin_label(date_bin) for date_bin in bins_list)
                + ".csv")
    # os.path.join also works when save_path has no trailing separator.
    file_path = os.path.join(save_path, filename)

    if not force_rerun and os.path.exists(file_path):
        cached_df = pd.read_csv(file_path, index_col=0, header=[0, 1])
        cached_df[('info', 'date')] = pd.to_datetime(cached_df[('info', 'date')], format='%Y-%m-%d')
        return cached_df

    ### initialize the new dataframe
    if new_df is None:
        new_df = prepare_new_df(case_data)
    else:
        # Work on a copy so the caller's base frame stays reusable.
        new_df = new_df.copy()

    # Add the indicator columns directly, initialised to 0. Only these columns
    # are filled, so missing values in the 'info' columns are left untouched
    # (a frame-wide fillna(0) would overwrite them).
    labels = [bin_label(date_bin) for date_bin in bins_list]
    for label in labels:
        new_df[(policy_name, label)] = 0

    policy_data_filtered = policy_data_prepped[policy_data_prepped['full_policy']==policy_name]

    # generate dataframe
    policy_rows = policy_data_filtered.to_dict('records')
    row_iter = tqdm(policy_rows) if pbar else policy_rows

    dates = new_df[('info', 'date')]
    states = new_df[('info', 'state')]
    counties = new_df[('info', 'county')]

    for row in row_iter:
        # The location mask does not depend on the bin, so compute it once per
        # policy row. State-level policies apply to every county of the state.
        in_location = states == row['state']
        if row['policy_level'] != 'state':
            in_location = in_location & (counties == row['county'])

        for date_bin, label in zip(bins_list, labels):
            in_bin = dates.isin(get_date_range(row['date'], date_bin[0], date_bin[1]))
            new_df.loc[in_location & in_bin, (policy_name, label)] = 1

    if save_data:
        if save_path:
            os.makedirs(save_path, exist_ok=True)
        new_df.to_csv(file_path)
    return new_df

In [37]:
# Build the dataset for one policy with three bins (days 0-10, 11-20, 21-999
# after the policy date). force_rerun=True skips the cached-CSV check, so the
# frame is always recomputed.
new_df = prepare_data(
    case_data = case_data,
    policy_data_prepped = policy_data_prepped,
    policy_name = 'outdoor and recreation - stop - county',
    bins_list = [(0, 10), (11, 20), (21, 999)],
    pbar = True,
    force_rerun = True,
)

  0%|          | 0/24 [00:00<?, ?it/s]

## 1.2 Generate datasets for a group of policies

In [41]:
def generate_dataset_group(bins_list,
                           policy_dict,
                           min_samples=3):
    """Generate the per-policy dataset for every policy, using one set of bins.

    Parameters
    ----------
    bins_list : list of (int, int)
        Day-offset bins, passed unchanged to `prepare_data` for each policy.
    policy_dict : dict
        Policy-type mapping handed to `prep_policy_data`.
    min_samples : int, default 3
        Cut-off handed to `prep_policy_data`.

    Returns
    -------
    None
        The per-policy frames returned by `prepare_data` are not collected;
        `prepare_data` runs with its default save settings.
    """

    cases = clean_covid_data()
    policies = prep_policy_data(policy_data=clean_policy_data(),
                                policy_dict=policy_dict,
                                min_samples=min_samples)

    # Build the base frame once and hand it to every prepare_data call.
    base_frame = prepare_new_df(cases)
    policy_names = policies['full_policy'].unique()

    for name in tqdm(policy_names, desc='generating datasets for policies'):
        prepare_data(case_data=cases,
                     policy_data_prepped=policies,
                     policy_name=name,
                     bins_list=bins_list,
                     pbar=False,
                     new_df=base_frame)

In [42]:
# Generate the datasets for all policies with two bins (days 0-3 and 4-999).
# NOTE(review): the recorded output below shows this run ending in a KeyboardInterrupt.
generate_dataset_group(bins_list=[(0, 3), (4, 999)], 
                       policy_dict=policy_dict,
                       min_samples=3)

generating datasets for policies:   0%|          | 0/50 [00:00<?, ?it/s]

KeyboardInterrupt: 

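Runtime notes:

- v1: ~30 min.
- Converting the policy rows to a list of dicts changed the runtime very little; next, trying to optimize at a lower layer.
- Abstracting the preparation of the new DataFrame (`prepare_new_df`) brought the runtime down to ~20 minutes.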