# Develop data preparation all policies 20230116

In the next version of the project, we will try modifying the data preprocessing steps. 

In the last version, we got the preprocessing pipeline working with one kind of policy, now we want to work with an arbitrary set of policies

In [20]:
from covid_project.data_utils import clean_covid_data, clean_policy_data, \
    prep_policy_data, prepare_new_df, prepare_data, load_policy_data, prepare_new_df, get_date_range

import os
import pandas as pd
from tabulate import tabulate
from sodapy import Socrata
from tqdm import tqdm
import numpy as np
from datetime import timedelta
from datetime import datetime
import us
import re
#from tqdm import tqdm
from tqdm.notebook import tqdm
from typing import Tuple, Union, List


In [3]:
case_data = clean_covid_data()

In [4]:
policy_data = clean_policy_data()

In [5]:
policy_dict = {
    'aca special enrollment period'            : 'medical', 
    'agriculture'                              : 'agriculture',
    'allow audio only telehealth'              : 'medical', 
    'alternative care facilities'              : 'medical', 
    'bars'                                     : 'entertainement', 
    'childcare (k-12)'                         : 'childcare', 
    'colleges & universities'                  : 'education',
    'construction'                             : 'construction', 
    'day camps/overnight camps'                : 'camps', 
    'day care'                                 : 'childcare',
    'education'                                : 'education', 
    'election'                                 : 'election', 
    'entertainment'                            : 'entertainment', 
    'executive order'                          : 'executive order',
    'expand medicaid telehealth coverage'      : 'medical', 
    'food and drink'                           : 'entertainment',
    'froze mortgage payments'                  : 'housing', 
    'gatherings'                               : 'gatherings', 
    'grace period / security deposit for rent' : 'housing', 
    'graduation'                               : 'graduation',
    'graduation ceremony guidelines'           : 'graduation', 
    'gyms'                                     : 'gyms', 
    'health risk status'                       : 'medical',
    'higher education'                         : 'education', 
    'houses of worship'                        : 'houses of worship', 
    'libraries'                                : 'education',
    'mandate face masks in businesses'         : 'mask mandate',
    'mandate face masks in public spaces'      : 'mask mandate', 
    'manufacturing'                            : 'manufacturing',
    'mask requirement' : 'mask mandate', 
    'medical' : 'medical', 
    'modify medicaid requirements' : 'medical',
    'museums' : 'education', 
    'non-essential businesses' : 'non-essential businesses', 
    'nursing home visitations' : 'nursing homes',
    'nursing homes' : 'nursing homes', 
    'order freezing utility shut offs' : 'housing',
    'outdoor and recreation' : 'outdoor and recreation', 
    'personal care' : 'personal care', 
    'public gatherings' : 'gatherings',
    'public health advisory system' : 'medical', 
    'quarantine' : 'shelter in place',
    'residential overnight camps' : 'camps',
    'resumed elective medical procedures' : 'medical', 
    'shelter in place' : 'shelter in place',
    'state of emergency' : 'state of emergency', 
    'state of emergency/funds' : 'state of emergency',
    'stop enforcement of evictions' : 'housing', 
    'stop initiation of evictions' : 'housing',
    'suspend elective dental procedures' : 'medical', 
    'training jobs' : 'training jobs', 
    'travel' : 'travel',
    'traveler from out of state' : 'travel', 
    'updated guidelines' : 'updated guidelines',
    'wholesale trade' : 'wholesale trade',
}

min_samples = 3
policy_data_prepped = prep_policy_data(policy_data=policy_data,
                                           policy_dict=policy_dict,
                                           min_samples=min_samples)

In [6]:
policy_data_prepped

Unnamed: 0,state,policy_level,date,policy_type,start_stop,county,fips_code,full_policy
1,Mississippi,county,2020-07-20,outdoor and recreation,stop,sunflower,28133,outdoor and recreation - stop - county
3,Missouri,state,2020-06-15,non-essential businesses,stop,statewide,29,non-essential businesses - stop - state
5,Georgia,county,2020-04-30,childcare,stop,fulton,13121,childcare - stop - county
6,Missouri,county,2020-05-31,entertainment,stop,jackson,29095,entertainment - stop - county
7,Missouri,state,2021-08-31,mask mandate,stop,statewide,29,mask mandate - stop - state
...,...,...,...,...,...,...,...,...
4213,Illinois,county,2021-01-15,houses of worship,start,cass,17017,houses of worship - start - county
4214,New York,county,2020-06-08,non-essential businesses,start,new york,36061,non-essential businesses - start - county
4215,Texas,county,2020-07-03,mask mandate,start,morris,48343,mask mandate - start - county
4216,Maine,county,2020-05-18,outdoor and recreation,start,piscataquis,23021,outdoor and recreation - start - county


# Fixing Clean policy data

In [5]:
import os
import pandas as pd
from tabulate import tabulate
from sodapy import Socrata
from tqdm import tqdm
import numpy as np
from datetime import timedelta
from datetime import datetime
import us
import re
#from tqdm import tqdm
from tqdm.notebook import tqdm
from typing import Tuple, Union

df = None
cleaned_timeseries_path = "./data/covid_timeseries_cleaned.csv"
path = "./data/covid_policies.csv"
clean_path = "./data/covid_policies_cleaned.csv"
force_reload = False
force_reclean = False

 # get covid policy data
if df is None:
    if os.path.exists(clean_path) and not force_reclean:
        df = pd.read_csv(clean_path, index_col=0)
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
        #return df
    
    df = load_policy_data(path, force_reload)

# remove irrelevant columns
df = df.drop(['geocoded_state', 'comments', 'source', 'total_phases'], axis=1)
# get covid timeseries data
timeseries_df = clean_covid_data(
    df=None,
    clean_path = cleaned_timeseries_path
)

# clean up state names
abbr = [elem.abbr for elem in us.states.STATES]
df = df.drop(df[~df['state_id'].isin(abbr)].index)
df.replace(to_replace=us.states.mapping('abbr', 'name'), inplace=True)
df.rename(columns={'state_id': 'state'}, inplace=True)

### county
# convert nulls in count to 'statewide'
df.fillna(value={'county': 'statewide'}, inplace=True)

# convert to lowercase
df['county'] = df['county'].str.lower()

# address mismatches
county_match = re.compile(" county$")
munici_match = re.compile(" municipality$")
city_match = re.compile(" city$")
Borough_match = re.compile(" borough$")

df['county'].replace(to_replace=county_match, value='', inplace=True)
df['county'].replace(to_replace=munici_match, value='', inplace=True)
df['county'].replace(to_replace=city_match, value='', inplace=True)
df['county'].replace(to_replace=Borough_match, value='', inplace=True)

locs = timeseries_df['county'].unique()
mismatches = [county for county in df['county'][df['county']!='statewide'].unique() 
            if county not in locs]
assert len(mismatches) == 0, f"[ERROR] found mismatches between timeseries and policy dataset: {mismatches}"

# fips code

In [9]:
for index, data in df.iterrows():
    if data.policy_level == 'state':
        print(us.states.lookup(us.states.mapping("name", "abbr").get(data.state)).fips)


10
29
10
29
35
10
28
27
55
56
22
44
49
16
23
48
35
28
27
40
23
18
39
42
50
08
36
32
22
08
02
35
25
49
37
21
18
55
37
41
19
04
30
17
31
56
38
25
12
56
50
04
54
50
27
20
05
22
29
26
16
34
44
48
10
09
08
19
31
21
12
01
40
40
47
13
49
47
44
08
22
41
49
17
29
54
26
16
56
35
56
41
10
39
18
31
18
15
37
37
25
28
01
13
09
37
08
13
04
54
19
36
39
55
18
27
01
31
20
17
29
23
08
33
56
01
16
49
16
15
24
41
29
37
48
29
16
19
18
21
16
19
13
53
48
30
51
08
28
28
19
13
19
26
27
29
50
36
51
24
18
23
56
18
22
16
06
01
22
28
30
28
28
39
40
05
31
08
02
31
48
22
04
55
45
26
39
29
21
31
34
49
02
25
39
31
18
55
35
01
09
04
19
20
19
41
06
33
15
13
19
22
16
44
37
19
20
26
20
55
28
44
08
32
47
18
23
22
37
26
33
28
20
27
04
19
22
22
31
35
41
27
53
16
47
13
26
08
37
49
09
04
49
27
27
37
51
56
50
13
49
49
13
22
22
26
20
06
31
01
04
35
31
48
49
15
35
31
50
21
28
21
09
17
09
01
29
55
17
35
08
18
56
44
08
20
02
01
26
31
35
44
28
18
50
22
48
30
56
35
44
27
30
30
08
21
17
39
50
29
19
35
12
22
19
53
53
37
47
21
45
46
28
3

In [15]:
us.states.lookup(us.states.mapping('name', 'abbr').get("Delaware"))

<State:Delaware>

In [4]:
policy_data

Unnamed: 0,state,policy_level,date,policy_type,start_stop,county,fips_code
1,Mississippi,county,2020-07-20,outdoor and recreation,stop,sunflower,28133
3,Missouri,state,2020-06-15,non-essential businesses,stop,statewide,29
5,Georgia,county,2020-04-30,childcare (k-12),stop,fulton,13121
6,Missouri,county,2020-05-31,entertainment,stop,jackson,29095
7,Missouri,state,2021-08-31,mask requirement,stop,statewide,29
...,...,...,...,...,...,...,...
4213,Illinois,county,2021-01-15,houses of worship,start,cass,17017
4214,New York,county,2020-06-08,non-essential businesses,start,new york,36061
4215,Texas,county,2020-07-03,mask requirement,start,morris,48343
4216,Maine,county,2020-05-18,outdoor and recreation,start,piscataquis,23021


NameError: name 'policy_dict' is not defined

# digging into the prepare_data function

In [12]:
bins_list = [(0, 4), (5, 999)]


policies: str = None
file_id: Union[str, None] = "test"
save_path: str = "./data/single_policy_bins/"
save_data: bool = True
force_rerun: bool = False
pbar: bool = True
new_df: Union[None, pd.DataFrame] = None

In [15]:
### reload the dataframe from file if applicable
if file_id is None and isinstance(policies, str):
    file_id = policies
elif file_id is None and not isinstance(policies, str):
    raise ValueError("if passing multiple policies, you must pass a file id")

filename = file_id.replace(" - ", "_") +\
            "-bins=" + ''.join([str(b[0])+"-"+str(b[1])+"_" for b in bins_list])[:-1] + ".csv"

if not force_rerun and os.path.exists(save_path + filename):
    new_df = pd.read_csv(save_path + filename, index_col=0, header=[0, 1])
    new_df[('info', 'date')] = pd.to_datetime(new_df[('info', 'date')], format='%Y-%m-%d')
    #return new_df

### initialize the new dataframe
if new_df is None:
    new_df = prepare_new_df(case_data)

# 3 possible cases for policies:
# 1) None (use all policies)
# 2) str (use this specific policy)
# 3) List (use the given list of policies)

if policies is None:
    policies = policy_data_prepped['full_policy'].unique()
elif isinstance(policies, str):
    policies = [policies]

tuples_policies = [ (p, (str(date_range[0]) + "-" + str(date_range[1])))
                        for p in policies
                        for date_range in bins_list]

cols_polices = pd.MultiIndex.from_tuples(tuples_policies)
policies_df = pd.DataFrame(columns=cols_polices)
new_df = pd.concat([new_df, policies_df])
new_df = new_df.fillna(0)
policy_data_filtered = policy_data_prepped[policy_data_prepped['full_policy'].isin(policies)]

# generate dataframe
df_dict = policy_data_filtered.to_dict('records')

In [17]:
policies

array(['outdoor and recreation - stop - county',
       'non-essential businesses - stop - state',
       'childcare - stop - county', 'entertainment - stop - county',
       'mask mandate - stop - state',
       'non-essential businesses - stop - county',
       'personal care - stop - state', 'shelter in place - stop - county',
       'gyms - stop - state', 'outdoor and recreation - stop - state',
       'houses of worship - stop - state',
       'houses of worship - stop - county', 'childcare - stop - state',
       'shelter in place - stop - state', 'entertainment - stop - state',
       'manufacturing - stop - state', 'manufacturing - stop - county',
       'mask mandate - stop - county', 'personal care - stop - county',
       'executive order - stop - state', 'manufacturing - start - state',
       'mask mandate - start - county', 'entertainment - start - county',
       'childcare - start - state', 'childcare - start - county',
       'gyms - start - state', 'houses of worship 

In [16]:
df_dict

[{'state': 'Mississippi',
  'policy_level': 'county',
  'date': Timestamp('2020-07-20 00:00:00'),
  'policy_type': 'outdoor and recreation',
  'start_stop': 'stop',
  'county': 'sunflower',
  'fips_code': 28133,
  'full_policy': 'outdoor and recreation - stop - county'},
 {'state': 'Missouri',
  'policy_level': 'state',
  'date': Timestamp('2020-06-15 00:00:00'),
  'policy_type': 'non-essential businesses',
  'start_stop': 'stop',
  'county': 'statewide',
  'fips_code': 29,
  'full_policy': 'non-essential businesses - stop - state'},
 {'state': 'Georgia',
  'policy_level': 'county',
  'date': Timestamp('2020-04-30 00:00:00'),
  'policy_type': 'childcare',
  'start_stop': 'stop',
  'county': 'fulton',
  'fips_code': 13121,
  'full_policy': 'childcare - stop - county'},
 {'state': 'Missouri',
  'policy_level': 'county',
  'date': Timestamp('2020-05-31 00:00:00'),
  'policy_type': 'entertainment',
  'start_stop': 'stop',
  'county': 'jackson',
  'fips_code': 29095,
  'full_policy': 'enter

In [22]:
for row in df_dict:
    for date_bin in bins_list:
        date_range = get_date_range(row['date'], date_bin[0], date_bin[1])
        label = (str(date_bin[0]) + "-" + str(date_bin[1]))
        print(row)
        label = (str(date_bin[0]) + "-" + str(date_bin[1]))
        new_df.loc[(new_df[('info', 'date')].isin(date_range)) &\
                          ((new_df[('info', 'county')] == row['county']) | (row['policy_level'] == 'state')) &\
                           (new_df[('info', 'state')] == row['state']), (policy_name, label)] = 1
    break

{'state': 'Mississippi', 'policy_level': 'county', 'date': Timestamp('2020-07-20 00:00:00'), 'policy_type': 'outdoor and recreation', 'start_stop': 'stop', 'county': 'sunflower', 'fips_code': 28133, 'full_policy': 'outdoor and recreation - stop - county'}


NameError: name 'policy_name' is not defined

In [23]:
new_df.loc[(new_df[('info', 'date')].isin(date_range)) &\
                          ((new_df[('info', 'county')] == row['county']) | (row['policy_level'] == 'state')) &\
                           (new_df[('info', 'state')] == row['state']), (policy_name, label)]

Unnamed: 0_level_0,info,info,info,info,info,info,info,info,outdoor and recreation - stop - county,non-essential businesses - stop - state,...,shelter in place - start - state,nursing homes - start - state,manufacturing - start - county,personal care - start - state,travel - start - state,gatherings - start - state,education - start - state,personal care - start - county,construction - start - county,executive order - start - state
Unnamed: 0_level_1,location_type,state,county,date,new_cases_1e6,new_deaths_1e6,new_cases_7day_1e6,new_deaths_7day_1e6,0-4,0-4,...,5-999,5-999,5-999,5-999,5-999,5-999,5-999,5-999,5-999,5-999
87200,county,Alabama,autauga,2020-01-22,0.00,0.0,0.000000,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
87201,county,Alabama,autauga,2020-01-23,0.00,0.0,0.000000,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
87202,county,Alabama,autauga,2020-01-24,0.00,0.0,0.000000,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
87203,county,Alabama,autauga,2020-01-25,0.00,0.0,0.000000,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
87204,county,Alabama,autauga,2020-01-26,0.00,0.0,0.000000,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513774,county,Wyoming,weston,2021-12-26,0.00,0.0,20.140845,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3513775,county,Wyoming,weston,2021-12-27,14.08,0.0,12.112676,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3513776,county,Wyoming,weston,2021-12-28,28.17,0.0,12.112676,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
3513777,county,Wyoming,weston,2021-12-29,42.25,0.0,18.169014,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
