# Project 5: Mobility and States Regulation Dataframe

### Imports

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

### Mobility Data Frame

Notes:
- Nulls are data "*that doesn't meet quality and privacy thresholds for every day*"
- Source: https://www.google.com/covid19/mobility/ 

In [2]:
us_mobility = pd.read_csv("../data/2021_US_Region_Mobility_Report.csv")

In [3]:
us_mobility.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,metro_area,iso_3166_2_code,census_fips_code,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2021-01-01,-47.0,-37.0,-21.0,-62.0,-75.0,27.0
1,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2021-01-02,-26.0,-13.0,-14.0,-33.0,-21.0,9.0
2,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2021-01-03,-27.0,-16.0,-24.0,-33.0,-17.0,7.0
3,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2021-01-04,-19.0,-8.0,-23.0,-39.0,-34.0,12.0
4,US,United States,,,,,,ChIJCzYy5IS16lQRQrfeQ5K5Oxw,2021-01-05,-20.0,-6.0,-20.0,-39.0,-34.0,12.0


In [4]:
us_mobility.isnull().sum()

country_region_code                                        0
country_region                                             0
sub_region_1                                             133
sub_region_2                                            6916
metro_area                                            346819
iso_3166_2_code                                       340036
census_fips_code                                        6783
place_id                                                   0
date                                                       0
retail_and_recreation_percent_change_from_baseline    124330
grocery_and_pharmacy_percent_change_from_baseline     148395
parks_percent_change_from_baseline                    252363
transit_stations_percent_change_from_baseline         211489
workplaces_percent_change_from_baseline                 4196
residential_percent_change_from_baseline              132605
dtype: int64

In [5]:
us_mobility.drop(columns=["metro_area",
                         "iso_3166_2_code",
                         "census_fips_code",
                         "place_id"],
                inplace = True)

In [6]:
us_mobility["date"] = pd.to_datetime(us_mobility["date"])

#### US: Country

In [7]:
us_country_mobility = us_mobility[(us_mobility["sub_region_1"].isna()) &
                                  (us_mobility["sub_region_2"].isna())]

us_country_mobility.drop(columns = ["sub_region_1",
                                   "sub_region_2"],
                        inplace = True)

us_country_mobility.reset_index(inplace = True, drop = True)
us_country_mobility = us_country_mobility[us_country_mobility["date"] >= "2020-01-01"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
us_country_mobility.isnull().sum()

country_region_code                                   0
country_region                                        0
date                                                  0
retail_and_recreation_percent_change_from_baseline    0
grocery_and_pharmacy_percent_change_from_baseline     0
parks_percent_change_from_baseline                    0
transit_stations_percent_change_from_baseline         0
workplaces_percent_change_from_baseline               0
residential_percent_change_from_baseline              0
dtype: int64

#### US: States

In [9]:
us_states_mobility = us_mobility[(us_mobility["sub_region_1"].isna() == False) & 
                                 us_mobility["sub_region_2"].isna()]

us_states_mobility.drop(columns = "sub_region_2",
                       inplace = True)

us_states_mobility.reset_index(inplace = True, drop = True)
us_states_mobility = us_states_mobility[us_states_mobility["date"] >= "2021-01-01"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
us_states_mobility.isnull().sum()

country_region_code                                    0
country_region                                         0
sub_region_1                                           0
date                                                   0
retail_and_recreation_percent_change_from_baseline     0
grocery_and_pharmacy_percent_change_from_baseline      0
parks_percent_change_from_baseline                    19
transit_stations_percent_change_from_baseline          0
workplaces_percent_change_from_baseline                0
residential_percent_change_from_baseline               0
dtype: int64

In [11]:
us_states_mobility.shape

(6783, 10)

In [12]:
us_states_mobility = pd.merge(us_states_mobility,
              us_country_mobility[["date","parks_percent_change_from_baseline"]],
              on=["date"],
              how='left',
              suffixes=["","_x"]
             )

In [13]:
us_states_mobility["parks_percent_change_from_baseline"].fillna(us_states_mobility["parks_percent_change_from_baseline_x"],
                                             inplace = True)
us_states_mobility.drop(columns = ["parks_percent_change_from_baseline_x"],
                       inplace = True)

In [14]:
us_states_mobility.isnull().sum()

country_region_code                                   0
country_region                                        0
sub_region_1                                          0
date                                                  0
retail_and_recreation_percent_change_from_baseline    0
grocery_and_pharmacy_percent_change_from_baseline     0
parks_percent_change_from_baseline                    0
transit_stations_percent_change_from_baseline         0
workplaces_percent_change_from_baseline               0
residential_percent_change_from_baseline              0
dtype: int64

In [15]:
us_states_mobility.shape

(6783, 10)

#### US: County

In [16]:
us_county_mobility = us_mobility[us_mobility["sub_region_2"].isna() == False]

us_county_mobility.reset_index(inplace = True, drop = True)

In [17]:
us_county_mobility.head()

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,US,United States,Alabama,Autauga County,2021-01-01,-46.0,-28.0,,,-77.0,28.0
1,US,United States,Alabama,Autauga County,2021-01-02,-10.0,-5.0,,,-20.0,7.0
2,US,United States,Alabama,Autauga County,2021-01-03,-12.0,-3.0,-10.0,,-14.0,6.0
3,US,United States,Alabama,Autauga County,2021-01-04,4.0,2.0,,,-26.0,8.0
4,US,United States,Alabama,Autauga County,2021-01-05,2.0,3.0,,,-22.0,8.0


In [18]:
us_county_mobility.isnull().sum()

country_region_code                                        0
country_region                                             0
sub_region_1                                               0
sub_region_2                                               0
date                                                       0
retail_and_recreation_percent_change_from_baseline    124330
grocery_and_pharmacy_percent_change_from_baseline     148395
parks_percent_change_from_baseline                    252344
transit_stations_percent_change_from_baseline         211489
workplaces_percent_change_from_baseline                 4196
residential_percent_change_from_baseline              132605
dtype: int64

In [19]:
us_county_mobility.shape

(339903, 11)

In [20]:
col_with_nulls = ['retail_and_recreation_percent_change_from_baseline',
                  'grocery_and_pharmacy_percent_change_from_baseline',
                  'parks_percent_change_from_baseline',
                  'transit_stations_percent_change_from_baseline',
                  'workplaces_percent_change_from_baseline',
                  'residential_percent_change_from_baseline']

In [21]:
us_county_mobility = pd.merge(us_county_mobility,
              us_states_mobility[["sub_region_1","date"]+col_with_nulls],
              on=["date","sub_region_1"],
              how='left',
              suffixes=["","_x"]
             )

In [22]:
col_to_replace = ['retail_and_recreation_percent_change_from_baseline_x',
                  'grocery_and_pharmacy_percent_change_from_baseline_x',
                  'parks_percent_change_from_baseline_x',
                  'transit_stations_percent_change_from_baseline_x',
                  'workplaces_percent_change_from_baseline_x',
                  'residential_percent_change_from_baseline_x']

In [23]:
for i in range(len(col_with_nulls)):
    us_county_mobility[col_with_nulls[i]].fillna(us_county_mobility[col_to_replace[i]],
                                                 inplace = True)
    us_county_mobility.drop(columns = [col_to_replace[i]],
                           inplace = True)

In [24]:
us_county_mobility.isnull().sum()

country_region_code                                   0
country_region                                        0
sub_region_1                                          0
sub_region_2                                          0
date                                                  0
retail_and_recreation_percent_change_from_baseline    0
grocery_and_pharmacy_percent_change_from_baseline     0
parks_percent_change_from_baseline                    0
transit_stations_percent_change_from_baseline         0
workplaces_percent_change_from_baseline               0
residential_percent_change_from_baseline              0
dtype: int64

In [25]:
us_county_mobility.shape

(339903, 11)

### State Regulation Data Frame

Notes:
- Data are state level
- Source: https://github.com/OxCGRT/USA-covid-policy

In [26]:
excel_regulation = pd.ExcelFile('../data/OxCGRTUS_timeseries_all.xlsx')

In [27]:
excel_regulation.sheet_names

['stringency_index',
 'government_response_index',
 'containment_health_index',
 'economic_support_index',
 'c1_school_closing',
 'c1_flag',
 'c2_workplace_closing',
 'c2_flag',
 'c3_cancel_public_events',
 'c3_flag',
 'c4_restrictions_on_gatherings',
 'c4_flag',
 'c5_close_public_transport',
 'c5_flag',
 'c6_stay_at_home_requirements',
 'c6_flag',
 'c7_movementrestrictions',
 'c7_flag',
 'c8_internationaltravel',
 'e1_income_support',
 'e1_flag',
 'e2_debtrelief',
 'h1_public_information_campaigns',
 'h1_flag',
 'h2_testing_policy',
 'h3_contact_tracing',
 'h6_facial_coverings',
 'h6_flag',
 'h7_vaccination_policy',
 'h7_flag',
 'h8_protection_of_elderly_people',
 'h8_flag',
 'confirmed_cases',
 'confirmed_deaths']

In [28]:
sheets = [sheet_name for sheet_name in excel_regulation.sheet_names if sheet_name not in ["confirmed_cases","confirmed_deaths"]]

for sheet in sheets:
    df = excel_regulation.parse(sheet)
    df = df.melt(id_vars=["country_code",
                         "country_name",
                         "region_code",
                         "region_name",
                         "jurisdiction"], 
                 var_name="date", 
                 value_name=sheet)
    df["date"] = df["date"].map(lambda x: datetime.strptime(x,"%d%b%Y").strftime('%Y/%m/%d'))
    df["date"] = pd.to_datetime(df["date"])
    df = df[df["date"] >= "2021-01-01"]
    df["region_name"] = df["region_name"].map(lambda x: x if x != "Washington DC" else "District of Columbia")
    df = df[["date","region_name",sheet]]
    locals()[sheet] = df

In [29]:
sheets_df = [stringency_index,
            government_response_index,
            containment_health_index,
            economic_support_index,
            c1_school_closing,
            c1_flag,
            c2_workplace_closing,
            c2_flag,
            c3_cancel_public_events,
            c3_flag,
            c4_restrictions_on_gatherings,
            c4_flag,
            c5_close_public_transport,
            c5_flag,
            c6_stay_at_home_requirements,
            c6_flag,
            c7_movementrestrictions,
            c7_flag,
            c8_internationaltravel,
            e1_income_support,
            e1_flag,
            e2_debtrelief,
            h1_public_information_campaigns,
            h1_flag,
            h2_testing_policy,
            h3_contact_tracing,
            h6_facial_coverings,
            h6_flag,
            h7_vaccination_policy,
            h7_flag,
            h8_protection_of_elderly_people,
            h8_flag]

In [30]:
for i, df in enumerate(sheets_df):
    print(sheets[i])
    print(df.isnull().sum())
    print(df.shape)
    print("\n")

stringency_index
date                  0
region_name           0
stringency_index    217
dtype: int64
(6018, 3)


government_response_index
date                         0
region_name                  0
government_response_index    0
dtype: int64
(6018, 3)


containment_health_index
date                          0
region_name                   0
containment_health_index    218
dtype: int64
(6018, 3)


economic_support_index
date                        0
region_name                 0
economic_support_index    230
dtype: int64
(6018, 3)


c1_school_closing
date                   0
region_name            0
c1_school_closing    191
dtype: int64
(6018, 3)


c1_flag
date             0
region_name      0
c1_flag        244
dtype: int64
(6018, 3)


c2_workplace_closing
date                      0
region_name               0
c2_workplace_closing    211
dtype: int64
(6018, 3)


c2_flag
date             0
region_name      0
c2_flag        211
dtype: int64
(6018, 3)


c3_cancel_public_events
date  

In [31]:
for df in sheets_df:
    df.fillna(method='ffill', inplace = True)

In [32]:
for i, df in enumerate(sheets_df):
    print(sheets[i])
    print(df.isnull().sum())
    print(df.shape)
    print("\n")

stringency_index
date                0
region_name         0
stringency_index    0
dtype: int64
(6018, 3)


government_response_index
date                         0
region_name                  0
government_response_index    0
dtype: int64
(6018, 3)


containment_health_index
date                        0
region_name                 0
containment_health_index    0
dtype: int64
(6018, 3)


economic_support_index
date                      0
region_name               0
economic_support_index    0
dtype: int64
(6018, 3)


c1_school_closing
date                 0
region_name          0
c1_school_closing    0
dtype: int64
(6018, 3)


c1_flag
date           0
region_name    0
c1_flag        0
dtype: int64
(6018, 3)


c2_workplace_closing
date                    0
region_name             0
c2_workplace_closing    0
dtype: int64
(6018, 3)


c2_flag
date           0
region_name    0
c2_flag        0
dtype: int64
(6018, 3)


c3_cancel_public_events
date                       0
region_name        

In [33]:
for df in sheets_df:
    df.fillna(method='bfill', inplace = True)

In [34]:
for i, df in enumerate(sheets_df):
    print(sheets[i])
    print(df.isnull().sum())
    print(df.shape)
    print("\n")

stringency_index
date                0
region_name         0
stringency_index    0
dtype: int64
(6018, 3)


government_response_index
date                         0
region_name                  0
government_response_index    0
dtype: int64
(6018, 3)


containment_health_index
date                        0
region_name                 0
containment_health_index    0
dtype: int64
(6018, 3)


economic_support_index
date                      0
region_name               0
economic_support_index    0
dtype: int64
(6018, 3)


c1_school_closing
date                 0
region_name          0
c1_school_closing    0
dtype: int64
(6018, 3)


c1_flag
date           0
region_name    0
c1_flag        0
dtype: int64
(6018, 3)


c2_workplace_closing
date                    0
region_name             0
c2_workplace_closing    0
dtype: int64
(6018, 3)


c2_flag
date           0
region_name    0
c2_flag        0
dtype: int64
(6018, 3)


c3_cancel_public_events
date                       0
region_name        

In [35]:
from functools import reduce

total_regulations = reduce(lambda left,right: pd.merge(left,right,on=["date","region_name"]), sheets_df)
total_regulations.head()

Unnamed: 0,date,region_name,stringency_index,government_response_index,containment_health_index,economic_support_index,c1_school_closing,c1_flag,c2_workplace_closing,c2_flag,...,h1_public_information_campaigns,h1_flag,h2_testing_policy,h3_contact_tracing,h6_facial_coverings,h6_flag,h7_vaccination_policy,h7_flag,h8_protection_of_elderly_people,h8_flag
0,2021-01-01,Alaska,61.57,59.27,62.38,37.5,3.0,0.0,1.0,0.0,...,2.0,1.0,3.0,1.0,3.0,0.0,2.0,1.0,2.0,1.0
1,2021-01-01,Alabama,43.52,49.9,51.67,37.5,2.0,0.0,1.0,1.0,...,2.0,1.0,3.0,1.0,3.0,1.0,2.0,1.0,2.0,1.0
2,2021-01-01,Arkansas,56.48,61.25,62.86,50.0,2.0,0.0,2.0,0.0,...,2.0,1.0,2.0,2.0,3.0,1.0,2.0,0.0,3.0,1.0
3,2021-01-01,Arizona,57.41,60.68,60.42,62.5,2.0,0.0,1.0,1.0,...,2.0,1.0,3.0,2.0,3.0,0.0,0.0,0.0,2.0,1.0
4,2021-01-01,California,66.2,73.07,71.01,87.5,3.0,0.0,2.0,0.0,...,2.0,1.0,3.0,2.0,3.0,1.0,2.0,1.0,3.0,0.0


### Joining CDC Counties Name

In [36]:
cdc_county = pd.read_csv("../data/cdc_sample_df.csv")
cdc_state = pd.read_csv("../data/cdc_sample_state_df.csv")

In [37]:
cdc_county.head()

Unnamed: 0,County,FIPS code,County type,CBSA,CBSA type,State Abbreviation,County only
0,"Los Angeles County, CA",6037,Large central metro,"Los Angeles-Long Beach-Anaheim, CA",Metropolitan,CA,Los Angeles County
1,"Maricopa County, AZ",4013,Large central metro,"Phoenix-Mesa-Chandler, AZ",Metropolitan,AZ,Maricopa County
2,"San Bernardino County, CA",6071,Large fringe metro,"Riverside-San Bernardino-Ontario, CA",Metropolitan,CA,San Bernardino County
3,"Riverside County, CA",6065,Large central metro,"Riverside-San Bernardino-Ontario, CA",Metropolitan,CA,Riverside County
4,"Cook County, IL",17031,Large central metro,"Chicago-Naperville-Elgin, IL-IN-WI",Metropolitan,IL,Cook County


In [38]:
cdc_county = cdc_county[cdc_county["State Abbreviation"] != "PR"]

cdc_county.shape

(3192, 7)

In [39]:
cdc_state.head()

Unnamed: 0,State,State Abbreviation,FEMA region,Population,Population as a percent of national population
0,Florida,FL,Region 4,21477737,0.0647
1,Michigan,MI,Region 5,9986857,0.0301
2,Pennsylvania,PA,Region 3,12801989,0.0386
3,New York,NY,Region 2,19453561,0.0586
4,Texas,TX,Region 6,28995881,0.0874


In [40]:
cdc_state["State"].unique()

array(['Florida', 'Michigan', 'Pennsylvania', 'New York', 'Texas',
       'Illinois', 'California', 'North Carolina', 'Colorado',
       'Minnesota', 'Washington', 'Ohio', 'New Jersey', 'Indiana',
       'Georgia', 'Massachusetts', 'Tennessee', 'Oregon', 'Arizona',
       'Virginia', 'Wisconsin', 'Maryland', 'Kentucky', 'Louisiana',
       'Missouri', 'Connecticut', 'Nevada', 'West Virginia', 'Utah',
       'South Carolina', 'Iowa', 'Puerto Rico', 'Maine', 'New Mexico',
       'Arkansas', 'Oklahoma', 'Mississippi', 'New Hampshire', 'Kansas',
       'Nebraska', 'Rhode Island', 'Delaware', 'Idaho', 'Alabama',
       'Montana', 'North Dakota', 'Alaska', 'Hawaii', 'South Dakota',
       'Wyoming', 'Vermont', 'District of Columbia',
       'United States Virgin Islands', 'Guam',
       'Commonwealth of the Northern Mariana Islands', 'American Samoa'],
      dtype=object)

In [41]:
cdc_state = cdc_state[(cdc_state["State"] != "Puerto Rico") &
                      (cdc_state["State"] != "Guam") &
                      (cdc_state["State"] != "United States Virgin Islands") &
                      (cdc_state["State"] != "Commonwealth of the Northern Mariana Islands") &
                      (cdc_state["State"] != "American Samoa")]

cdc_state.shape

(51, 5)

In [42]:
cdc_county = cdc_county[['County',
                        "County only",
                        "State Abbreviation",
                        "FIPS code"] 
                       ]

In [43]:
cdc_county = pd.merge(cdc_county,
                      cdc_state[["State Abbreviation","State"]],
                      on = "State Abbreviation",
                      how = "left"
                     )

cdc_county.drop(columns = "State Abbreviation",
                inplace = True
               )

In [44]:
cdc_county.shape

(3192, 4)

In [45]:
ls = []
for date in set(list(us_country_mobility["date"])):
    data_f = cdc_county.copy()
    data_f["date"] = date
    ls.append(data_f)
    
cdc_county = pd.concat(ls)

In [46]:
cdc_county.reset_index(inplace = True, drop = True)

In [47]:
cdc_county.index

RangeIndex(start=0, stop=424536, step=1)

In [48]:
cdc_county.shape

(424536, 5)

In [49]:
cdc_county = pd.merge(cdc_county,
                      us_county_mobility,
                      left_on=["date","State","County only"],
                      right_on=["date","sub_region_1","sub_region_2"],
                      how="left")

In [50]:
cdc_county.drop(columns = ["country_region_code",
                           "country_region",
                           "sub_region_1",
                           "sub_region_2"],
               inplace = True)

In [51]:
cdc_county.head()

Unnamed: 0,County,County only,FIPS code,State,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
0,"Los Angeles County, CA",Los Angeles County,6037,California,2021-01-29,-43.0,-27.0,-58.0,-57.0,-44.0,21.0
1,"Maricopa County, AZ",Maricopa County,4013,Arizona,2021-01-29,-25.0,-19.0,-45.0,-37.0,-37.0,14.0
2,"San Bernardino County, CA",San Bernardino County,6071,California,2021-01-29,-34.0,-23.0,-57.0,-43.0,-38.0,17.0
3,"Riverside County, CA",Riverside County,6065,California,2021-01-29,-32.0,-22.0,-65.0,-45.0,-40.0,17.0
4,"Cook County, IL",Cook County,17031,Illinois,2021-01-29,-32.0,-14.0,-39.0,-53.0,-38.0,14.0


In [52]:
cdc_county.shape

(424536, 11)

In [53]:
cdc_county.isnull().sum()

County                                                    0
County only                                               0
FIPS code                                                 0
State                                                     0
date                                                      0
retail_and_recreation_percent_change_from_baseline    90752
grocery_and_pharmacy_percent_change_from_baseline     90752
parks_percent_change_from_baseline                    90752
transit_stations_percent_change_from_baseline         90752
workplaces_percent_change_from_baseline               90752
residential_percent_change_from_baseline              90752
dtype: int64

In [54]:
cdc_county = pd.merge(cdc_county,
                      us_states_mobility,
                      left_on=["date","State"],
                      right_on=["date","sub_region_1"],
                      how="left",
                      suffixes=["","_x"]
                     )

In [55]:
cdc_county.head()

Unnamed: 0,County,County only,FIPS code,State,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,country_region_code,country_region,sub_region_1,retail_and_recreation_percent_change_from_baseline_x,grocery_and_pharmacy_percent_change_from_baseline_x,parks_percent_change_from_baseline_x,transit_stations_percent_change_from_baseline_x,workplaces_percent_change_from_baseline_x,residential_percent_change_from_baseline_x
0,"Los Angeles County, CA",Los Angeles County,6037,California,2021-01-29,-43.0,-27.0,-58.0,-57.0,-44.0,21.0,US,United States,California,-38.0,-22.0,-47.0,-56.0,-42.0,18.0
1,"Maricopa County, AZ",Maricopa County,4013,Arizona,2021-01-29,-25.0,-19.0,-45.0,-37.0,-37.0,14.0,US,United States,Arizona,-24.0,-18.0,-43.0,-32.0,-34.0,13.0
2,"San Bernardino County, CA",San Bernardino County,6071,California,2021-01-29,-34.0,-23.0,-57.0,-43.0,-38.0,17.0,US,United States,California,-38.0,-22.0,-47.0,-56.0,-42.0,18.0
3,"Riverside County, CA",Riverside County,6065,California,2021-01-29,-32.0,-22.0,-65.0,-45.0,-40.0,17.0,US,United States,California,-38.0,-22.0,-47.0,-56.0,-42.0,18.0
4,"Cook County, IL",Cook County,17031,Illinois,2021-01-29,-32.0,-14.0,-39.0,-53.0,-38.0,14.0,US,United States,Illinois,-26.0,-11.0,-32.0,-43.0,-32.0,12.0


In [56]:
cdc_county.shape

(424536, 20)

In [57]:
for i in range(len(col_with_nulls)):
    cdc_county[col_with_nulls[i]].fillna(cdc_county[col_to_replace[i]],
                                         inplace = True)
    cdc_county.drop(columns = [col_to_replace[i]],
                    inplace = True)

In [58]:
cdc_county.isnull().sum()

County                                                0
County only                                           0
FIPS code                                             0
State                                                 0
date                                                  0
retail_and_recreation_percent_change_from_baseline    0
grocery_and_pharmacy_percent_change_from_baseline     0
parks_percent_change_from_baseline                    0
transit_stations_percent_change_from_baseline         0
workplaces_percent_change_from_baseline               0
residential_percent_change_from_baseline              0
country_region_code                                   0
country_region                                        0
sub_region_1                                          0
dtype: int64

In [59]:
cdc_county.drop(columns = ["country_region_code",
                           "country_region_code",
                           "country_region",
                           "sub_region_1"],
                inplace = True)

In [60]:
cdc_county.shape

(424536, 11)

In [61]:
cdc_county = pd.merge(cdc_county,
                      total_regulations,
                      left_on=["date","State"],
                      right_on=["date","region_name"],
                      how="left"
                     )

In [62]:
cdc_county.columns

Index(['County', 'County only', 'FIPS code', 'State', 'date',
       'retail_and_recreation_percent_change_from_baseline',
       'grocery_and_pharmacy_percent_change_from_baseline',
       'parks_percent_change_from_baseline',
       'transit_stations_percent_change_from_baseline',
       'workplaces_percent_change_from_baseline',
       'residential_percent_change_from_baseline', 'region_name',
       'stringency_index', 'government_response_index',
       'containment_health_index', 'economic_support_index',
       'c1_school_closing', 'c1_flag', 'c2_workplace_closing', 'c2_flag',
       'c3_cancel_public_events', 'c3_flag', 'c4_restrictions_on_gatherings',
       'c4_flag', 'c5_close_public_transport', 'c5_flag',
       'c6_stay_at_home_requirements', 'c6_flag', 'c7_movementrestrictions',
       'c7_flag', 'c8_internationaltravel', 'e1_income_support', 'e1_flag',
       'e2_debtrelief', 'h1_public_information_campaigns', 'h1_flag',
       'h2_testing_policy', 'h3_contact_tracing', 

In [63]:
cdc_county.drop(columns = ["region_name"],
                inplace = True)

In [64]:
cdc_county.shape

(424536, 43)

In [65]:
cdc_county['month'] = cdc_county['date'].to_numpy().astype('datetime64[M]')

In [66]:
cdc_county.sort_values("date", inplace = True)
cdc_county.reset_index(inplace = True, drop = True)

In [67]:
cdc_county.head()

Unnamed: 0,County,County only,FIPS code,State,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,...,h1_flag,h2_testing_policy,h3_contact_tracing,h6_facial_coverings,h6_flag,h7_vaccination_policy,h7_flag,h8_protection_of_elderly_people,h8_flag,month
0,"Unallocated, IA",Unallocated,19000,Iowa,2021-01-01,-40.0,-36.0,-15.0,-49.0,-75.0,...,1.0,2.0,1.0,3.0,0.0,2.0,1.0,2.0,1.0,2021-01-01
1,"Nodaway County, MO",Nodaway County,29147,Missouri,2021-01-01,-35.0,-46.0,-34.0,-52.0,-74.0,...,0.0,3.0,2.0,3.0,1.0,3.0,1.0,2.0,1.0,2021-01-01
2,"Jackson County, TN",Jackson County,47087,Tennessee,2021-01-01,-35.0,-28.0,8.0,-48.0,-63.0,...,1.0,3.0,1.0,3.0,0.0,2.0,1.0,2.0,1.0,2021-01-01
3,"Shelby County, TX",Shelby County,48419,Texas,2021-01-01,-27.0,-32.0,-36.0,-52.0,-69.0,...,1.0,2.0,1.0,3.0,1.0,2.0,1.0,2.0,1.0,2021-01-01
4,"Gaines County, TX",Gaines County,48165,Texas,2021-01-01,-46.0,-32.0,-36.0,-52.0,-76.0,...,1.0,2.0,1.0,3.0,1.0,2.0,1.0,2.0,1.0,2021-01-01


In [68]:
cdc_county.isnull().sum()

County                                                    0
County only                                               0
FIPS code                                                 0
State                                                     0
date                                                      0
retail_and_recreation_percent_change_from_baseline        0
grocery_and_pharmacy_percent_change_from_baseline         0
parks_percent_change_from_baseline                        0
transit_stations_percent_change_from_baseline             0
workplaces_percent_change_from_baseline                   0
residential_percent_change_from_baseline                  0
stringency_index                                      47880
government_response_index                             47880
containment_health_index                              47880
economic_support_index                                47880
c1_school_closing                                     47880
c1_flag                                 

### Joining Unemployment Data

In [69]:
unemployment = pd.ExcelFile('../data/county_unemployment_feb20_mar21.xlsx')

In [70]:
unemployment.sheet_names

['laucntycur14', 'county_names']

In [71]:
unemployment_df = unemployment.parse("laucntycur14")

In [72]:
unemployment_df.head()

Unnamed: 0,laus,state_code,country_code,county,date,labor_force,employe,unemployed,unemploy_rate
0,CN0100100000000,1,1,"Autauga County, AL",Feb-20,26157,25467,690,2.6
1,CN0100300000000,1,3,"Baldwin County, AL",Feb-20,96660,94213,2447,2.5
2,CN0100500000000,1,5,"Barbour County, AL",Feb-20,8573,8286,287,3.3
3,CN0100700000000,1,7,"Bibb County, AL",Feb-20,8615,8369,246,2.9
4,CN0100900000000,1,9,"Blount County, AL",Feb-20,25141,24529,612,2.4


In [73]:
unemployment_df[unemployment_df["county"].str.contains(",") == False]

Unnamed: 0,laus,state_code,country_code,county,date,labor_force,employe,unemployed,unemploy_rate
319,CN1100100000000,11,1,District of Columbia,Feb-20,425308,404594,20714,4.9
3538,CN1100100000000,11,1,District of Columbia,Mar-20,420428,397484,22944,5.5
6757,CN1100100000000,11,1,District of Columbia,Apr-20,405479,362532,42947,10.6
9976,CN1100100000000,11,1,District of Columbia,May-20,396512,360847,35665,9.0
13195,CN1100100000000,11,1,District of Columbia,Jun-20,400830,364052,36778,9.2
16414,CN1100100000000,11,1,District of Columbia,Jul-20,407168,368356,38812,9.5
19633,CN1100100000000,11,1,District of Columbia,Aug-20,406441,370306,36135,8.9
22852,CN1100100000000,11,1,District of Columbia,Sep-20,404522,368874,35648,8.8
26071,CN1100100000000,11,1,District of Columbia,Oct-20,405870,372532,33338,8.2
29290,CN1100100000000,11,1,District of Columbia,Nov-20,409289,375016,34273,8.4


In [74]:
unemployment_df["month"] = unemployment_df["date"].map(lambda x: datetime.strptime(x,"%b-%y"))

In [75]:
unemployment_df = unemployment_df[["county",
                 "month",
                 "labor_force",
                 "employe",
                 "unemployed",
                 "unemploy_rate"
                ]]

unemployment_df = unemployment_df[unemployment_df["month"] >= "2021-01-01"]

In [76]:
unemployment_df[unemployment_df["county"].str.contains("/")]["county"].unique()

array(['Anchorage Borough/municipality, AK', 'Juneau Borough/city, AK',
       'Sitka Borough/city, AK', 'Wrangell Borough/city, AK',
       'Yakutat Borough/city, AK', 'San Francisco County/city, CA',
       'Broomfield County/city, CO', 'Denver County/city, CO',
       'Honolulu County/city, HI', 'Nantucket County/town, MA',
       'Philadelphia County/city, PA'], dtype=object)

In [77]:
unemployment_df["county"] = unemployment_df["county"].map(lambda x: "Anchorage Municipality, AK" if x == "Anchorage Borough/municipality, AK"
                                                          else "Doña Ana County, NM" if x == "Dona Ana County, NM"
                                                          else "Sitka City and Borough, AK" if x == 'Sitka Borough/city, AK'
                                                          else "Yakutat City and Borough, AK" if x == 'Yakutat Borough/city, AK'
                                                          else 'Wrangell City and Borough, AK' if x == 'Wrangell Borough/city, AK'
                                                          else "Juneau City and Borough, AK" if x == 'Juneau Borough/city, AK'
                                                          else x if "/" not in x 
                                                          else x.replace("/town","").replace("/city","").replace("/municipality",""))

In [78]:
unemployment_df["state_abbrev"] = unemployment_df["county"].map(lambda x: x.split(", ")[1] if x != "District of Columbia" else "DC")

In [79]:
unemployment_df["county_only"] = unemployment_df["county"].map(lambda x: x.split(", ")[0] if x != "District of Columbia" else "District of Columbia")

In [80]:
unemployment_df = unemployment_df[unemployment_df["state_abbrev"] != "PR"]

In [81]:
unemployment_df.columns

Index(['county', 'month', 'labor_force', 'employe', 'unemployed',
       'unemploy_rate', 'state_abbrev', 'county_only'],
      dtype='object')

In [82]:
unemployment_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9423 entries, 35409 to 44987
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   county         9423 non-null   object        
 1   month          9423 non-null   datetime64[ns]
 2   labor_force    9423 non-null   object        
 3   employe        9423 non-null   object        
 4   unemployed     9423 non-null   object        
 5   unemploy_rate  9423 non-null   object        
 6   state_abbrev   9423 non-null   object        
 7   county_only    9423 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 662.6+ KB


In [83]:
for columns in ["labor_force",
                "employe",
                "unemployed",
                "unemploy_rate"
               ]:
    unemployment_df[columns] = unemployment_df[columns].astype(float)

In [84]:
undemployed_avg = pd.DataFrame(unemployment_df.groupby(["month","state_abbrev"])[["labor_force","employe","unemployed","unemploy_rate"]].mean())

In [85]:
undemployed_avg.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,labor_force,employe,unemployed,unemploy_rate
month,state_abbrev,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01,AK,12037.862069,11206.827586,831.034483,9.303448
2021-01-01,AL,33267.298507,31920.776119,1346.522388,4.480597
2021-01-01,AR,17993.306667,17074.386667,918.92,5.596
2021-01-01,AZ,238193.133333,221732.866667,16460.266667,7.74
2021-01-01,CA,321067.862069,291409.327586,29658.534483,8.765517


In [86]:
ls_unallocated = []

for state in list(unemployment_df["state_abbrev"].unique()):
    for month in list(unemployment_df["month"].unique()):
            dic_unallocated = {}
            dic_unallocated["county"] = "Unallocated, " + state
            dic_unallocated["month"] = month
            dic_unallocated["labor_force"] = undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                             (undemployed_avg.index.get_level_values('state_abbrev') == state)]["labor_force"][0]
            dic_unallocated["employe"] = undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                             (undemployed_avg.index.get_level_values('state_abbrev') == state)]["employe"][0]
            dic_unallocated["unemployed"] = undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                             (undemployed_avg.index.get_level_values('state_abbrev') == state)]["unemployed"][0]
            dic_unallocated["unemploy_rate"] = undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                             (undemployed_avg.index.get_level_values('state_abbrev') == state)]["unemploy_rate"][0]
            dic_unallocated["state_abbrev"] = state
            dic_unallocated["county_only"] = "Unallocated"
            ls_unallocated.append(dic_unallocated)
            
for month in list(unemployment_df["month"].unique()):            
    ls_unallocated.append({"county": 'Kalawao County, HI',
                           "month": month,
                           "labor_force": undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                          (undemployed_avg.index.get_level_values('state_abbrev') == "HI")]["labor_force"][0],
                           "employe": undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                      (undemployed_avg.index.get_level_values('state_abbrev') == "HI")]["employe"][0],
                           "unemployed": undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                         (undemployed_avg.index.get_level_values('state_abbrev') == "HI")]["unemployed"][0],
                           "unemploy_rate": undemployed_avg[(undemployed_avg.index.get_level_values('month') == month) &
                                                            (undemployed_avg.index.get_level_values('state_abbrev') == "HI")]["unemploy_rate"][0],
                           "state_abbrev": 'HI',
                           "county_only": "Kalawao County"

    })
    
unallocated_unemployment = pd.DataFrame.from_dict(ls_unallocated)

In [87]:
unemployment_df = pd.merge(unemployment_df,
                           unallocated_unemployment,
                           how='outer'
                          )

In [88]:
unemployment_df = pd.merge(unemployment_df,
                           cdc_state[["State Abbreviation","State"]],
                           left_on = "state_abbrev",
                           right_on = "State Abbreviation",
                           how = "left"
                     )

unemployment_df.drop(columns = ["State Abbreviation","state_abbrev"],
                inplace = True
               )

In [89]:
cdc_county = pd.merge(cdc_county,
                      unemployment_df,
                      left_on=["month","County only","State"],
                      right_on=["month","county_only","State"],
                      how="left",
                      suffixes = ["","_x"]
                     )

In [90]:
cdc_county.head()

Unnamed: 0,County,County only,FIPS code,State,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,...,h7_flag,h8_protection_of_elderly_people,h8_flag,month,county,labor_force,employe,unemployed,unemploy_rate,county_only
0,"Unallocated, IA",Unallocated,19000,Iowa,2021-01-01,-40.0,-36.0,-15.0,-49.0,-75.0,...,1.0,2.0,1.0,2021-01-01,"Unallocated, IA",16441.313131,15648.929293,792.383838,4.535354,Unallocated
1,"Nodaway County, MO",Nodaway County,29147,Missouri,2021-01-01,-35.0,-46.0,-34.0,-52.0,-74.0,...,1.0,2.0,1.0,2021-01-01,"Nodaway County, MO",10510.0,10103.0,407.0,3.9,Nodaway County
2,"Jackson County, TN",Jackson County,47087,Tennessee,2021-01-01,-35.0,-28.0,8.0,-48.0,-63.0,...,1.0,2.0,1.0,2021-01-01,"Jackson County, TN",4866.0,4583.0,283.0,5.8,Jackson County
3,"Shelby County, TX",Shelby County,48419,Texas,2021-01-01,-27.0,-32.0,-36.0,-52.0,-69.0,...,1.0,2.0,1.0,2021-01-01,"Shelby County, TX",11348.0,10530.0,818.0,7.2,Shelby County
4,"Gaines County, TX",Gaines County,48165,Texas,2021-01-01,-46.0,-32.0,-36.0,-52.0,-76.0,...,1.0,2.0,1.0,2021-01-01,"Gaines County, TX",9839.0,9201.0,638.0,6.5,Gaines County


In [91]:
cdc_county.isnull().sum()

County                                                     0
County only                                                0
FIPS code                                                  0
State                                                      0
date                                                       0
retail_and_recreation_percent_change_from_baseline         0
grocery_and_pharmacy_percent_change_from_baseline          0
parks_percent_change_from_baseline                         0
transit_stations_percent_change_from_baseline              0
workplaces_percent_change_from_baseline                    0
residential_percent_change_from_baseline                   0
stringency_index                                       47880
government_response_index                              47880
containment_health_index                               47880
economic_support_index                                 47880
c1_school_closing                                      47880
c1_flag                 

In [92]:
cdc_county.shape

(424536, 50)

In [93]:
cdc_county.drop(columns = ["county","county_only"], inplace = True)
cdc_county.rename(columns = {"employe":"employed"}, inplace = True)

In [94]:
cdc_county = cdc_county[cdc_county["date"] < "2021-05-01"]
cdc_county.fillna(method='ffill', inplace = True)

In [95]:
cdc_county.isnull().sum()

County                                                0
County only                                           0
FIPS code                                             0
State                                                 0
date                                                  0
retail_and_recreation_percent_change_from_baseline    0
grocery_and_pharmacy_percent_change_from_baseline     0
parks_percent_change_from_baseline                    0
transit_stations_percent_change_from_baseline         0
workplaces_percent_change_from_baseline               0
residential_percent_change_from_baseline              0
stringency_index                                      0
government_response_index                             0
containment_health_index                              0
economic_support_index                                0
c1_school_closing                                     0
c1_flag                                               0
c2_workplace_closing                            

In [96]:
cdc_county.shape

(383040, 48)

### Joining CDC Full Data

In [97]:
cdc_full = pd.read_csv("../data/cdc_counties.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [98]:
cdc_full.head()

Unnamed: 0,County,FIPS code,County type,CBSA,CBSA type,State Abbreviation,FEMA region,Population,Population as a percent of CBSA,Population as a percent of state,...,Testing latency - absolute change.1,% tests resulted in 3 or fewer days - absolute change.1,Viral (RT-PCR) lab test positivity rate - 15-21 days ago (may be an underestimate due to delayed reporting),Total RT-PCR diagnostic tests - 15-21 days ago (may be an underestimate due to delayed reporting),RT-PCR tests per 100k - 15-21 days ago (may be an underestimate due to delayed reporting),Median test latency - 15-21 days ago,% tests resulted in 3 or fewer days - 15-21 days ago,% Native American / Alaskan Native,% Asian,Month
0,"Unallocated, MI",26000,,,,MI,5,,,,...,,,,,,,,,,2021-04-01
1,"Miami-Dade County, FL",12086,Large central metro,"Miami-Fort Lauderdale-Pompano Beach, FL",Metropolitan,FL,4,2716940.0,0.4406,0.1265,...,,,,,,,,,,2021-04-01
2,"Cook County, IL",17031,Large central metro,"Chicago-Naperville-Elgin, IL-IN-WI",Metropolitan,IL,5,5150233.0,0.5445,0.4064,...,,,,,,,,,,2021-04-01
3,"Wayne County, MI",26163,Large central metro,"Detroit-Warren-Dearborn, MI",Metropolitan,MI,5,1749343.0,0.405,0.1752,...,,,,,,,,,,2021-04-01
4,"Kings County, NY",36047,Large central metro,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan,NY,2,2559903.0,0.1332,0.1316,...,,,,,,,,,,2021-04-01


In [99]:
cdc_full["File Date"] = cdc_full["File Date"].map(lambda x: datetime.strptime(x,"%Y-%m-%d"))

In [100]:
cdc_county = pd.merge(cdc_county,
                      cdc_full,
                      left_on = ["FIPS code","date"],
                      right_on = ["FIPS code","File Date"],
                      how = "left",
                      suffixes = ["","_x"]
                     )

In [101]:
cdc_county.drop(columns = ["County_x",
                          "County only_x",
                          "File Date",
                          "Month"], inplace = True)

In [102]:
len(cdc_county.columns) == len(set([col for col in cdc_county.columns]))

True

In [103]:
cdc_county.shape

(383040, 148)

### Joining John Hopkins Data

In [104]:
jh_data_deaths = pd.read_csv("../data/time_series_covid19_deaths_US.csv")
jh_data_confirmed = pd.read_csv("../data/time_series_covid19_confirmed_US.csv")

In [105]:
jh_data_deaths.drop(columns = ["UID", "iso2", "iso3", "code3", "Country_Region", "Combined_Key", "Population"], inplace = True)
jh_data_confirmed.drop(columns = ["UID", "iso2", "iso3", "code3", "Country_Region", "Combined_Key"], inplace = True)

In [106]:
jh_data_deaths.head()

Unnamed: 0,FIPS,Admin2,Province_State,Lat,Long_,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,...,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21,5/14/21,5/15/21,5/16/21
0,1001.0,Autauga,Alabama,32.539527,-86.644082,0,0,0,0,0,...,108,108,108,108,108,108,108,108,108,108
1,1003.0,Baldwin,Alabama,30.72775,-87.722071,0,0,0,0,0,...,307,308,308,308,308,309,309,309,310,310
2,1005.0,Barbour,Alabama,31.868263,-85.387129,0,0,0,0,0,...,57,57,57,57,57,56,56,56,56,56
3,1007.0,Bibb,Alabama,32.996421,-87.125115,0,0,0,0,0,...,64,64,64,64,64,64,64,64,64,64
4,1009.0,Blount,Alabama,33.982109,-86.567906,0,0,0,0,0,...,137,137,137,137,137,137,137,139,139,139


In [107]:
jh_data_confirmed.head()

Unnamed: 0,FIPS,Admin2,Province_State,Lat,Long_,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,...,5/7/21,5/8/21,5/9/21,5/10/21,5/11/21,5/12/21,5/13/21,5/14/21,5/15/21,5/16/21
0,1001.0,Autauga,Alabama,32.539527,-86.644082,0,0,0,0,0,...,6918,6918,6920,6920,6926,6928,6938,6971,7001,7005
1,1003.0,Baldwin,Alabama,30.72775,-87.722071,0,0,0,0,0,...,21107,21123,21131,21135,21154,21170,21191,21290,21392,21411
2,1005.0,Barbour,Alabama,31.868263,-85.387129,0,0,0,0,0,...,2307,2307,2308,2308,2310,2314,2317,2319,2320,2320
3,1007.0,Bibb,Alabama,32.996421,-87.125115,0,0,0,0,0,...,2604,2605,2607,2607,2609,2612,2615,2630,2645,2647
4,1009.0,Blount,Alabama,33.982109,-86.567906,0,0,0,0,0,...,6651,6656,6660,6661,6678,6680,6694,6750,6771,6773


In [108]:
jh_data_deaths = jh_data_deaths.melt(id_vars=["FIPS",
                         "Admin2",
                         "Province_State",
                         "Lat",
                         "Long_"], 
                 var_name="date", 
                 value_name="daily_deaths")

In [109]:
jh_data_deaths["date"] = pd.to_datetime(jh_data_deaths["date"])
jh_data_deaths = jh_data_deaths[(jh_data_deaths["date"]>= "2021-01-01") &
                                (jh_data_deaths["date"]< "2021-05-01")]
jh_data_deaths.reset_index(inplace = True, drop = True)

In [110]:
jh_data_deaths.head()

Unnamed: 0,FIPS,Admin2,Province_State,Lat,Long_,date,daily_deaths
0,1001.0,Autauga,Alabama,32.539527,-86.644082,2021-01-01,50
1,1003.0,Baldwin,Alabama,30.72775,-87.722071,2021-01-01,169
2,1005.0,Barbour,Alabama,31.868263,-85.387129,2021-01-01,33
3,1007.0,Bibb,Alabama,32.996421,-87.125115,2021-01-01,46
4,1009.0,Blount,Alabama,33.982109,-86.567906,2021-01-01,63


In [111]:
jh_data_confirmed = jh_data_confirmed.melt(id_vars=["FIPS",
                         "Admin2",
                         "Province_State",
                         "Lat",
                         "Long_"], 
                 var_name="date", 
                 value_name="daily_confirmed_cases")

In [112]:
jh_data_confirmed["date"] = pd.to_datetime(jh_data_confirmed["date"])
jh_data_confirmed = jh_data_confirmed[(jh_data_confirmed["date"]>= "2021-01-01") &
                                      (jh_data_confirmed["date"]< "2021-05-01")
                                     ]
jh_data_confirmed.reset_index(inplace = True, drop = True)

In [113]:
jh_data_confirmed.head()

Unnamed: 0,FIPS,Admin2,Province_State,Lat,Long_,date,daily_confirmed_cases
0,1001.0,Autauga,Alabama,32.539527,-86.644082,2021-01-01,4239
1,1003.0,Baldwin,Alabama,30.72775,-87.722071,2021-01-01,13823
2,1005.0,Barbour,Alabama,31.868263,-85.387129,2021-01-01,1517
3,1007.0,Bibb,Alabama,32.996421,-87.125115,2021-01-01,1854
4,1009.0,Blount,Alabama,33.982109,-86.567906,2021-01-01,4693


In [114]:
non_states = ["American Samoa", "Guam", "Grand Princess", "Puerto Rico","Northern Mariana Islands", "Diamond Princess", "Virgin Islands"]

In [115]:
jh_data_deaths = jh_data_deaths[jh_data_deaths["Province_State"].isin(non_states) == False]
jh_data_confirmed = jh_data_confirmed[jh_data_confirmed["Province_State"].isin(non_states) == False]

In [116]:
combo_death = pd.merge(jh_data_deaths[["FIPS","Admin2","Province_State"]].drop_duplicates(),
         cdc_county[["FIPS code","County","State"]].drop_duplicates(),
         left_on = "FIPS",
         right_on = "FIPS code",
         how = "outer"
        )

In [117]:
fips_to_drop_death = combo_death[combo_death["FIPS code"].isna()]["FIPS"]

In [118]:
jh_data_deaths = jh_data_deaths[jh_data_deaths["FIPS"].isin(fips_to_drop_death) == False]

In [119]:
combo_confirmed = pd.merge(jh_data_confirmed[["FIPS","Admin2","Province_State"]].drop_duplicates(),
         cdc_county[["FIPS code","County","State"]].drop_duplicates(),
         left_on = "FIPS",
         right_on = "FIPS code",
         how = "outer"
        )

In [120]:
fips_to_drop_confirmed = combo_confirmed[combo_confirmed["FIPS code"].isna()]["FIPS"]

In [121]:
jh_data_confirmed = jh_data_confirmed[jh_data_confirmed["FIPS"].isin(fips_to_drop_confirmed) == False]

In [122]:
avg_deaths = pd.DataFrame(jh_data_deaths.groupby(["date","Province_State"])["daily_deaths"].mean())

In [123]:
avg_confirmed = pd.DataFrame(jh_data_confirmed.groupby(["date","Province_State"])["daily_confirmed_cases"].mean())

In [124]:
ls_unallocated = []
for state in list(jh_data_deaths["Province_State"].unique()):
    for date in list(jh_data_deaths["date"].unique()):
            dic_unallocated = {}
            dic_unallocated["FIPS"] = [combo_death[(combo_death["State"] == state) & 
                                                  (combo_death["County"].str.contains("Unallocated"))]["FIPS code"].values[0]
                                       if combo_death[(combo_death["State"] == state) & 
                                                  (combo_death["County"].str.contains("Unallocated"))]["FIPS code"].values.size != 0
                                       else 0][0]
                
            dic_unallocated['Admin2'] = "Unallocated"
            dic_unallocated["Province_State"] = state
            dic_unallocated["Lat"] =  0.0
            dic_unallocated["Long_"] = 0.0
            dic_unallocated["date"] = date
            dic_unallocated["daily_deaths"] = avg_deaths[(avg_deaths.index.get_level_values('date') == date) &
                                                   (avg_deaths.index.get_level_values('Province_State') == state)]["daily_deaths"][0]
            ls_unallocated.append(dic_unallocated)
temp = pd.DataFrame.from_dict(ls_unallocated)
jh_data_deaths = pd.concat([jh_data_deaths,
                            temp])

In [125]:
ls_unallocated = []
for state in list(jh_data_confirmed["Province_State"].unique()):
    for date in list(jh_data_confirmed["date"].unique()):
            dic_unallocated = {}
            dic_unallocated["FIPS"] = [combo_confirmed[(combo_confirmed["State"] == state) & 
                                                  (combo_confirmed["County"].str.contains("Unallocated"))]["FIPS code"].values[0]
                                       if combo_confirmed[(combo_confirmed["State"] == state) & 
                                                  (combo_confirmed["County"].str.contains("Unallocated"))]["FIPS code"].values.size != 0
                                       else 0][0]
                
            dic_unallocated['Admin2'] = "Unallocated"
            dic_unallocated["Province_State"] = state
            dic_unallocated["Lat"] =  0.0
            dic_unallocated["Long_"] = 0.0
            dic_unallocated["date"] = date
            dic_unallocated["daily_confirmed_cases"] = avg_confirmed[(avg_confirmed.index.get_level_values('date') == date) &
                                                   (avg_confirmed.index.get_level_values('Province_State') == state)]["daily_confirmed_cases"][0]
            ls_unallocated.append(dic_unallocated)
temp = pd.DataFrame.from_dict(ls_unallocated)
jh_data_confirmed = pd.concat([jh_data_confirmed,
                            temp])

In [126]:
jh_data_confirmed.isnull().sum()

FIPS                     0
Admin2                   0
Province_State           0
Lat                      0
Long_                    0
date                     0
daily_confirmed_cases    0
dtype: int64

In [127]:
jh_data_deaths.isnull().sum()

FIPS              0
Admin2            0
Province_State    0
Lat               0
Long_             0
date              0
daily_deaths      0
dtype: int64

In [128]:
jh_data_confirmed.shape, jh_data_deaths.shape

((383160, 7), (383160, 7))

In [129]:
cdc_county = pd.merge(cdc_county,
                      jh_data_deaths[["FIPS","Lat","Long_","date","daily_deaths"]],
                      left_on = ["FIPS code","date"],
                      right_on = ["FIPS","date"],
                      how = "left",
                      suffixes = ["","_x"]
                     )

In [130]:
cdc_county.drop(columns=["FIPS"], inplace = True)

In [131]:
cdc_county = pd.merge(cdc_county,
                      jh_data_confirmed[["FIPS","Lat","Long_","date","daily_confirmed_cases"]],
                      left_on = ["FIPS code","date"],
                      right_on = ["FIPS","date"],
                      how = "left",
                      suffixes = ["","_x"]
                     )

In [132]:
cdc_county.drop(columns=["FIPS","Lat_x","Long__x"], inplace = True)

In [133]:
cdc_county.shape

(383040, 152)

### Creating Dataframes for EDA

In [134]:
cdc_county.sort_values("date", inplace = True)

In [135]:
cdc_county.reset_index(inplace = True, drop = True)

In [136]:
cdc_county.fillna(method='ffill', inplace = True)

In [137]:
cdc_county.fillna(method='bfill', inplace = True)

In [138]:
[col for col in cdc_county.columns if cdc_county[col].dtype == "object"]

['County',
 'County only',
 'State',
 'County type',
 'CBSA',
 'CBSA type',
 'State Abbreviation',
 'FEMA region',
 'Area of Concern Category',
 'Rapid Riser Category',
 'Community Transmission Level - last 7 days',
 'Community Transmission Level - previous 7 days',
 'Forecasted case trajectory']

In [139]:
to_transpose = cdc_county.drop(columns = [
    'County only',
    'State',
    'County type',
    'CBSA',
    'CBSA type',
    'State Abbreviation',
    'FEMA region',
    'Area of Concern Category',
    'Rapid Riser Category',
    'Community Transmission Level - last 7 days',
    'Community Transmission Level - previous 7 days',
    'Forecasted case trajectory'
])

In [140]:
numeric_cols = [col for col in cdc_county.columns 
                if (cdc_county[col].dtype != "object") 
                and (col != "month" )
                and (col != "date") 
                and (col != "FIPS code")]

In [141]:
num_cols_ = []
for col in numeric_cols:
    to_transpose.rename(columns = {col:col.replace("-","").replace(" ","_").replace("%","pct").replace("/","or").replace("#","no").replace(".","rnd").replace("+","more").replace(">","over").replace("(","").replace(")","")},inplace = True)
    num_cols_.append(col.replace("-","").replace(" ","_").replace("%","pct").replace("/","or").replace("#","no").replace(".","rnd").replace("+","more").replace(">","over").replace("(","").replace(")",""))

In [142]:
num_cols_

['retail_and_recreation_percent_change_from_baseline',
 'grocery_and_pharmacy_percent_change_from_baseline',
 'parks_percent_change_from_baseline',
 'transit_stations_percent_change_from_baseline',
 'workplaces_percent_change_from_baseline',
 'residential_percent_change_from_baseline',
 'stringency_index',
 'government_response_index',
 'containment_health_index',
 'economic_support_index',
 'c1_school_closing',
 'c1_flag',
 'c2_workplace_closing',
 'c2_flag',
 'c3_cancel_public_events',
 'c3_flag',
 'c4_restrictions_on_gatherings',
 'c4_flag',
 'c5_close_public_transport',
 'c5_flag',
 'c6_stay_at_home_requirements',
 'c6_flag',
 'c7_movementrestrictions',
 'c7_flag',
 'c8_internationaltravel',
 'e1_income_support',
 'e1_flag',
 'e2_debtrelief',
 'h1_public_information_campaigns',
 'h1_flag',
 'h2_testing_policy',
 'h3_contact_tracing',
 'h6_facial_coverings',
 'h6_flag',
 'h7_vaccination_policy',
 'h7_flag',
 'h8_protection_of_elderly_people',
 'h8_flag',
 'labor_force',
 'employed',

In [143]:
for col in num_cols_:
    df = to_transpose.pivot(index="County", columns="date", values=col)
    locals()[col] = df

In [144]:
numeric_df = [retail_and_recreation_percent_change_from_baseline,
 grocery_and_pharmacy_percent_change_from_baseline,
 parks_percent_change_from_baseline,
 transit_stations_percent_change_from_baseline,
 workplaces_percent_change_from_baseline,
 residential_percent_change_from_baseline,
 stringency_index,
 government_response_index,
 containment_health_index,
 economic_support_index,
 c1_school_closing,
 c1_flag,
 c2_workplace_closing,
 c2_flag,
 c3_cancel_public_events,
 c3_flag,
 c4_restrictions_on_gatherings,
 c4_flag,
 c5_close_public_transport,
 c5_flag,
 c6_stay_at_home_requirements,
 c6_flag,
 c7_movementrestrictions,
 c7_flag,
 c8_internationaltravel,
 e1_income_support,
 e1_flag,
 e2_debtrelief,
 h1_public_information_campaigns,
 h1_flag,
 h2_testing_policy,
 h3_contact_tracing,
 h6_facial_coverings,
 h6_flag,
 h7_vaccination_policy,
 h7_flag,
 h8_protection_of_elderly_people,
 h8_flag,
 labor_force,
 employed,
 unemployed,
 unemploy_rate,
 Population,
 Population_as_a_percent_of_CBSA,
 Population_as_a_percent_of_state,
 Population_as_a_percent_of_national_population,
 IHE_with_over5000_fulltime_enrollment,
 IHE_Fulltime_enrollment,
 IHE_Fulltime_enrollment_as_a_percent_of_the_population,
 Cases_as_a_percent_of_national_total__last_7_days,
 Cases__last_7_days,
 Cases_per_100k__last_7_days,
 Deaths__last_7_days,
 Deaths_per_100k__last_7_days,
 Cases__pct_change,
 Deaths__pct_change,
 Cases_as_a_percent_of_national_total__previous_7_days,
 Cases__previous_7_days,
 Cases_per_100k__previous_7_days,
 Deaths__previous_7_days,
 Deaths_per_100k__previous_7_days,
 Cumulative_cases,
 Cumulative_cases_per_100k,
 Cumulative_deaths,
 Cumulative_deaths_per_100k,
 Rapid_rise_last_14_days,
 Number_of_days_of_downward_case_trajectory,
 Viral_RTPCR_lab_test_positivity_rate__last_7_days_may_be_an_underestimate_due_to_delayed_reporting,
 Total_RTPCR_diagnostic_tests__last_7_days_may_be_an_underestimate_due_to_delayed_reporting,
 RTPCR_tests_per_100k__last_7_days_may_be_an_underestimate_due_to_delayed_reporting,
 Median_test_latency__last_7_days,
 pct_tests_resulted_in_3_or_fewer_days__last_7_days,
 Viral_RTPCR_lab_test_positivity_rate__absolute_change_may_be_an_underestimate_due_to_delayed_reporting,
 Total_RTPCR_diagnostic_tests__pct_change_may_be_an_underestimate_due_to_delayed_reporting,
 Testing_latency__absolute_change,
 pct_tests_resulted_in_3_or_fewer_days__absolute_change,
 Viral_RTPCR_lab_test_positivity_rate__previous_7_days_may_be_an_underestimate_due_to_delayed_reporting,
 Total_RTPCR_diagnostic_tests__previous_7_days_may_be_an_underestimate_due_to_delayed_reporting,
 RTPCR_tests_per_100k__previous_7_daysmay_be_an_underestimate_due_to_delayed_reporting,
 Median_test_latency__last_7_daysrnd1,
 pct_tests_resulted_in_3_or_fewer_days__last_7_daysrnd1,
 Confirmed_COVID19_admissions__last_7_days,
 Confirmed_COVID19_admissions_per_100_inpatient_beds__last_7_days,
 Suspected_COVID19_admissions__last_7_days,
 Suspected_COVID19_admissions_per_100_inpatient_beds__last_7_days,
 pct_inpatient_beds_occupied,
 pct_inpatient_beds_occupied_by_COVID19_patient,
 pct_staffed_adult_ICU_beds_occupied,
 pct_staffed_adult_ICU_beds_occupied_by_COVID19_patient,
 pct_ventilators_in_use,
 pct_ventilators_in_use_by_COVID19_patient,
 Confirmed_COVID19_admissions_per_100_inpatient_beds__percent_increase,
 Suspected_COVID19_admissions_per_100_inpatient_beds__percent_increase,
 pct_inpatient_beds_occupied__absolute_change,
 pct_inpatient_beds_occupied_by_COVID19_patient__absolute_change,
 pct_staffed_adult_ICU_beds_occupied__absolute_change,
 pct_staffed_adult_ICU_beds_occupied_by_COVID19_patient__absolute_change,
 pct_ventilators_in_use__absolute_change,
 pct_ventilators_in_use_by_COVID19_patient__absolute_change,
 Total_no_of_hospital_CCNs,
 Total_inpatient_beds_among_hospitals_reporting__last_7_days,
 Total_staffed_adult_ICU_beds_among_hospitals_reporting__last_7_days,
 Total_ventilators_among_hospitals_reporting__last_7_days,
 pct_hospital_CCNs_reporting_any_utilization_data_at_least_once__last_7_days,
 pct_hospital_CCNs_reporting_all_utilization_data__at_least_once__last_7_days,
 pct_hospital_CCNs_ever_reporting_utilization_data,
 pct_hospital_CCNs_reporting_COVID19_admissions_at_least_once__last_7_days,
 People_who_are_fully_vaccinated,
 People_who_are_fully_vaccinated_as_pct_of_total_population,
 People_who_are_fully_vaccinated__ages_65more,
 People_who_are_fully_vaccinated_as_pct_of_population__ages_65more,
 pct_Uninsured,
 pct_In_Poverty,
 pct_Over_Age_65,
 Average_household_size,
 pct_NonHispanic_Black,
 pct_Hispanic,
 pct_NonHispanic_Native_American_or_Alaskan_Native,
 pct_NonHispanic_Asian,
 SVI_score,
 CCVI_score,
 Viral_RTPCR_lab_test_positivity_rate__absolute_change_may_be_an_underestimate_due_to_delayed_reportingrnd1,
 Total_RTPCR_diagnostic_tests__pct_change_may_be_an_underestimate_due_to_delayed_reportingrnd1,
 Testing_latency__absolute_changernd1,
 pct_tests_resulted_in_3_or_fewer_days__absolute_changernd1,
 Viral_RTPCR_lab_test_positivity_rate__1521_days_ago_may_be_an_underestimate_due_to_delayed_reporting,
 Total_RTPCR_diagnostic_tests__1521_days_ago_may_be_an_underestimate_due_to_delayed_reporting,
 RTPCR_tests_per_100k__1521_days_ago_may_be_an_underestimate_due_to_delayed_reporting,
 Median_test_latency__1521_days_ago,
 pct_tests_resulted_in_3_or_fewer_days__1521_days_ago,
 pct_Native_American_or_Alaskan_Native,
 pct_Asian,
 Lat,
 Long_,
 daily_deaths,
 daily_confirmed_cases]

In [145]:
pd.DataFrame(cdc_county.isnull().sum()).sort_values(by = 0, ascending = False).head()

Unnamed: 0,0
County,0
% ventilators in use,0
Confirmed COVID-19 admissions per 100 inpatient beds - last 7 days,0
Suspected COVID-19 admissions - last 7 days,0
Suspected COVID-19 admissions per 100 inpatient beds - last 7 days,0


In [146]:
cdc_county.set_index("date", drop = True, inplace = True)
cdc_county.to_csv("../data/complete_df.csv",index = False)