In [107]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [108]:
# TODO: automate data acquisition

In [109]:
# import data
df_cases = pd.read_csv("data/RAW_us_confirmed_cases.csv")
df_deaths_md = pd.read_csv("data/CONVENIENT_us_metadata.csv")
df_masks = pd.read_csv("data/mask-use-by-county.csv")
df_policy = pd.read_csv("data/U.S._State_and_Territorial_Public_Mask_Mandates_From_April_10__2020_through_August_15__2021_by_County_by_Day.csv")

print(df_cases.head())

  Province_State   Admin2       UID iso2 iso3  code3    FIPS Country_Region  \
0        Alabama  Autauga  84001001   US  USA    840  1001.0             US   
1        Alabama  Baldwin  84001003   US  USA    840  1003.0             US   
2        Alabama  Barbour  84001005   US  USA    840  1005.0             US   
3        Alabama     Bibb  84001007   US  USA    840  1007.0             US   
4        Alabama   Blount  84001009   US  USA    840  1009.0             US   

         Lat      Long_  ... 10/23/22  10/24/22  10/25/22  10/26/22  10/27/22  \
0  32.539527 -86.644082  ...    18480     18480     18480     18480     18511   
1  30.727750 -87.722071  ...    65895     65895     65895     65895     65973   
2  31.868263 -85.387129  ...     6926      6926      6926      6926      6930   
3  32.996421 -87.125115  ...     7560      7560      7560      7560      7575   
4  33.982109 -86.567906  ...    17286     17286     17286     17286     17320   

   10/28/22  10/29/22  10/30/22  10/31

Here, we narrow the data down, particulary to the county of interst, Pierce County in WA.



In [110]:

p_population = df_deaths_md.query("Province_State == 'Washington' and Admin2 == 'Pierce'")["Population"]
p_masks = df_masks.query("COUNTYFP == 53053")

p_cases_tmp = df_cases.query("Province_State == 'Washington' and Admin2 == 'Pierce'")
id_vars = ["Province_State","Admin2","UID","iso2","iso3","code3","FIPS","Country_Region","Lat","Long_","Combined_Key"]
p_cases = p_cases_tmp.melt(id_vars=id_vars, var_name="date", value_name="cases")

p_policy = df_policy.query("County_Name == 'Pierce County' and State_Tribe_Territory == 'WA'")
p_policy["County"] = "Pierce"
p_policy["masks_mandated"] = p_policy["Face_Masks_Required_in_Public"].apply(lambda x: "yes" if x == "Yes" else "no")



     Province_State  Admin2       UID iso2 iso3  code3     FIPS  \
3172     Washington  Pierce  84053053   US  USA    840  53053.0   

     Country_Region        Lat       Long_  ... 10/23/22  10/24/22  10/25/22  \
3172             US  47.038928 -122.140596  ...   231293    231470    231470   

      10/26/22  10/27/22  10/28/22  10/29/22  10/30/22  10/31/22  11/1/22  
3172    231714    231714    231714    231714    231714    232296   232296  

[1 rows x 1026 columns]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Here, we perform a rough EDA. 

Observice that the steady increase of cumulative cases, though growth rate varies. 

We also notice 1712 revoked cases, resulting in days with negative case counts. These are zeroed. 

FInally, we observe that, for the data available, once the mask mandate was put into effect, it stayed.

In [130]:
px.bar(p_cases, x="date", y="cases", title="Pierce County Cumulative Cases").show()

p_cases['cases_per_day'] = p_cases['cases'].diff()
p_cases['cases_accumulative'] = p_cases['cases']
px.bar(p_cases, x="date", y="cases_per_day", title="Pierce County Daily Cases", log_y=True).show()

# How many negative cases are there?
case_adjustments = p_cases.query("cases_per_day < 0")
print("Number of cases over counted: ", -case_adjustments['cases_per_day'].sum())

p_cases_no_adjustments = p_cases.query("cases_per_day >= 0")
px.bar(p_cases_no_adjustments, x="date", y="cases_per_day", title="Pierce County Daily Cases (Adjustments removed)", log_y=True).show()

Number of cases over counted:  1712.0


In [132]:
# Now let's try to udnerstand the mask policy
print(p_policy["Face_Masks_Required_in_Public"].unique())
px.line(p_policy, x="date", y="masks_mandated", title="Pierce County Mask Mandate").show()

[nan 'Yes']


Now we try to observe a visual relationship between mask mandate and new infections. I didn't notice a district relationship from the plots above, so I tried to increase the complexity of the model. I am forced to make some big assumptions that are obviously not true:
* Once a person is infected, they cannot be reinfected (I know this is not true, but I don't immediately know how to model it)
* No travel to and from Pierce County.

Infection rate is simply the the number of new cases each day divided by the remaing population that hasn't been infected. 

In [115]:
# Remove infected people from population 
p_cases['county_population'] = p_population.iloc[0]
p_cases.head()

p_cases["uninfected_population"] = p_cases["county_population"] - p_cases["cases_accumulative"]
p_cases["infection_rate"] = (p_cases["cases_per_day"] / p_cases["uninfected_population"]) * 100


In [116]:
px.bar(p_cases, x="date", y="infection_rate", title="Infection Rate", log_y=True).show()

The visualization is not conclusive. While we do see a decline in infection rate after the mandate comes into effect, the best we can do is not the association. The model is extremely simply, and almost certianly missies important, causal variables. 

In [129]:

#print(p_cases.head())
p_cases['date'] = pd.to_datetime(p_cases['date'])
p_cases_clipped = p_cases.query("date <= '2021-8-13'")
print(type(p_cases_clipped['date'].iloc[0]))
p_cases_clipped['mask_mandate'] = p_cases_clipped.apply(lambda x: "yes" if x['date'] > pd.Timestamp("2020-06-26") else "no", axis=1)

px.bar(p_cases_clipped.sort_values(by="date"), 
        x="date", y="infection_rate", color="mask_mandate", 
        title="Infection rate and mask mandate for Pierce County", 
        labels={
            "date": "Date",
            "infection_rate": "Infection Rate (dailyInfections/healthyPopulation)",
            "mask_mandate": "Mask Mandate",
        },
        log_y=True).show()


<class 'pandas._libs.tslibs.timestamps.Timestamp'>




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

