In [113]:
import requests
import pandas as pd
import ssl
import datetime
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, which disables TLS certificate verification for urllib-based
# HTTPS requests in this kernel (presumably needed for the pd.read_csv download
# below). Prefer fixing the local certificate store and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context

# CDC dataset, downloaded as CSV straight into a DataFrame.
cdc_raw = pd.read_csv("https://data.cdc.gov/api/views/8xkx-amqh/rows.csv?accessType=DOWNLOAD")

# County records from the Covid Act Now API.
# NOTE(review): the API key is committed in plain text; consider loading it
# from the environment and rotating this key.
can_raw = requests.get(
    "https://api.covidactnow.org/v2/counties.json?apiKey=81d0e97ecec0406abf12c80d6cd8ec93",
    timeout=60,  # do not hang the kernel forever on a stalled connection
)
# Fail here, with the HTTP status, rather than later when the body is parsed as JSON.
can_raw.raise_for_status()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [30]:
# Flatten each county record from the API response: start from its "actuals"
# mapping and tag it with the county's identifying fields.
rows = []
for record in can_raw.json():
    actuals = record['actuals']
    actuals.update(county=record['county'], state=record['state'], fips=record['fips'])
    rows.append(actuals)

# Keep only the vaccination counts plus the identifying columns, and prefix the
# counts with "can_" so they can sit next to the CDC columns after a merge.
can = (
    pd.DataFrame.from_records(rows)
    .loc[:, ["vaccinationsInitiated", "vaccinationsCompleted", "state", "county", "fips"]]
    .rename(
        columns={
            "vaccinationsInitiated": "can_initiated",
            "vaccinationsCompleted": "can_completed",
        }
    )
)

In [143]:
# The "Date" column is matched against an MM/DD/YYYY string for yesterday.
yesterday = (datetime.date.today() - datetime.timedelta(days=1)).strftime("%m/%d/%Y")

# Keep only yesterday's rows and the columns used below, renamed to the short
# names shared with the `can` frame.
cdc = (
    cdc_raw.loc[
        cdc_raw["Date"] == yesterday,
        ["FIPS", "Recip_County", "Recip_State", "Administered_Dose1_Recip", "Series_Complete_Yes"],
    ]
    .rename(
        columns={
            "FIPS": "fips",
            "Recip_County": "county",
            "Recip_State": "state",
            "Administered_Dose1_Recip": "cdc_initiated",
            "Series_Complete_Yes": "cdc_completed",
        }
    )
)

## CDC County Coverage

In [144]:
# A row counts as fully covered when it is not the "Unknown County" placeholder
# and neither of its two dose counts is zero.
cdc_coverage = int(
    (
        cdc["cdc_initiated"].ne(0)
        & cdc["county"].ne("Unknown County")
        & cdc["cdc_completed"].ne(0)
    ).sum()
)
print(f"CDC has full coverage for {cdc_coverage} counties")

CDC has full coverage for 2737 counties


## CDC missing counties

In [145]:
# Rows reported under the placeholder name "Unknown County".
unknown = len(cdc.loc[cdc["county"] == "Unknown County"])

# Rows with a zero first-dose count. To count only rows with neither first-dose
# nor completed data, also require (cdc["cdc_completed"] == 0).
# Named `missing_count` (not `missing`) so that re-running this cell cannot
# overwrite the `missing()` helper function defined further down.
missing_count = len(cdc.loc[cdc["cdc_initiated"] == 0])

print(f"CDC has no data for {missing_count} counties, and tracks {unknown} unknown counties")

CDC has no data for 491 counties, and tracks 58 unknown counties


In [146]:
def missing(var, data=None):
    """Summarise, per state, how many rows have a zero value in column `var`.

    Parameters
    ----------
    var : str
        Name of the count column to inspect (e.g. "cdc_initiated").
    data : pandas.DataFrame, optional
        Frame with a "state" column and the `var` column. Defaults to the
        notebook-level `cdc` frame, which keeps existing calls working.

    Returns
    -------
    pandas.DataFrame
        Indexed by state, with columns "missing counties", "total counties"
        and "% missing" (rounded to 2 decimals), sorted by "% missing"
        descending. States with no zero-valued rows are omitted.
    """
    frame = cdc if data is None else data

    missing_by_state = frame.loc[frame[var] == 0].groupby("state").size()
    total_by_state = frame.groupby("state").size()

    # States without any zero-valued row get NaN in the first column; drop them.
    compare = pd.concat([missing_by_state, total_by_state], axis=1).dropna()
    compare.columns = ["missing counties", "total counties"]

    # Scale to a percentage first and round last, so the result has no
    # floating-point residue such as 7.000000000000001.
    compare["% missing"] = (
        compare["missing counties"] / compare["total counties"] * 100
    ).round(2)
    return compare.sort_values("% missing", ascending=False)

#### States with Counties missing 1+ dose data

In [147]:
# Per-state tally of rows whose first-dose count is zero, with "state" moved
# out of the index into a regular column.
missing_initiated = missing(var="cdc_initiated").reset_index(drop=False)
missing_initiated

Unnamed: 0,state,missing counties,total counties,% missing
0,HI,5.0,5,100.0
1,TX,254.0,254,100.0
2,NE,76.0,94,80.85
3,SD,42.0,67,62.69
4,NM,19.0,34,55.88
5,CO,34.0,65,52.31
6,DC,1.0,2,50.0
7,OH,15.0,89,16.85
8,GA,24.0,160,15.0
9,CA,8.0,59,13.56


#### States with Counties missing vaccine completed data

In [148]:
# Same per-state breakdown, for rows whose completed-series count is zero.
missing(var="cdc_completed")

Unnamed: 0_level_0,missing counties,total counties,% missing
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HI,5.0,5,100.0
TX,254.0,254,100.0
CA,8.0,59,13.56
VA,7.0,134,5.22


## Compare CDC and Dashboard Data

In [149]:
# Keep every CDC row, attach the matching dashboard row on FIPS + state, then
# discard the rows whose FIPS code is the "UNK" placeholder.
full = (
    cdc.merge(can, how="left", on=["fips", "state"])
    .loc[lambda merged: merged["fips"] != "UNK"]
)

### Percent Differences
`percent_init_difference` is the difference between the CDC value and the state-dashboard value, relative to their average:
\begin{equation*}
\Bigg(\frac{\mbox{cdc values} - \mbox{state values}}{\frac{\mbox{cdc values} + \mbox{state values}}{2}}\Bigg) \times 100
\end{equation*}

In [169]:
# Add the absolute gap between the two first-dose counts, then express it as a
# percentage of the average of the two counts.
full = (
    full
    .assign(initiated_difference=lambda df: df["cdc_initiated"].sub(df["can_initiated"]))
    .assign(
        percent_init_difference=lambda df: (
            df["initiated_difference"]
            .div(df["cdc_initiated"].add(df["can_initiated"]).div(2))
            .mul(100)
        )
    )
)

#### Summary of % difference

In [172]:
# Percent differences restricted to rows where the CDC first-dose count is non-zero.
pct_diff = full.loc[full["cdc_initiated"] != 0, "percent_init_difference"]

# Standard descriptive statistics, with the median appended as an extra row.
summary = pct_diff.describe()
summary["median"] = pct_diff.median()
summary.to_frame()

Unnamed: 0,percent_init_difference
count,2723.0
mean,-8.349674
std,33.807794
min,-195.959596
25%,-5.84028
50%,-0.431344
75%,3.072435
max,182.399531
median,-0.431344


#### States with largest under-representation by CDC data
(states where state-dashboard values are larger than the CDC values, with a mean percent difference below -5%)

In [167]:
# Mean difference per state, over rows where the CDC first-dose count is non-zero.
# The two difference columns are selected explicitly before averaging: `full`
# also carries text columns (county names, FIPS codes), and GroupBy.mean()
# raises a TypeError on those in pandas >= 2.0 instead of silently dropping them.
d = (
    full.query("cdc_initiated != 0")
    .groupby("state")[["initiated_difference", "percent_init_difference"]]
    .mean()
    .sort_values("percent_init_difference", ascending=False)
    .reset_index()
)

# States whose mean percent difference is below -5, i.e. where the dashboard
# value is larger than the CDC value.
high_discrepancies = d.loc[d["percent_init_difference"] < -5]
high_discrepancies

Unnamed: 0,state,initiated_difference,percent_init_difference
42,IL,-3379.176471,-5.235285
43,KS,-110.771429,-5.861061
44,NM,-8577.785714,-16.211599
45,ND,-803.566038,-20.232754
46,VT,-6135.5,-26.432973
47,WV,-5397.581818,-57.348546
48,GA,-12292.977778,-75.1176
49,VA,-14382.821138,-91.317769


In [173]:
# out of the worst 250 counties(in terms of % diff), which states do these counties belong to?
# f = full.query("cdc_initiated != 0")
# f["percent_init_difference"] = f["percent_init_difference"].abs()
# f = f.sort_values("percent_init_difference", ascending=False).head(250)
# pd.DataFrame(f.groupby("state").size().sort_values(ascending=False))

## States to block:
#### States are detected if they are missing any counties, or have a mean % difference of < -5% 
(negative % diff indicates state dashboard values are larger than CDC values)

In [168]:
# States that must never be blocked:
#   - states for which we currently use CDC data and therefore cannot block;
#   - DC, which is not actually missing a county;
#   - Illinois, which has no 1+ dose data, so the CDC 1+ figure is needed.
# NOTE(review): "FL" is listed twice; harmless for the membership test below.
states_to_not_block = ["IL", "CO", "FL", "DC", "MA", "AR", "DE", "FL", "KS", "KY", "NH", "NE", "OK", "PR", "RI", "SC", "UT"]

# Stack the two lists of flagged states (one explanation each), merge the
# explanations of states flagged for both reasons into a single string, and
# drop the protected states.
block = (
    pd.concat(
        [
            missing_initiated[["state"]].assign(explanation="missing counties"),
            high_discrepancies[["state"]].assign(
                explanation="% difference of < -5% (CDC under-counting)"
            ),
        ]
    )
    .groupby("state")["explanation"]
    .apply(lambda reasons: " and ".join(reasons))
    .reset_index()
    .loc[lambda flagged: ~flagged["state"].isin(states_to_not_block)]
)
block

Unnamed: 0,state,explanation
0,CA,missing counties
4,GA,missing counties and % difference of < -5% (CD...
5,HI,missing counties
8,MI,missing counties
9,MT,missing counties
10,ND,% difference of < -5% (CDC under-counting)
12,NJ,missing counties
13,NM,missing counties and % difference of < -5% (CD...
14,OH,missing counties
15,SD,missing counties
