# CDC Databases for COVID

In [None]:
# Provisional COVID-19 Death Counts in the United States by County NCHS
"https://data.cdc.gov/resource/kn79-hsxy.json"

# United States COVID-19 Cases and Deaths by State over Time Case Surveillance
"https://data.cdc.gov/resource/9mfq-cb36.json"

# Distribution of COVID-19 Deaths and Populations, by Jurisdiction, Age, and Race and Hispanic Origin NCHS
"https://data.cdc.gov/resource/jwta-jxbg.json"

# Conditions Contributing to COVID-19 Deaths, by State and Age, Provisional 2020-2022 NCHS
"https://data.cdc.gov/resource/hk9y-quqm.json"

# COVID-19 Case Surveillance Public Use Data with Geography Case Surveillance
"https://data.cdc.gov/resource/n8mc-b4w4.json"

# COVID-19 Case Surveillance Public Use DataCase Surveillance
"https://data.cdc.gov/resource/vbim-akqf.json"

# United States COVID-19 Community Levels by County Public Health Surveillance
"https://data.cdc.gov/resource/3nnm-4jni.json"

# COVID-19 Vaccinations in the United States,County Vaccinations
"https://data.cdc.gov/resource/8xkx-amqh.json"


In [1]:
#!/usr/bin/env python

# make sure to install these packages before running:
# pip install pandas
# pip install sodapy

import pandas as pd
from sodapy import Socrata

In [2]:
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cdc.gov", None)



In [3]:
# Open up our data from CDC.gov with my API_Token
with open("APP_TOKEN.txt",'r') as f:
    API_TOKEN=f.read()
client = Socrata("data.cdc.gov", API_TOKEN)

In [4]:
# Conditions Contributing to COVID-19 Deaths, by State and Age, Provisional 2020-2022 NCHS
results = client.get("hk9y-quqm", select='*', limit=1000000) # can use sql query like select='start_date,end_date,state,age_group,covid_19_deaths'
# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
results_df

Unnamed: 0,data_as_of,start_date,end_date,group,state,condition_group,condition,icd10_codes,age_group,covid_19_deaths,number_of_mentions,flag,year,month
0,2022-04-17T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1309,1368,,,
1,2022-04-17T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5448,5650,,,
2,2022-04-17T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14329,14900,,,
3,2022-04-17T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,35661,37022,,,
4,2022-04-17T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,78106,80880,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397435,2022-04-17T00:00:00.000,2021-12-01T00:00:00.000,2021-12-31T00:00:00.000,By Month,Puerto Rico,COVID-19,COVID-19,U071,All Ages,54,54,,2021,12
397436,2022-04-17T00:00:00.000,2022-01-01T00:00:00.000,2022-01-31T00:00:00.000,By Month,Puerto Rico,COVID-19,COVID-19,U071,All Ages,621,621,,2022,1
397437,2022-04-17T00:00:00.000,2022-02-01T00:00:00.000,2022-02-28T00:00:00.000,By Month,Puerto Rico,COVID-19,COVID-19,U071,All Ages,209,209,,2022,2
397438,2022-04-17T00:00:00.000,2022-03-01T00:00:00.000,2022-03-31T00:00:00.000,By Month,Puerto Rico,COVID-19,COVID-19,U071,All Ages,43,43,,2022,3


Well we can do SQL queries on the data which is nice.

How many rows are there anyways?... Looks like there are currently nearly 400,000

In [5]:
results_df.dtypes

data_as_of            object
start_date            object
end_date              object
group                 object
state                 object
condition_group       object
condition             object
icd10_codes           object
age_group             object
covid_19_deaths       object
number_of_mentions    object
flag                  object
year                  object
month                 object
dtype: object

In [6]:
results_df.infer_objects().dtypes

data_as_of            object
start_date            object
end_date              object
group                 object
state                 object
condition_group       object
condition             object
icd10_codes           object
age_group             object
covid_19_deaths       object
number_of_mentions    object
flag                  object
year                  object
month                 object
dtype: object

Need to fix this but we are getting there.

In [7]:
results_df['start_date']=pd.to_datetime(results_df['start_date']).dt.tz_localize(None)
results_df['end_date']=pd.to_datetime(results_df['end_date']).dt.tz_localize(None)
results_df['covid_19_deaths']=pd.to_numeric(results_df['covid_19_deaths'])

In [8]:
results_df.dtypes

data_as_of                    object
start_date            datetime64[ns]
end_date              datetime64[ns]
group                         object
state                         object
condition_group               object
condition                     object
icd10_codes                   object
age_group                     object
covid_19_deaths              float64
number_of_mentions            object
flag                          object
year                          object
month                         object
dtype: object

This was a success.  Wish that the .infer_objects() worked.

In [10]:
results_df['state'].unique()

array(['United States', 'Alabama', 'Alaska', 'Arizona', 'Arkansas',
       'California', 'Colorado', 'Connecticut', 'Delaware',
       'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
       'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
       'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
       'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
       'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
       'New York City', 'North Carolina', 'North Dakota', 'Ohio',
       'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin',
       'Wyoming', 'Puerto Rico'], dtype=object)

In [9]:
California_Deaths=results_df[results_df['state']=='California']
California_Deaths

Unnamed: 0,data_as_of,start_date,end_date,group,state,condition_group,condition,icd10_codes,age_group,covid_19_deaths,number_of_mentions,flag,year,month
1150,2022-04-17T00:00:00.000,2020-01-01,2022-04-16,By Total,California,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,155.0,162,,,
1151,2022-04-17T00:00:00.000,2020-01-01,2022-04-16,By Total,California,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,682.0,703,,,
1152,2022-04-17T00:00:00.000,2020-01-01,2022-04-16,By Total,California,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,1769.0,1806,,,
1153,2022-04-17T00:00:00.000,2020-01-01,2022-04-16,By Total,California,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,4442.0,4539,,,
1154,2022-04-17T00:00:00.000,2020-01-01,2022-04-16,By Total,California,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,9600.0,9777,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88315,2022-04-17T00:00:00.000,2021-12-01,2021-12-31,By Month,California,COVID-19,COVID-19,U071,All Ages,2325.0,2325,,2021,12
88316,2022-04-17T00:00:00.000,2022-01-01,2022-01-31,By Month,California,COVID-19,COVID-19,U071,All Ages,6619.0,6619,,2022,1
88317,2022-04-17T00:00:00.000,2022-02-01,2022-02-28,By Month,California,COVID-19,COVID-19,U071,All Ages,4691.0,4691,,2022,2
88318,2022-04-17T00:00:00.000,2022-03-01,2022-03-31,By Month,California,COVID-19,COVID-19,U071,All Ages,1087.0,1087,,2022,3


In [10]:
California_Deaths.groupby(['group','condition_group']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,covid_19_deaths
group,condition_group,Unnamed: 2_level_1
By Month,All other conditions and causes (residual),66058.0
By Month,Alzheimer disease,6776.0
By Month,COVID-19,188237.0
By Month,Circulatory diseases,158050.0
By Month,Diabetes,39705.0
By Month,"Intentional and unintentional injury, poisoning, and other adverse events",4476.0
By Month,Malignant neoplasms,8496.0
By Month,Obesity,10568.0
By Month,Renal failure,25111.0
By Month,Respiratory diseases,244165.0


In [11]:
import matplotlib.pyplot as plt

In [13]:
pd.set_option('display.max_rows', None)
California_Deaths.groupby(['condition','age_group']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,covid_19_deaths
condition,age_group,Unnamed: 2_level_1
Adult respiratory distress syndrome,0-24,105.0
Adult respiratory distress syndrome,25-34,531.0
Adult respiratory distress syndrome,35-44,1547.0
Adult respiratory distress syndrome,45-54,3774.0
Adult respiratory distress syndrome,55-64,6841.0
Adult respiratory distress syndrome,65-74,7712.0
Adult respiratory distress syndrome,75-84,4814.0
Adult respiratory distress syndrome,85+,2055.0
Adult respiratory distress syndrome,All Ages,27632.0
Adult respiratory distress syndrome,Not stated,0.0


This is going to be super useful.  I can compair COVID rates to other diseases.  And have a good spread of national average of risk.

In [14]:
pd.set_option('display.max_rows', 20)

In [42]:
# United States COVID-19 Cases and Deaths by State over Time Case Surveillance
results = client.get("9mfq-cb36", limit=50000)
results_df1 = pd.DataFrame.from_records(results)
results_df1

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,new_death,pnew_death,created_at,consent_cases,consent_deaths,conf_death,prob_death
0,2022-01-14T00:00:00.000,KS,621273,470516.0,150757,19414.0,6964,7162,21.0,4,2022-01-15T14:59:30.476,Agree,,,
1,2022-01-02T00:00:00.000,AS,11,,,0.0,0,0,0.0,0,2022-01-03T15:18:16.200,,,,
2,2020-08-22T00:00:00.000,AR,56199,,,547.0,0,674,11.0,0,2020-08-23T14:15:28.102,Not agree,Not agree,,
3,2021-12-06T00:00:00.000,MP,1104,1104.0,0,0.0,0,5,2.0,0,2021-12-08T00:00:00.000,Agree,Agree,5.0,0
4,2021-05-09T00:00:00.000,PW,0,,,0.0,0,0,0.0,0,2021-05-10T14:15:36.155,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49135,2020-05-28T00:00:00.000,IA,18585,,,228.0,0,506,14.0,0,2020-05-29T14:19:55.781,Not agree,Not agree,,
49136,2020-06-07T00:00:00.000,SD,5438,,,71.0,0,65,0.0,1,2020-06-08T14:55:08.000,,Agree,64.0,1
49137,2021-04-30T00:00:00.000,SD,122660,,,128.0,17,1967,5.0,1,2021-05-01T13:43:22.175,,Agree,1601.0,366
49138,2020-04-06T00:00:00.000,NM,686,,,62.0,0,12,0.0,0,2020-04-08T00:00:00.000,,Not agree,,


In [43]:
# Provisional COVID-19 Death Counts in the United States by County NCHS
results = client.get("kn79-hsxy", limit=50000)
results_df2 = pd.DataFrame.from_records(results)
results_df2

Unnamed: 0,data_as_of,start_week,end_week,state_name,county_name,county_fips_code,urban_rural_code,total_death,footnote,covid_death
0,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,AK,Aleutians East Borough,2013,Noncore,16,One or more data cells have counts between 1-9...,
1,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,AK,Anchorage Municipality,2020,Medium metro,5766,,671
2,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,AK,Bethel Census Area,2050,Noncore,264,,32
3,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,AK,Denali Borough,2068,Noncore,14,One or more data cells have counts between 1-9...,
4,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,AK,Dillingham Census Area,2070,Noncore,77,One or more data cells have counts between 1-9...,
...,...,...,...,...,...,...,...,...,...,...
3071,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,WY,Sweetwater County,56037,Micropolitan,788,,98
3072,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,WY,Teton County,56039,Micropolitan,211,,18
3073,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,WY,Uinta County,56041,Micropolitan,332,,20
3074,2022-04-20T00:00:00.000,2020-01-01T00:00:00.000,2022-04-16T00:00:00.000,WY,Washakie County,56043,Noncore,187,,28


In [None]:
# Provisional COVID-19 Death Counts in the United States by County NCHS
results = client.get("kn79-hsxy", limit=None)
results_df2 = pd.DataFrame.from_records(results)
results_df2

In [5]:
# COVID-19 infer_objectsnations in the United States,County Vaccinations
results = client.get("8xkx-amqh", limit=100000)
results_df3 = pd.DataFrame.from_records(results)
results_df3

Unnamed: 0,date,fips,mmwr_week,recip_county,recip_state,completeness_pct,administered_dose1_recip,administered_dose1_pop_pct,administered_dose1_recip_5plus,administered_dose1_recip_5pluspop_pct,...,booster_doses_vax_pct_ur_equity,booster_doses_12plusvax_pct_ur_equity,booster_doses_18plusvax_pct_ur_equity,booster_doses_65plusvax_pct_ur_equity,census2019,census2019_5pluspop,census2019_5to17pop,census2019_12pluspop,census2019_18pluspop,census2019_65pluspop
0,2022-04-20T00:00:00.000,53033,16,King County,WA,96,2000651,88.8,1999897,94.1,...,4,4,4,4,2252782,2125383,324217,1946808,1801166,304315
1,2022-04-20T00:00:00.000,55023,16,Crawford County,WI,96.8,10011,62.1,10011,65.2,...,8,8,8,8,16131,15362,2464,14166,12898,3867
2,2022-04-20T00:00:00.000,12131,16,Walton County,FL,98.7,43580,58.8,43570,62.3,...,2,2,3,2,74071,69930,10943,63887,58987,14977
3,2022-04-20T00:00:00.000,13001,16,Appling County,GA,89.5,7558,41.1,7558,43.9,...,6,6,6,6,18386,17209,3336,15376,13873,3257
4,2022-04-20T00:00:00.000,13035,16,Butts County,GA,89.5,9891,39.7,9891,42,...,2,2,2,2,24936,23565,3732,21552,19833,3942
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,2022-03-21T00:00:00.000,31073,12,Gosper County,NE,90.6,,,,,...,,,,,1990,1882,,1727,1562,487
99996,2022-03-21T00:00:00.000,31101,12,Keith County,NE,90.6,,,,,...,,,,,8034,7591,,6988,6386,2177
99997,2022-03-21T00:00:00.000,35005,12,Chaves County,NM,97,34365,53.2,34364,56.9,...,,,,,64615,60423,,53902,47845,10434
99998,2022-03-21T00:00:00.000,35028,12,Los Alamos County,NM,97,18479,95,18478,95,...,,,,,19369,18362,,16676,15103,3503


In [8]:
results_df3.columns

Index(['date', 'fips', 'mmwr_week', 'recip_county', 'recip_state',
       'completeness_pct', 'administered_dose1_recip',
       'administered_dose1_pop_pct', 'administered_dose1_recip_5plus',
       'administered_dose1_recip_5pluspop_pct',
       'administered_dose1_recip_12plus',
       'administered_dose1_recip_12pluspop_pct',
       'administered_dose1_recip_18plus',
       'administered_dose1_recip_18pluspop_pct',
       'administered_dose1_recip_65plus',
       'administered_dose1_recip_65pluspop_pct', 'series_complete_yes',
       'series_complete_pop_pct', 'series_complete_5plus',
       'series_complete_5pluspop_pct', 'series_complete_5to17',
       'series_complete_5to17pop_pct', 'series_complete_12plus',
       'series_complete_12pluspop_pct', 'series_complete_18plus',
       'series_complete_18pluspop_pct', 'series_complete_65plus',
       'series_complete_65pluspop_pct', 'booster_doses',
       'booster_doses_vax_pct', 'booster_doses_12plus',
       'booster_doses_12plus_v

Okay... More than a million records.

This is where we need some SQL type queries set before we do things.

In [10]:
results=[]
for n in range(17):
    print(f"Getting record {n+1}")
    results.extend(client.get("8xkx-amqh",select='date,recip_state,recip_county,administered_dose1_recip, administered_dose1_pop_pct, \
    booster_doses, booster_doses_vax_pct, svi_ctgy, series_complete_pop_pct_svi', limit=100000, offset=100000*n))

results_df=pd.DataFrame(results)

# Using get_all took over 40 minutes and sometimes timed out.  The for loop with chunking saved time as well as was less likely to crash
    #results = client.get_all("8xkx-amqh",select='date,recip_state,recip_county,administered_dose1_recip, administered_dose1_pop_pct, booster_doses, booster_doses_vax_pct, svi_ctgy, series_complete_pop_pct_svi',order='recip_state') 
results_df.to_csv('./data/vaccine_rate_by_county.csv')
results_df

Getting record 1
Getting record 2
Getting record 3
Getting record 4
Getting record 5
Getting record 6
Getting record 7
Getting record 8
Getting record 9
Getting record 10
Getting record 11
Getting record 12
Getting record 13
Getting record 14
Getting record 15
Getting record 16
Getting record 17


Unnamed: 0,date,recip_state,recip_county,administered_dose1_recip,administered_dose1_pop_pct,booster_doses,booster_doses_vax_pct,svi_ctgy,series_complete_pop_pct_svi
0,2022-04-20T00:00:00.000,WA,King County,2000651,88.8,1097154,60.1,B,8
1,2022-04-20T00:00:00.000,WI,Crawford County,10011,62.1,5521,58.6,B,6
2,2022-04-20T00:00:00.000,FL,Walton County,43580,58.8,14232,39.1,C,9
3,2022-04-20T00:00:00.000,GA,Appling County,7558,41.1,2197,33.3,D,13
4,2022-04-20T00:00:00.000,GA,Butts County,9891,39.7,3163,37.4,D,13
...,...,...,...,...,...,...,...,...,...
1621280,2020-12-13T00:00:00.000,TN,Washington County,0,0,,,B,
1621281,2020-12-13T00:00:00.000,NY,Bronx County,0,0,,,D,
1621282,2020-12-13T00:00:00.000,AR,Monroe County,0,0,,,D,
1621283,2020-12-13T00:00:00.000,OR,Tillamook County,0,0,,,C,


In [15]:
# Strangely enough I had to reimport the data from my save to get Panda's to recognize and parse the dtypes correctly
df= pd.read_csv('./data/vaccine_rate_by_county.csv',  index_col=[0], parse_dates=[1,2])
df.infer_objects().dtypes

date                           datetime64[ns]
recip_state                            object
recip_county                           object
administered_dose1_recip              float64
administered_dose1_pop_pct            float64
booster_doses                         float64
booster_doses_vax_pct                 float64
svi_ctgy                               object
series_complete_pop_pct_svi           float64
dtype: object

In [16]:
df[['recip_state','recip_county','administered_dose1_pop_pct','booster_doses_vax_pct']].groupby(['recip_state']).max()\
.sort_values(by='administered_dose1_pop_pct', ascending=False)

Unnamed: 0_level_0,recip_county,administered_dose1_pop_pct,booster_doses_vax_pct
recip_state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AZ,Yuma County,100.0,47.3
NM,Valencia County,100.0,65.2
MT,Yellowstone County,100.0,75.1
AK,Yukon-Koyukuk Census Area,99.9,62.2
GA,Worth County,99.9,95.0
...,...,...,...
PW,Unknown County,0.0,62.5
MP,Unknown County,0.0,48.1
MH,Unknown County,0.0,24.8
UNK,Unknown County,0.0,


In [17]:
df[['recip_state','recip_county','administered_dose1_pop_pct','booster_doses_vax_pct']].groupby(['recip_state']).median()\
.sort_values(by='administered_dose1_pop_pct', ascending=False)

Unnamed: 0_level_0,administered_dose1_pop_pct,booster_doses_vax_pct
recip_state,Unnamed: 1_level_1,Unnamed: 2_level_1
PR,66.0,47.50
CT,64.9,49.40
MA,62.3,48.30
ME,61.9,52.40
NJ,61.0,45.10
...,...,...
AS,0.0,15.30
NE,0.0,53.40
GU,0.0,31.95
TX,0.0,34.10


Very interesting in how different states are doing with vaccines and boosters... wait a minute.  The median county in Texas has 0% with the first dose but 34% with a booster....  That must be an error.

In [25]:
pd.set_option('display.max_rows', None)
df.groupby(['recip_state','recip_county']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,administered_dose1_recip,administered_dose1_pop_pct,booster_doses,booster_doses_vax_pct,svi_ctgy,series_complete_pop_pct_svi
recip_state,recip_county,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AK,Aleutians East Borough,2022-04-20,2842.0,85.2,912.0,36.3,C,12.0
AK,Aleutians West Census Area,2022-04-20,4206.0,74.7,1147.0,32.4,B,8.0
AK,Anchorage Municipality,2022-04-20,218263.0,75.8,83051.0,42.8,B,8.0
AK,Bethel Census Area,2022-04-20,13295.0,72.3,5747.0,46.5,D,16.0
AK,Bristol Bay Borough,2022-04-20,1097.0,99.9,327.0,37.4,B,8.0
AK,Denali Borough,2022-04-20,1319.0,62.9,547.0,48.1,A,4.0
AK,Dillingham Census Area,2022-04-20,2824.0,57.4,1124.0,43.7,D,16.0
AK,Fairbanks North Star Borough,2022-04-20,65946.0,68.1,19935.0,34.4,B,8.0
AK,Haines Borough,2022-04-20,1864.0,73.7,974.0,56.1,A,4.0
AK,Hoonah-Angoon Census Area,2022-04-20,1616.0,75.2,938.0,62.2,B,8.0


Some of the vaccine records appear to be missing or have some issue with them.

I found this alternative site which has corrected for these issues.

In [3]:
import pandas as pd

vaccine_tracking_county_curent = pd.read_csv('https://github.com/bansallab/vaccinetracking/blob/main/vacc_data/data_county_current.csv')
vaccine_tracking_time_series = pd.read_csv('https://github.com/bansallab/vaccinetracking/blob/main/vacc_data/data_county_timeseries.csv')
vaccine_tracking_county_curent.to_csv('./data/vaccine_data_county_current.csv')
vaccine_tracking_time_series.to_csv('./data/vaccine_data_county_current.csv')
# Citation:
# If you use this data, please cite this repository and the following article: Andrew Tiu, Zachary Susswein, Alexes Merritt, Shweta Bansal. 
# Characterizing the spatiotemporal heterogeneity of the COVID-19 vaccination landscape. medRxiv. 
# https://doi.org/10.1101/2021.10.04.21263345

  vaccine_tracking_county_curent = pd.read_csv('https://github.com/bansallab/vaccinetracking/blob/main/vacc_data/data_county_current.csv', sep='/t')


ParserError: Expected 1 fields in line 417, saw 2. Error could possibly be due to quotes being ignored when a multi-char delimiter is used.

In [24]:
pd.set_option('display.max_rows', 20)

In [8]:
a= results_df['state'].unique()
a.sort()
a

array(['AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
       'FSM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA',
       'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND',
       'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'NYC', 'OH', 'OK', 'OR', 'PA',
       'PR', 'PW', 'RI', 'RMI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI',
       'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

60 "states" which include New York City and Washington DC and Puerto Rico.  Don't know what PW and RMI are.  I think GU is Guam, but FSM?  Flying Speghetti Monster?

### Some helpful tables about the states

In [10]:
# Lets save this so we can use it later
results_df.to_csv("./data/CDC_Cases_and_deaths.csv", index=False)

In [15]:
State_Abriv=pd.read_clipboard()
State_Abriv

Unnamed: 0,State,Standard,Postal
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA
5,Canal Zone,C.Z.,CZ
6,Colorado,Colo.,CO
7,Connecticut,Conn.,CT
8,Delaware,Del.,DE
9,District of Columbia,D.C.,DC


In [16]:
State_Abriv.to_csv("./data/state_abriv.csv")

In [24]:
results_df['state'].map(dict(zip(State_Abriv['Postal'],State_Abriv['State']))).unique()

array(['Kansas', nan, 'Arkansas', 'Hawaii', 'Alaska', 'Oklahoma',
       'Georgia', 'Florida', 'Texas', 'Utah', 'West Virginia',
       'Massachusetts', 'Colorado', 'Puerto Rico', 'Pennsylvania', 'Iowa',
       'Ohio', 'Tennessee', 'New Mexico', 'New York', 'South Dakota',
       'Washington', 'North Dakota', 'Maryland', 'Vermont', 'Alabama',
       'Illinois', 'Wisconsin', 'Guam', 'Mississippi', 'Nevada', 'Maine',
       'Michigan', 'Connecticut', 'Delaware', 'Nebraska', 'Indiana',
       'Missouri', 'Idaho', 'California', 'Virgin Islands',
       'North Carolina', 'Montana', 'New Hampshire', 'Louisiana',
       'District of Columbia', 'New Jersey', 'Wyoming', 'Kentucky',
       'Minnesota', 'Oregon', 'Rhode Island', 'South Carolina',
       'Virginia', 'Arizona'], dtype=object)

In [25]:
# this works but there is an issue with the entries which are not part of the dictionary.  Like NYC and FSM and RMI.

In [None]:
# Capturing the neighboring states may help with prediction.  Neighbors may have an effect on one another.  
# This was easier to copy out of the web and make a spreadsheet then copy here.
# Don't know how to do this by county.  Maybe longitude and latitude?

In [13]:
States_Neighbors = pd.read_clipboard()
States_Neighbors

Unnamed: 0,State Name,Bordering States
0,Alabama,"Florida, Georgia, Mississippi, Tennessee"
1,Alaska,
2,American Samoa,
3,Arizona,"California, Colorado, Nevada, New Mexico, Utah"
4,Arkansas,"Louisiana, Mississippi, Missouri, Oklahoma, Te..."
5,California,"Arizona, Nevada, Oregon"
6,Colorado,"Arizona, Kansas, Nebraska, New Mexico, Oklahom..."
7,Connecticut,"Massachusetts, New York, Rhode Island"
8,Delaware,"Maryland, New Jersey, Pennsylvania"
9,District of Columbia,"Maryland, Virginia"


In [14]:
States_Neighbors.to_csv("./data/state_neighbors.csv")

### Opendatasoft data of county geoospatial data

In [None]:
# Opendatasoft has an amazing set of data for the county boarders of the US
https://public.opendatasoft.com/explore/embed/dataset/us-county-boundaries/map/?disjunctive.statefp&disjunctive.countyfp&disjunctive.name&disjunctive.namelsad&disjunctive.stusab&disjunctive.state_name&location=5,40.83044,-92.98828&basemap=jawg.light

So I only coppied the first 80 rows from that.  Doesn't work sadly that way.

In [3]:
boundries =pd.read_csv("./data/boarders/us-county-boundaries.csv", sep=';')
boundries

Unnamed: 0,Geo Point,Geo Shape,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,NAMELSAD,STUSAB,LSAD,...,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,STATE_NAME,COUNTYFP NOZERO
0,"40.1763796295,-98.0471849897","{""coordinates"": [[[-98.273667, 40.089399], [-9...",31,129,835886,31129,Nuckolls,Nuckolls County,NE,6,...,,,,A,1489645188,1718484,40.176492,-98.046842,Nebraska,129
1,"40.7715207081,-84.1057794323","{""coordinates"": [[[-84.397189, 40.786584], [-8...",39,3,1074015,39003,Allen,Allen County,OH,6,...,338.0,30620.0,,A,1042479889,11259523,40.771627,-84.106103,Ohio,3
2,"41.9103521337,-81.250939242","{""coordinates"": [[[-81.488643, 41.631522], [-8...",39,85,1074055,39085,Lake,Lake County,OH,6,...,184.0,17460.0,,A,593807218,1942301625,41.924116,-81.392643,Ohio,85
3,"37.5023005833,-94.3471107747","{""coordinates"": [[[-94.617088, 37.477776], [-9...",29,11,758460,29011,Barton,Barton County,MO,6,...,,,,A,1533351029,12152201,37.500799,-94.344089,Missouri,11
4,"45.151770859,-86.0384960523","{""coordinates"": [[[-85.561745, 44.952258], [-8...",26,89,1622987,26089,Leelanau,Leelanau County,MI,6,...,,45900.0,,A,899241895,5659105307,45.146182,-86.051574,Michigan,89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3228,"31.8133256702,-95.6525471812","{""coordinates"": [[[-95.677533, 32.051631], [-9...",48,1,1383786,48001,Anderson,Anderson County,TX,6,...,,37300.0,,A,2752201306,39808595,31.841261,-95.661727,Texas,1
3229,"38.0267183008,-77.3470467441","{""coordinates"": [[[-77.551611, 38.073609], [-7...",51,33,1480107,51033,Caroline,Caroline County,VA,6,...,,,,A,1366502847,24928486,38.030319,-77.352348,Virginia,33
3230,"45.1719349421,-96.7676947106","{""coordinates"": [[[-97.226066, 45.209924], [-9...",46,51,1265782,46051,Grant,Grant County,SD,6,...,,,,A,1764937242,15765681,45.172637,-96.772261,South Dakota,51
3231,"40.31072723,-79.4669607014","{""coordinates"": [[[-79.61818, 40.644766], [-79...",42,129,1209191,42129,Westmoreland,Westmoreland County,PA,6,...,430.0,38300.0,,A,2662612862,20837789,40.311068,-79.466688,Pennsylvania,129


In [None]:
import geopandas

gdf = geopandas.read_file("./data/boarders/georef-united-states-of-america-county.geojson")

gdf

CRSError: Invalid projection: epsg:4326: (Internal Proj Error: proj_create: SQLite error on SELECT name, type, coordinate_system_auth_name, coordinate_system_code, datum_auth_name, datum_code, area_of_use_auth_name, area_of_use_code, text_definition, deprecated FROM geodetic_crs WHERE auth_name = ? AND code = ?: no such column: area_of_use_auth_name)