# CDC Databases for COVID

In [None]:
# Reference list of data.cdc.gov JSON endpoints for COVID-19 datasets.
# Each URL is a bare string expression kept only for reference; nothing in
# this cell is assigned to a variable. The comment above each URL gives the
# dataset title; the trailing words (e.g. "NCHS", "Case Surveillance") look
# like the catalog category of the dataset -- TODO confirm.

# Provisional COVID-19 Death Counts in the United States by County NCHS
"https://data.cdc.gov/resource/kn79-hsxy.json"

# United States COVID-19 Cases and Deaths by State over Time Case Surveillance
"https://data.cdc.gov/resource/9mfq-cb36.json"

# Distribution of COVID-19 Deaths and Populations, by Jurisdiction, Age, and Race and Hispanic Origin NCHS
"https://data.cdc.gov/resource/jwta-jxbg.json"

# Conditions Contributing to COVID-19 Deaths, by State and Age, Provisional 2020-2022 NCHS
"https://data.cdc.gov/resource/hk9y-quqm.json"

# COVID-19 Case Surveillance Public Use Data with Geography Case Surveillance
"https://data.cdc.gov/resource/n8mc-b4w4.json"

# COVID-19 Case Surveillance Public Use Data Case Surveillance
"https://data.cdc.gov/resource/vbim-akqf.json"

# United States COVID-19 Community Levels by County Public Health Surveillance
"https://data.cdc.gov/resource/3nnm-4jni.json"

# COVID-19 Vaccinations in the United States,County Vaccinations
"https://data.cdc.gov/resource/8xkx-amqh.json"


In [1]:
#!/usr/bin/env python

# make sure to install these packages before running:
# pip install pandas
# pip install sodapy

import pandas as pd
from sodapy import Socrata

In [2]:
# Anonymous access is sufficient for public datasets: the application token
# is explicitly None and neither a username nor a password is supplied.
client = Socrata("data.cdc.gov", app_token=None)



In [3]:
# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cdc.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# Request up to 50000 rows (limit=50000) of dataset hk9y-quqm, the
# "Conditions Contributing to COVID-19 Deaths" dataset. The JSON returned by
# the API is converted by sodapy to a Python list of dictionaries.
results = client.get("hk9y-quqm", limit=50000)


In [4]:
# Build a pandas DataFrame from the list of record dictionaries fetched
# above, then echo it as the cell output.
results_df = pd.DataFrame.from_records(data=results)
results_df

Unnamed: 0,data_as_of,start_date,end_date,group,state,condition_group,condition,icd10_codes,age_group,covid_19_deaths,number_of_mentions,flag,year,month
0,2022-03-27T00:00:00.000,2020-01-01T00:00:00.000,2022-03-26T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,0-24,1284,1341,,,
1,2022-03-27T00:00:00.000,2020-01-01T00:00:00.000,2022-03-26T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,25-34,5390,5591,,,
2,2022-03-27T00:00:00.000,2020-01-01T00:00:00.000,2022-03-26T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,35-44,14166,14726,,,
3,2022-03-27T00:00:00.000,2020-01-01T00:00:00.000,2022-03-26T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,45-54,35320,36658,,,
4,2022-03-27T00:00:00.000,2020-01-01T00:00:00.000,2022-03-26T00:00:00.000,By Total,United States,Respiratory diseases,Influenza and pneumonia,J09-J18,55-64,77221,79932,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2022-03-27T00:00:00.000,2021-07-01T00:00:00.000,2021-07-31T00:00:00.000,By Month,United States,Respiratory diseases,Chronic lower respiratory diseases,J40-J47,25-34,10,10,,2021,7
49996,2022-03-27T00:00:00.000,2021-08-01T00:00:00.000,2021-08-31T00:00:00.000,By Month,United States,Respiratory diseases,Chronic lower respiratory diseases,J40-J47,25-34,40,42,,2021,8
49997,2022-03-27T00:00:00.000,2021-09-01T00:00:00.000,2021-09-30T00:00:00.000,By Month,United States,Respiratory diseases,Chronic lower respiratory diseases,J40-J47,25-34,42,42,,2021,9
49998,2022-03-27T00:00:00.000,2021-10-01T00:00:00.000,2021-10-31T00:00:00.000,By Month,United States,Respiratory diseases,Chronic lower respiratory diseases,J40-J47,25-34,21,22,,2021,10


This seems to work, but to get all the data I need an API token. How many rows are there, anyway?

In [5]:
# Distinct values of the 'state' column, sorted in place, then displayed.
a = results_df["state"].unique()
a.sort()
a

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'New York City',
       'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
       'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina',
       'South Dakota', 'Tennessee', 'Texas', 'United States', 'Utah',
       'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin',
       'Wyoming'], dtype=object)

In [None]:
# Or passed as a URL parameter. The URL is written as a string literal (like
# the endpoint list at the top of the notebook) so that this cell is valid
# Python and does not break a full re-run. Note that this illustrative URL
# targets data.seattle.gov, not data.cdc.gov; APP_TOKEN is a placeholder.
"https://data.seattle.gov/resource/kzjm-xkqj.json?$$app_token=APP_TOKEN"


In [8]:
# Read the Socrata application token from a local file so that it is never
# hard-coded in the notebook. strip() removes the trailing newline/whitespace
# that editors commonly leave at the end of a file; without it that newline
# would be passed along as part of the token.
with open("APP_TOKEN.txt", 'r') as f:
    API_TOKEN = f.read().strip()

# Client that sends the application token, then fetch up to 50000 rows
# (limit=50000) of dataset 9mfq-cb36.
client = Socrata("data.cdc.gov", API_TOKEN)
results = client.get("9mfq-cb36", limit=50000)

In [9]:
# Turn the list of record dictionaries from the latest request into a pandas
# DataFrame and show it as the cell output.
results_df = pd.DataFrame.from_records(data=results)
results_df

Unnamed: 0,submission_date,state,tot_cases,conf_cases,prob_cases,new_case,pnew_case,tot_death,new_death,pnew_death,created_at,consent_cases,consent_deaths,conf_death,prob_death
0,2022-01-14T00:00:00.000,KS,621273,470516.0,150757,19414.0,6964,7162,21.0,4,2022-01-15T14:59:30.476,Agree,,,
1,2022-01-02T00:00:00.000,AS,11,,,0.0,0,0,0.0,0,2022-01-03T15:18:16.200,,,,
2,2020-08-22T00:00:00.000,AR,56199,,,547.0,0,674,11.0,0,2020-08-23T14:15:28.102,Not agree,Not agree,,
3,2020-07-17T00:00:00.000,MP,37,37.0,0,1.0,0,2,0.0,0,2020-07-19T00:00:00.000,Agree,Agree,2.0,0
4,2020-08-12T00:00:00.000,AS,0,,,0.0,0,0,0.0,0,2020-08-13T14:12:28.259,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47935,2020-05-28T00:00:00.000,IA,18585,,,228.0,0,506,14.0,0,2020-05-29T14:19:55.781,Not agree,Not agree,,
47936,2020-06-07T00:00:00.000,SD,5438,,,71.0,0,65,0.0,1,2020-06-08T14:55:08.000,,Agree,64.0,1
47937,2021-04-30T00:00:00.000,SD,122660,,,128.0,17,1967,5.0,1,2021-05-01T13:43:22.175,,Agree,1601.0,366
47938,2020-04-06T00:00:00.000,NM,686,,,62.0,0,12,0.0,0,2020-04-08T00:00:00.000,,Not agree,,


In [10]:
# Distinct jurisdiction codes in the 'state' column, sorted in place, then
# displayed.
a = results_df["state"].unique()
a.sort()
a

array(['AK', 'AL', 'AR', 'AS', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
       'FSM', 'GA', 'GU', 'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA',
       'MA', 'MD', 'ME', 'MI', 'MN', 'MO', 'MP', 'MS', 'MT', 'NC', 'ND',
       'NE', 'NH', 'NJ', 'NM', 'NV', 'NY', 'NYC', 'OH', 'OK', 'OR', 'PA',
       'PR', 'PW', 'RI', 'RMI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VA', 'VI',
       'VT', 'WA', 'WI', 'WV', 'WY'], dtype=object)

In [None]:
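# 60 "states", which include New York City, Washington DC, and Puerto Rico.  Don't know what PW and RMI are.  I think GU is Guam, but FSM?  Flying Spaghetti Monster?
# (Most likely PW = Palau, RMI = Republic of the Marshall Islands, and FSM = Federated States of Micronesia.)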