In [1]:
#import Pandas, Numpy, and MatPlotLib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [2]:
#import requests to call the COVID data API
#import time to pause between API calls so the process does not freeze
#import pprint to pretty-print the data.
import requests
import time
from pprint import pprint


In [3]:
##SECTION 1 ##
###### DO NOT RUN THESE CELLS UNLESS YOU WANT TO RE-CALL THE API. #####
###### SKIP TO THE 'START HERE' CELL #######

#The COVID19 dataset has more than 4.4 million rows (4,481,062 when this notebook was last run)
#To collect the data efficiently, without losing system performance
    #I will call the data in batches (paging) using a limit and offset
    #This will require multiple API calls

In [23]:
#Download the CDC COVID data in pages (batches) instead of one huge request.
    #each call asks for PAGE_SIZE rows, starting at a given offset
    #so no single response has to carry the whole dataset
COVID_API_URL = 'https://data.cdc.gov/resource/vbim-akqf.json'
PAGE_SIZE = 50000       #rows requested per API call
PAUSE_SECONDS = 1       #pause between consecutive API calls
REQUEST_TIMEOUT = 120   #seconds to wait on one call before giving up


def fetch_covid_rows(url=COVID_API_URL, page_size=PAGE_SIZE, pause=PAUSE_SECONDS):
    """Return every row of the dataset as a list of dictionaries.

    Rows are requested page by page using the API's $limit and $offset
    parameters, ordered by CDC Report Date.

    Parameters:
        url: address of the JSON endpoint to read.
        page_size: number of rows to request per call (must be positive).
        pause: seconds to sleep between two consecutive calls.

    Returns:
        A list holding one dictionary per row, in the order received.

    Raises:
        ValueError: if page_size is not positive.
        requests.HTTPError: if the API answers with an error status.
    """
    if page_size <= 0:
        #a non-positive page size would never reach the stop condition below
        raise ValueError('page_size must be a positive number of rows')

    rows = []
    offset = 0
    while True:
        response = requests.get(
            url,
            params={
                #:id breaks ties between rows that share a report date,
                #so the row order stays the same from one page to the next
                '$order': 'cdc_report_dt,:id',
                '$limit': page_size,
                '$offset': offset,
            },
            timeout=REQUEST_TIMEOUT,
        )
        #stop on an HTTP error instead of parsing an error message as data
        response.raise_for_status()

        page = response.json()
        rows.extend(page)

        #a page shorter than requested means the end of the data was reached
        if len(page) < page_size:
            return rows

        offset += page_size
        #pause only when another call is about to be made
        time.sleep(pause)


#This is the full API data: a list with one dictionary per reported case
covid_json = fetch_covid_rows()

In [24]:
#Preview the data: pretty-print only the first two records.
pprint(covid_json[0:2])

[{'age_group': '60 - 69 Years',
  'cdc_report_dt': '2020-01-01T00:00:00.000',
  'current_status': 'Probable Case',
  'death_yn': 'No',
  'hosp_yn': 'No',
  'icu_yn': 'Missing',
  'medcond_yn': 'Yes',
  'onset_dt': '2020-01-01T00:00:00.000',
  'race_ethnicity_combined': 'White, Non-Hispanic',
  'sex': 'Male'},
 {'age_group': '40 - 49 Years',
  'cdc_report_dt': '2020-01-01T00:00:00.000',
  'current_status': 'Laboratory-confirmed case',
  'death_yn': 'No',
  'hosp_yn': 'Yes',
  'icu_yn': 'Yes',
  'medcond_yn': 'Missing',
  'onset_dt': '2020-01-01T00:00:00.000',
  'pos_spec_dt': '2020-03-15T00:00:00.000',
  'race_ethnicity_combined': 'Black, Non-Hispanic',
  'sex': 'Male'}]


In [None]:
#I want to compare the following COVID19 data points:
    #case report date = 'cdc_report_dt' - number of reported cases (confirmed and probable)
    #death status = 'death_yn' - number of deaths
    #hospitalization status = 'hosp_yn' - number of people hospitalized

#these are the key stats reported re: COVID19 counts in the news
#So these are the data columns I want to create a Pandas DataFrame with.

#Note: ideally I would use the positive result confirmation date ('pos_spec_dt'), but it is missing from some cases

In [25]:
#Check what kind of container the API data is before looping over it.
#It is a list; as the preview above shows, each item in it is a dictionary.

print(type(covid_json))

<class 'list'>


In [26]:
#Go through the list and collect each desired datapoint, by its key, into its own list.
#Records do not all carry the same keys (in the preview above, only the second
#record has 'pos_spec_dt'), so .get() is used instead of data['key']:
    #a record that lacks a field contributes None to the list
    #instead of stopping the whole run with a KeyError

case_date = [record.get('cdc_report_dt') for record in covid_json]
death = [record.get('death_yn') for record in covid_json]
hospital = [record.get('hosp_yn') for record in covid_json]


In [27]:
#Collect the three lists in a dictionary, keyed by column name,
#so that every datapoint column carries a label
covid_values = dict(cdc_report_dt=case_date, death_yn=death, hosp_yn=hospital)

#Build the DataFrame; the dictionary keys become the column headers
covid_df = pd.DataFrame(data=covid_values)


In [None]:
########## START HERE  ############### 
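#Only re-run the cells above this one if you want to update (add to) the COVID data.
#The CDC continually adds to this dataset, and re-running the API call refreshes it.
#Note: covid_df is created only by the cells above (nothing shown here saves it to disk),
    #so after a kernel restart those cells must be run again before continuing.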

In [28]:
#Summarise the COVID DataFrame: number of rows, column names, column dtypes, and memory usage

covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4481062 entries, 0 to 4481061
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   cdc_report_dt  object
 1   death_yn       object
 2   hosp_yn        object
dtypes: object(3)
memory usage: 102.6+ MB


In [29]:
##SECTION 2##

#Here is a list of next steps:
#1. convert the COVID dates into standard date format (they are currently text such as '2020-01-01T00:00:00.000')
#2. group the covid data by days, using cdc_report_dt (pos_spec_dt was not loaded into covid_df)
    #for each day, I will have total number of cases, deaths, hospitalizations

#Preview the first rows of the COVID DataFrame
covid_df.head()

Unnamed: 0,cdc_report_dt,death_yn,hosp_yn
0,2020-01-01T00:00:00.000,No,No
1,2020-01-01T00:00:00.000,No,Yes
2,2020-01-01T00:00:00.000,No,No
3,2020-01-01T00:00:00.000,Unknown,Unknown
4,2020-01-01T00:00:00.000,No,No
