In [2]:
#import Pandas, Numpy, and MatPlotLib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [3]:
#import requests in order to call the COVID data API data 
#import time in order to pause between API calls and not freeze in the process
#import pprint in order to pretty-print the data.
import requests
import time
from pprint import pprint


# SECTION 1 #
## DO NOT RUN THESE CELLS UNLESS YOU WANT TO RE-CALL THE API. ##
## SKIP TO THE 'START HERE' CELL ##

The COVID19 Data has more than 4,480,000 rows

In [5]:
#This is a variable which calls the API link from the COVID data website
    #it orders the data by CDC Report Date
    #and it has a limit larger than the max number of rows in the CDC data
r = requests.get('https://data.cdc.gov/resource/vbim-akqf.json?$order=cdc_report_dt&$limit=5500000')

#This is the API call with a 1 second pause between requests
covid_json = r.json()
time.sleep(1)

In [6]:
#pretty-print the first 2 entries to preview the data.
pprint(covid_json[:2])

[{'age_group': '60 - 69 Years',
  'cdc_report_dt': '2020-01-01T00:00:00.000',
  'current_status': 'Probable Case',
  'death_yn': 'No',
  'hosp_yn': 'No',
  'icu_yn': 'Missing',
  'medcond_yn': 'Yes',
  'onset_dt': '2020-01-01T00:00:00.000',
  'race_ethnicity_combined': 'White, Non-Hispanic',
  'sex': 'Male'},
 {'age_group': '40 - 49 Years',
  'cdc_report_dt': '2020-01-01T00:00:00.000',
  'current_status': 'Laboratory-confirmed case',
  'death_yn': 'No',
  'hosp_yn': 'Yes',
  'icu_yn': 'Yes',
  'medcond_yn': 'Missing',
  'onset_dt': '2020-01-01T00:00:00.000',
  'pos_spec_dt': '2020-03-15T00:00:00.000',
  'race_ethnicity_combined': 'Black, Non-Hispanic',
  'sex': 'Male'}]


**I want to compare the following COVID19 data points:** 

    **case report date** = 'cdc_rpt_dt' - number of confirmed cases

    **death status** = 'death_yn' - number of deaths
    
    **hospitalization status** = 'hosp_yn' - number of people hospitalized
    
These are the key stats reported re: COVID19 counts in the news.
So these are the data columns I want to create a Pandas DataFrame with.

* Note: technically, I would pull positive result confirmation date, but this is missing from some cases*

In [8]:
#Right now, the API data is a list with nested dictionaries

print(type(covid_json))

<class 'list'>


In [9]:
#Loop through the list and call each desired datapoint, by its key, into its own list.
#Here is a sample

case_date, death, hospital = [],[],[]

for data in covid_json:
    case_date.append(data['cdc_report_dt'])
    death.append(data['death_yn'])
    hospital.append(data['hosp_yn'])


In [10]:
#Now that each datapoint is separated, add them back into a dictionary 
#This will make sure that each datapoint column is labelled
covid_values = {'cdc_report_dt': case_date, 'death_yn': death, 'hosp_yn': hospital}

#Convert the dictionary into a DataFrame
covid_df = pd.DataFrame(covid_values)


In [11]:
########## START HERE  ############### 
#Only run the cells above this one if you want to update (add to) the COVID data.
#This dataset is continually added to by the CDC and the API call will refresh it.

In [12]:
#Check the COVID DataFrame with a summary

covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4481062 entries, 0 to 4481061
Data columns (total 3 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   cdc_report_dt  object
 1   death_yn       object
 2   hosp_yn        object
dtypes: object(3)
memory usage: 102.6+ MB


In [13]:
##SECTION 2##

#Here is a list of next steps:
#1. convert the COVID dates into standard date format
#2. group the covid data by days (cdc_report_dt)
    #for each day, I will have total number of cases, deaths, hospitalizations

#Preview the COVID DataFrame
covid_df.head()

Unnamed: 0,cdc_report_dt,death_yn,hosp_yn
0,2020-01-01T00:00:00.000,No,No
1,2020-01-01T00:00:00.000,No,Yes
2,2020-01-01T00:00:00.000,No,No
3,2020-01-01T00:00:00.000,Unknown,Unknown
4,2020-01-01T00:00:00.000,No,No


In [43]:
#Convert the COVID dates into standard date format
#Import the datetime module and use it to extract date details
import datetime

rpt_date = pd.to_datetime(covid_df["cdc_report_dt"]) 
report_date = pd.DataFrame(rpt_date)

In [46]:
report_date.head()

Unnamed: 0,cdc_report_dt
0,2020-01-01
1,2020-01-01
2,2020-01-01
3,2020-01-01
4,2020-01-01


In [101]:
#Count the number of cases each day by adding a column with '1' for each case
    #The numpy 'where' function will add the column based on the condition I define
    #np.where(condition, value if condition is true, value if condition is false)
    #my code asks for '1' to be listed in a new column if the report date is not blank (list '0' if blank)
    
report_date['case_reported'] = np.where(report_date['cdc_report_dt']!= '[]', 1, 0)

#here is a preview of the new DataFrame
report_date.head()

Unnamed: 0,cdc_report_dt,case_reported
0,2020-01-01,1
1,2020-01-01,1
2,2020-01-01,1
3,2020-01-01,1
4,2020-01-01,1


In [152]:
#Re-Create the COVID DataFrame with the new date format

#Create a separate dataframe with each column from the original 
deceased = pd.DataFrame(covid_df["death_yn"])
hospital = pd.DataFrame(covid_df["hosp_yn"])

#Concatenate the report_date, deceased, and hospital DataFrames
covid_data = pd.concat([report_date, deceased, hospital], axis=1)
covid_data.sort_values('death_yn')

Unnamed: 0,cdc_report_dt,case_reported,death_yn,hosp_yn
2240530,2020-07-04,1,Missing,Missing
1575898,2020-06-12,1,Missing,Yes
1575897,2020-06-12,1,Missing,Missing
1575896,2020-06-12,1,Missing,Missing
1575894,2020-06-12,1,Missing,Missing
...,...,...,...,...
4079364,2020-08-29,1,Yes,Yes
4079363,2020-08-29,1,Yes,No
234769,2020-04-05,1,Yes,Yes
234783,2020-04-05,1,Yes,No


In [184]:
#Count the number of deaths each day by adding a column with '1' for each death or '0' if no death 
    #death = Yes ('1'), No ('0'), Missing ('0'), or Unknown ('0')
    #This code is using a for loop to create the column

num_deaths = []

for value in covid_data["death_yn"]:
    if value == 'Yes':
        num_deaths.append(1)
    else: 
        num_deaths.append(0)

covid_data["deaths"] = num_deaths
#print(covid_data)
covid_data_r = pd.DataFrame(covid_data)
covid_data_r.sort_values('deaths')

Unnamed: 0,cdc_report_dt,case_reported,death_yn,hosp_yn,deaths,hospitalizations
0,2020-01-01,1,No,No,0,0
2978322,2020-07-26,1,No,No,0,0
2978323,2020-07-26,1,Missing,Missing,0,0
2978324,2020-07-26,1,No,No,0,0
2978325,2020-07-26,1,Missing,Missing,0,0
...,...,...,...,...,...,...
513743,2020-04-20,1,Yes,Yes,1,1
513744,2020-04-20,1,Yes,No,1,0
513745,2020-04-20,1,Yes,Yes,1,1
513747,2020-04-20,1,Yes,Yes,1,1


In [185]:
#Count the number of hospitalizations each day by adding a column for each hosp_yn value
    #hosptialization = Yes ('1'), No ('0'), Missing ('0'), or Unknown ('0')
    #This code is using a for loop to create the column

num_hospital = []

for value in covid_data_r["hosp_yn"]:
    if value == 'Yes':
        num_hospital.append(1)
    else: 
        num_hospital.append(0)

covid_data_r["hospitalizations"] = num_hospital
#print(covid_data)
covid_data_r2 = pd.DataFrame(covid_data_r)
covid_data_r2.sort_values('hospitalizations')

Unnamed: 0,cdc_report_dt,case_reported,death_yn,hosp_yn,deaths,hospitalizations
0,2020-01-01,1,No,No,0,0
2951325,2020-07-25,1,Missing,No,0,0
2951326,2020-07-25,1,Missing,Missing,0,0
2951327,2020-07-25,1,Missing,Missing,0,0
2951328,2020-07-25,1,Missing,Missing,0,0
...,...,...,...,...,...,...
3936000,2020-08-24,1,Missing,Yes,0,1
2601097,2020-07-15,1,No,Yes,0,1
3936005,2020-08-24,1,Missing,Yes,0,1
2601104,2020-07-15,1,No,Yes,0,1


In [190]:
#Create a new DataFrame which only shows counts of each datapoint
    #do this by filtering the previous DataFrame by the new counts columns
    
covid_counts = covid_data_r2[["cdc_report_dt", "case_reported", "deaths", "hospitalizations"]]
covid_counts.tail()

Unnamed: 0,cdc_report_dt,case_reported,deaths,hospitalizations
4481057,2020-09-15,1,0,0
4481058,2020-09-15,1,0,0
4481059,2020-09-15,1,0,0
4481060,2020-09-15,1,0,0
4481061,2020-09-15,1,0,0


In [192]:
#Group the covid data by days with a total count for each value (case reported, deaths, hospitalizations)

#Group the COVID data by report date, give it a new variable name
covid_by_day = covid_counts.groupby(['cdc_report_dt'])

#Create total count of for each value, by day
daily_cases = covid_by_day['case_reported'].sum()
daily_deaths = covid_by_day['deaths'].sum()
daily_hospitalizations = covid_by_day['hospitalizations'].sum()

#Create a new DataFrame with these values
covid_daily_counts = pd.concat([daily_cases, daily_deaths, daily_hospitalizations], axis=1)
covid_daily_counts.columns = ["Cases", "Deaths", "Hospitalizations"]
covid_daily_counts

Unnamed: 0_level_0,Cases,Deaths,Hospitalizations
cdc_report_dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01,12,0,1
2020-01-02,2,0,0
2020-01-03,2,0,0
2020-01-05,1,0,0
2020-01-08,1,0,0
...,...,...,...
2020-09-11,20163,188,812
2020-09-12,18277,100,574
2020-09-13,15436,154,560
2020-09-14,25893,163,1031


In [194]:
#Now, there are only 255 rows of data and this can be saved as a .csv file
covid_daily_counts.to_csv('covid_data.csv') 