In [104]:
#import Pandas, Numpy, and MatPlotLib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


In [108]:
#import requests in order to call the COVID data API data 
#import time in order to pause between API calls and not freeze in the process
#import pprint in order to pretty-print the data.
import requests
import time
from pprint import pprint


In [109]:
#define a variable that makes the API requests, including the API link from the COVID data website.
r = requests.get('https://data.cdc.gov/resource/vbim-akqf.json')

#and then make the API call with a one second pause in between.
covid_json = r.json()
time.sleep(1)

In [129]:
#pretty-print the first 2 entries to preview the data.
pprint(covid_json[:2])

[{'age_group': '0 - 9 Years',
  'cdc_report_dt': '2020-03-22T00:00:00.000',
  'current_status': 'Laboratory-confirmed case',
  'death_yn': 'No',
  'hosp_yn': 'Yes',
  'icu_yn': 'Yes',
  'medcond_yn': 'Yes',
  'onset_dt': '2020-03-11T00:00:00.000',
  'pos_spec_dt': '2020-03-15T00:00:00.000',
  'race_ethnicity_combined': 'Multiple/Other, Non-Hispanic',
  'sex': 'Female'},
 {'age_group': '0 - 9 Years',
  'cdc_report_dt': '2020-03-23T00:00:00.000',
  'current_status': 'Laboratory-confirmed case',
  'death_yn': 'Unknown',
  'hosp_yn': 'Unknown',
  'icu_yn': 'Unknown',
  'medcond_yn': 'Unknown',
  'pos_spec_dt': '2020-03-15T00:00:00.000',
  'race_ethnicity_combined': 'Unknown',
  'sex': 'Female'}]


In [None]:
#I want to compare the following COVID19 data points:
    #test result confirmation date = 'pos_spec_dt' - number of confirmed cases
    #death status = 'death_yn' - number of deaths
    #hospitalization status = 'hosp_yn' - number of people hospitalized

#I'm also pulling in the date the case was reported to the cdc = 'cdc_rpt_dt' for reference 
#to be sure the dataset is complete and not missing rows (visually compare with preview on website)

#these are the key stats reported re: COVID19 counts in the news

#So these are the data columns I want to create a Pandas DataFrame with.

In [56]:
#Right now, the API data is a list with nested dictionaries

print(type(covid_json))

<class 'list'>


In [146]:
#Loop through the list and call each desired datapoint, by its key, into its own list.
#Here is a sample

cdc_received, case_date, death, hospital = [],[],[],[]

for data in covid_json:
    cdc_received.append(data['cdc_report_dt'])
    case_date.append(data['pos_spec_dt'])
    death.append(data['death_yn'])
    hospital.append(data['hosp_yn'])


In [147]:
#Now that each datapoint is separated, add them back into a dictionary 
#This will make sure that each datapoint column is labelled
covid_values = {'cdc_rpt_dt': cdc_received, 'pos_spec_dt': case_date, 'death_yn': death, 'hosp_yn': hospital}

#Convert the dictionary into a DataFrame
covid_df = pd.DataFrame(covid_values)

#Export the DataFrame into a CSV file
covid_df.to_csv('covid_data.csv')

In [None]:
#Now, only run the cells above this one if you want to update (add to) the COVID data.
#This dataset is continually added to by the CDC and the API call will refresh it.

In [155]:
#START HERE

#read COVID data csv file in as Pandas DataFrame and preview it
covid_data = pd.read_csv('covid_data.csv')
covid_data.head()

Unnamed: 0.1,Unnamed: 0,cdc_rpt_dt,pos_spec_dt,death_yn,hosp_yn
0,0,2020-03-22T00:00:00.000,2020-03-15T00:00:00.000,No,Yes
1,1,2020-03-23T00:00:00.000,2020-03-15T00:00:00.000,Unknown,Unknown
2,2,2020-03-15T00:00:00.000,2020-03-15T00:00:00.000,Missing,Missing
3,3,2020-03-15T00:00:00.000,2020-03-15T00:00:00.000,Missing,Missing
4,4,2020-03-15T00:00:00.000,2020-03-15T00:00:00.000,Missing,Missing


In [156]:
#Preview the data
tsa_data = pd.read_csv('2020_10_05_TSA_Data.csv',delimiter=',')
tsa_data.head()

Unnamed: 0,Date,Total Traveler Throughput,Total Traveler Throughput_1 Year Ago_Same Weekday
0,10/4/2020,900911,2542118
1,10/3/2020,677661,1921185
2,10/2/2020,857186,2526835
3,10/1/2020,855908,2447687
4,9/30/2020,634046,2082179


In [157]:
#QUESTION FOR MENTORS - MY COVID DATASET IS LIMITED TO 1,000 ROWS.  HOW DO I GET THE FULL DATA?

covid_data.info()

#here is the website: https://data.cdc.gov/Case-Surveillance/COVID-19-Case-Surveillance-Public-Use-Data/vbim-akqf

#there should be closer to 4.5million rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   1000 non-null   int64 
 1   cdc_rpt_dt   1000 non-null   object
 2   pos_spec_dt  1000 non-null   object
 3   death_yn     1000 non-null   object
 4   hosp_yn      1000 non-null   object
dtypes: int64(1), object(4)
memory usage: 39.2+ KB


In [None]:
#Here is a list of next steps:

#1. convert the COVID dates into standard date format

#2. group the covid data by days (pos_spec_dt)
    #for each day, I will have total number of cases, deaths, hospitalizations
    
#3. combine the COVID and TSA dataframes so that for each day, I have all values
    #so, perhaps make the date the index

#4. then begin creating the visualizations to show trends

In [None]:
#Visualization ideas
    #Scatterplots with trend lines
    #bar chart with statistical significance
