# OVERVIEW #

This notebook will import state-level United States COVID case data through an API, convert the data into useful datatypes, and then export a Pandas DataFrame to .csv file to your local machine.  

*After the first run, do not re-run this notebook unless you want to re-call/update the COVID data*

In [53]:
#import Pandas, Numpy, and MatPlotLib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [54]:
#import requests in order to call the COVID data API data 
#import time in order to pause between API calls and not freeze in the process
#import pprint in order to pretty-print the data.
import requests
import time
from pprint import pprint


# SECTION 1: Execute an API call to the CDC Public Use Data Website #

In [55]:
#This is a variable which calls the API link from the COVID data website
    #it orders the data by CDC Report Date
    #and it has a limit larger than the max number of rows in the CDC data
r = requests.get('https://data.cdc.gov/resource/9mfq-cb36.json?$limit=5000000')

#This is the API call with a 1 second pause between requests
state_covid_json = r.json()
time.sleep(1)

In [56]:
#pretty-print the first 2 entries to preview the data.
pprint(state_covid_json[:2])

[{'consent_cases': 'Agree',
  'consent_deaths': 'Not agree',
  'created_at': '2020-03-26T16:22:39.452',
  'new_case': '0',
  'new_death': '0',
  'state': 'WV',
  'submission_date': '2020-01-22T00:00:00.000',
  'tot_cases': '0',
  'tot_death': '0'},
 {'consent_cases': 'Agree',
  'consent_deaths': 'Not agree',
  'created_at': '2020-03-26T16:22:39.452',
  'new_case': '0.0',
  'new_death': '0.0',
  'state': 'WV',
  'submission_date': '2020-01-23T00:00:00.000',
  'tot_cases': '0',
  'tot_death': '0'}]


In [57]:
#Right now, the API data is a list with nested dictionaries

print(type(covid_json))

<class 'dict'>


In [58]:
#Loop through the list and call each desired datapoint, by its key, into its own list.
#Here is a sample

state, submission_date, total_cases, new_cases, deaths = [],[],[],[],[]

for data in state_covid_json:
    state.append(data['state'])
    submission_date.append(data['submission_date'])
    total_cases.append(data['tot_cases'])
    new_cases.append(data['new_case'])
    deaths.append(data['tot_death'])


In [59]:
#Now that each datapoint is separated, add them back into a dictionary 
#This will make sure that each datapoint column is labelled
state_covid_values = {'state': state, 'submission_date': submission_date, 'tot_cases': total_cases, 'new_case': new_cases, 'tot_death': deaths}

#Convert the dictionary into a DataFrame
state_covid_df = pd.DataFrame(state_covid_values)


In [60]:
#Check the COVID DataFrame with a summary

state_covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19362 entries, 0 to 19361
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   state            19362 non-null  object
 1   submission_date  19362 non-null  object
 2   tot_cases        19362 non-null  object
 3   new_case         19362 non-null  object
 4   tot_death        19362 non-null  object
dtypes: object(5)
memory usage: 756.5+ KB


# SECTION 2: Data Conversion #

In [61]:
#Preview the COVID DataFrame created in Section 1
state_covid_df.head()

Unnamed: 0,state,submission_date,tot_cases,new_case,tot_death
0,WV,2020-01-22T00:00:00.000,0,0.0,0
1,WV,2020-01-23T00:00:00.000,0,0.0,0
2,WV,2020-01-24T00:00:00.000,0,0.0,0
3,WV,2020-01-25T00:00:00.000,0,0.0,0
4,WV,2020-01-26T00:00:00.000,0,0.0,0


#### Convert the COVID dates into standard date format ####

In [62]:
#Import the datetime module and use it to convert the date column
import datetime

state_covid_df['submission_date'] = pd.to_datetime(state_covid_df['submission_date']) 
state_covid_df

Unnamed: 0,state,submission_date,tot_cases,new_case,tot_death
0,WV,2020-01-22,0,0,0
1,WV,2020-01-23,0,0.0,0
2,WV,2020-01-24,0,0.0,0
3,WV,2020-01-25,0,0.0,0
4,WV,2020-01-26,0,0.0,0
...,...,...,...,...,...
19357,OR,2020-12-05,83243,1806.0,1027
19358,OR,2020-12-06,84496,1253.0,1033
19359,OR,2020-12-07,85788,1292.0,1045
19360,OR,2020-12-08,87082,1294.0,1080


In [63]:
#Check the submission date datatype - it has changed from object to datetime64
state_covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19362 entries, 0 to 19361
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   state            19362 non-null  object        
 1   submission_date  19362 non-null  datetime64[ns]
 2   tot_cases        19362 non-null  object        
 3   new_case         19362 non-null  object        
 4   tot_death        19362 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 756.5+ KB


#### Convert the case counts from objects to numbers ####

In [64]:
state_covid_df['tot_cases'] = pd.to_numeric(state_covid_df['tot_cases'])
state_covid_df['new_case'] = pd.to_numeric(state_covid_df['new_case'])
state_covid_df['tot_death'] = pd.to_numeric(state_covid_df['tot_death'])
state_covid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19362 entries, 0 to 19361
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   state            19362 non-null  object        
 1   submission_date  19362 non-null  datetime64[ns]
 2   tot_cases        19362 non-null  int64         
 3   new_case         19362 non-null  float64       
 4   tot_death        19362 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 756.5+ KB


In [65]:
state_covid_df.tail()

Unnamed: 0,state,submission_date,tot_cases,new_case,tot_death
19357,OR,2020-12-05,83243,1806.0,1027
19358,OR,2020-12-06,84496,1253.0,1033
19359,OR,2020-12-07,85788,1292.0,1045
19360,OR,2020-12-08,87082,1294.0,1080
19361,OR,2020-12-09,88287,1205.0,1110


# SECTION 3: Export as .csv # 


In [66]:
state_covid_df.to_csv('state_covid_data.csv') 