In [1]:
# This is the notebook we will use to make COVID API calls and build the dataframe columns relevant to COVID

In [2]:
import csv
import requests
import pandas as pd
import numpy as np
import urllib.request
import codecs

In [3]:
# Root endpoint of the time-series API; a state code is appended for each request.
base_url = 'http://coronavirusapi.com/getTimeSeries/'

# Two-letter codes to query: the 50 states plus DC.
state = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DC", "DE", "FL",
    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
    "WY",
]

# Collects one dict per data row returned by the API, across all states.
master_list = []

In [4]:
# The dataset reports timestamps as seconds since the Unix epoch; this helper
# turns such a (possibly floating-point) value into a datetime.
def epochConverter(num):
    """Convert seconds since the Unix epoch into a pandas Timestamp.

    Parameters
    ----------
    num : int or float
        Number of seconds elapsed since 1970-01-01 00:00:00.

    Returns
    -------
    pandas.Timestamp
        Timezone-naive timestamp corresponding to ``num``.
    """
    # Interpret the value as seconds directly instead of dividing down to
    # fractional days first, which introduces avoidable float rounding.
    return pd.to_datetime(num, unit='s')

In [5]:
# Download each state's time series from the API and collect every data row
# in master_list. Clearing the list first keeps this cell idempotent, so
# re-running it does not append duplicate rows.
master_list.clear()

for state_url in state:
    # The context manager closes the HTTP response even if parsing a row fails.
    with urllib.request.urlopen(base_url + state_url) as state_response:
        # The response yields bytes; decode lazily to text so csv.reader can parse it.
        csvfile = csv.reader(codecs.iterdecode(state_response, 'utf-8'))

        for row in csvfile:
            # Skip blank lines (csv.reader yields [] for them) and the header
            # row that the API returns with every call.
            if not row or row[0] == 'seconds_since_Epoch':
                continue

            master_list.append({
                'State': state_url,
                'Date': epochConverter(float(row[0])),
                'Tested': float(row[1]),
                'Positive': float(row[2]),
                'Deaths': float(row[3]),
            })

In [6]:
# Build a dataframe with one row per record collected in master_list.
covid_state_df = pd.DataFrame(data=master_list)
covid_state_df

Unnamed: 0,State,Date,Tested,Positive,Deaths
0,AL,2020-05-01 00:23:02,87976.0,7068.0,272.0
1,AL,2020-05-01 15:13:02,89997.0,7085.0,279.0
2,AL,2020-05-01 22:41:02,91933.0,7158.0,279.0
3,AL,2020-05-01 23:47:02,91933.0,7294.0,289.0
4,AL,2020-05-02 16:21:02,91933.0,7345.0,289.0
...,...,...,...,...,...
3803,WY,2020-07-18 22:04:32,62594.0,1713.0,24.0
3804,WY,2020-07-20 00:34:09,63198.0,1728.0,24.0
3805,WY,2020-07-20 21:39:36,63771.0,1790.0,24.0
3806,WY,2020-07-22 04:07:48,64221.0,1830.0,25.0


In [7]:
# Keep only the most recent record for each state. We can decide later whether
# to use this or the complete history in covid_state_df.
# A column-wise groupby().max() would pair the newest date with the largest
# value each column ever held, which is not necessarily one actual reading,
# so the real last-dated row per state is selected instead.
recent_date_df = (
    covid_state_df
    .sort_values(['State', 'Date'])
    .drop_duplicates(subset='State', keep='last')
    .reset_index(drop=True)
)
recent_date_df.head()

Unnamed: 0,State,Date,Tested,Positive,Deaths
0,AK,2020-07-23 03:02:13,185333.0,1693.0,19.0
1,AL,2020-07-23 03:02:13,608088.0,70413.0,1325.0
2,AR,2020-07-22 05:16:54,439635.0,34655.0,374.0
3,AZ,2020-07-23 07:07:29,1027125.0,150609.0,2974.0
4,CA,2020-07-23 07:07:29,6664998.0,413576.0,7870.0
