In [1]:
import json
import pandas as pd # Pandas
import pathlib # Built-in path manipulation library
import requests
from datetime import datetime

In [2]:
# Directory that receives the generated per-state CSV files. The path is
# relative, so it resolves against the kernel's working directory.
_DATA_DIR_RELATIVE = '../data/'
data_dir = pathlib.Path(_DATA_DIR_RELATIVE)

In [3]:
# Daily per-state snapshot CSV from the COVID Tracking Project repository.
# The URL tracks the `master` branch, so the downloaded content changes
# between runs.
url = "https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/data/states_daily_4pm_et.csv"

# Raw, unmodified download; later cells select the 'state', 'date',
# 'positive', 'death' and 'recovered' columns from it.
df_raw = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Distinct state codes, in order of first appearance in the raw download.
states = df_raw.loc[:, 'state'].unique()

In [5]:
# Fix Michigan
# Overwrite the 'positive' column of the MI rows in `df_raw` with cumulative
# counts rebuilt from the daily table published on michigan.gov.
# (Cell-local imports are kept so this cell runs on its own.)
import bs4
import numpy as np
import requests

response = requests.get(
    'https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173_99207---,00.html',
    timeout=30,  # do not hang the whole notebook on a stalled connection
)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so results could differ between machines.
soup = bs4.BeautifulSoup(response.content, 'html.parser')

mi_table = soup.find('table')
if mi_table is None:
    raise ValueError("No <table> found on the Michigan page; its layout may have changed")

# Walk the table once; rows without <td> cells (e.g. a header row) are skipped.
mi_rows = [cells for cells in (tr.find_all('td') for tr in mi_table.find_all('tr')) if cells]

# Position of the row labelled '5-Mar'; the cumulative series is cut to start there.
three_five_index = [cells[0].text for cells in mi_rows].index('5-Mar')

# Daily counts come from the last cell of each row. The final row is left out
# -- presumably a totals row; TODO confirm against the page.
daily_counts = [int(cells[-1].text) for cells in mi_rows[:-1]]

cum_counts = np.cumsum(daily_counts)
cum_counts = cum_counts[three_five_index:]

# Index labels of the MI rows, ordered by ascending date.
# NOTE(review): this assumes the first MI date in `df_raw` corresponds to the
# '5-Mar' row of the state table -- confirm the two series are aligned.
to_fix = df_raw.loc[df_raw['state']=='MI', 'date'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d')).sort_values().index

# Only overwrite the days both sources cover. Truncating just one side raises
# a length-mismatch error when the state page lists more days than the tracker.
n_overlap = min(len(to_fix), len(cum_counts))
df_raw.loc[to_fix[:n_overlap], 'positive'] = cum_counts[:n_overlap]

In [6]:
def _build_state_timeseries(source):
    """Build the time-series frame for one state from its tracker rows.

    Parameters
    ----------
    source : pandas.DataFrame
        Rows of ``df_raw`` for a single state. Must provide the columns
        ``date`` (readable as ``YYYYMMDD`` once converted to ``str``),
        ``positive``, ``death`` and ``recovered``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``dates2`` (``'mm/dd/yy'`` labels) in chronological order.
        ``cum_*`` columns are integers with missing values replaced by 0,
        ``new_cases`` / ``new_deaths`` / ``new_recover`` are the row-to-row
        differences (NaN in the first row), and ``new_uninfected`` is 0.
    """
    # Order on the raw YYYYMMDD value *before* formatting. The 'mm/dd/yy'
    # labels sort lexicographically, which is not chronological once the data
    # spans a year boundary ('01/05/21' < '03/07/20') and would corrupt diff().
    ordered = source.sort_values('date')
    labels = pd.to_datetime(ordered['date'].astype(str), format='%Y%m%d').dt.strftime('%m/%d/%y')

    df = pd.DataFrame(
        {
            'cum_cases': ordered['positive'].values,
            'cum_deaths': ordered['death'].values,
            'cum_recover': ordered['recovered'].values,
        },
        index=pd.Index(labels.values, name='dates2'),
    )
    df = df.fillna(0).astype(int)  # Fill NaN with 0 and convert to int

    # Day-over-day change of each cumulative series.
    for cum_col, new_col in (('cum_cases', 'new_cases'),
                             ('cum_deaths', 'new_deaths'),
                             ('cum_recover', 'new_recover')):
        df[new_col] = df[cum_col].diff()
    df['new_uninfected'] = 0
    return df


good = []
bad = []
for state in states:  # For each state
    source = df_raw[df_raw['state']==state] # Only the given state
    if source['recovered'].sum() > 0: # If we have recovery data in the downloaded file for that state
        # `df` is deliberately assigned at cell level: a later cell inspects `df.index`.
        df = _build_state_timeseries(source)
        # Only export states whose largest single-step case increase is at least 5.
        # NOTE(review): the rationale for the threshold of 5 is not stated -- confirm.
        if df['new_cases'].max() >= 5:
            df['new_uninfected'] = df['new_recover'] + df['new_deaths']
            df = df.fillna(0).astype(int)  # first-row diffs are NaN -> 0
            df.to_csv(data_dir / ('covidtimeseries_US_%s.csv' % state))  # Overwrite old data
            good.append(state)
        else:
            bad.append(state)
    else:
        bad.append(state)

print("Recovery data for %s" % ','.join(good))
print("No recovery data for %s" % ','.join(bad))

Recovery data for AK,AR,AZ,CO,DC,DE,GU,HI,IA,ID,KS,KY,LA,MD,ME,MI,MN,MS,MT,ND,NH,NJ,NM,NV,NY,OK,RI,SC,SD,TN,TX,UT,VA,VI,VT,WI,WV,WY
No recovery data for AL,AS,CA,CT,FL,GA,IL,IN,MA,MO,MP,NC,NE,OH,OR,PA,PR,WA


In [7]:
# `df` is not defined in this cell: it is whatever frame the per-state loop in
# the previous cell assigned last, so this shows the date index of that one state.
df.index

Index(['03/07/20', '03/08/20', '03/09/20', '03/10/20', '03/11/20', '03/12/20',
       '03/13/20', '03/14/20', '03/15/20', '03/16/20', '03/17/20', '03/18/20',
       '03/19/20', '03/20/20', '03/21/20', '03/22/20', '03/23/20', '03/24/20',
       '03/25/20', '03/26/20', '03/27/20', '03/28/20', '03/29/20', '03/30/20',
       '03/31/20', '04/01/20', '04/02/20', '04/03/20', '04/04/20', '04/05/20',
       '04/06/20', '04/07/20', '04/08/20', '04/09/20', '04/10/20', '04/11/20',
       '04/12/20', '04/13/20', '04/14/20', '04/15/20', '04/16/20', '04/17/20',
       '04/18/20', '04/19/20', '04/20/20', '04/21/20', '04/22/20', '04/23/20',
       '04/24/20', '04/25/20', '04/26/20', '04/27/20', '04/28/20', '04/29/20',
       '04/30/20'],
      dtype='object', name='dates2')