In [61]:
import json
import pandas as pd # Pandas
import pathlib # Built-in path manipulation library
import requests
from datetime import datetime

In [62]:
# Per-state daily CSV from the COVID19Tracking/covid-tracking-data repository.
# It is read straight from the `master` branch over the network, so every run
# picks up whatever is published there at that moment.
url = "https://raw.githubusercontent.com/COVID19Tracking/covid-tracking-data/master/data/states_daily_4pm_et.csv"
df_raw = pd.read_csv(url)

In [63]:
# Distinct values of the 'state' column, in order of first appearance in the file.
states = df_raw['state'].unique()

In [64]:
# Fix Michigan
# Overwrite the 'positive' column for the MI rows of df_raw with a cumulative
# series rebuilt from the daily-count table scraped from michigan.gov.
import bs4
import numpy as np

response = requests.get(
    'https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173_99207---,00.html',
    timeout=30,  # never hang the notebook on an unresponsive server
)
response.raise_for_status()  # fail loudly instead of parsing an error page
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, which varies by machine and triggers a warning.
soup = bs4.BeautifulSoup(response.content, 'html.parser')

table = soup.find('table')
if table is None:
    raise ValueError('No <table> found on the Michigan page; its layout may have changed')

# Walk the first table once, keeping only rows that contain <td> cells.
table_rows = [cells for cells in (tr.find_all('td') for tr in table.find_all('tr')) if cells]

# The first cell of each row is matched against the label '5-Mar'.
row_labels = [cells[0].text.strip() for cells in table_rows]
three_five_index = row_labels.index('5-Mar')

# The last cell of each row is read as that row's count. The final row is
# dropped, as in the original code (presumably a totals row -- TODO confirm).
daily_counts = [int(cells[-1].text.strip().replace(',', '')) for cells in table_rows][:-1]

# Accumulate over the whole table first so counts before 5-Mar are included,
# then keep the cumulative values from the 5-Mar row onwards.
cum_counts = np.cumsum(daily_counts)
cum_counts = cum_counts[three_five_index:]

# Index labels of the Michigan rows, ordered oldest to newest.
# NOTE(review): this pairs the oldest MI row with the 5-Mar value and assumes
# one row per consecutive day on both sides -- confirm against the data.
michigan_dates = pd.to_datetime(
    df_raw.loc[df_raw['state'] == 'MI', 'date'].astype(str), format='%Y%m%d'
)
to_fix = michigan_dates.sort_values().index

# Only overwrite as many rows as both series cover; the unguarded assignment
# raised when the scraped series was longer than the Michigan rows.
n_fix = min(len(to_fix), len(cum_counts))
df_raw.loc[to_fix[:n_fix], 'positive'] = cum_counts[:n_fix]

In [70]:
# Build one time-series CSV per state that reports recoveries.
MIN_PEAK_NEW_CASES = 5  # a state is exported only if some day reaches this many new cases
OUTPUT_DIR = pathlib.Path('data')


def _state_timeseries(source):
    """Return one state's cumulative and daily counts, ordered oldest to newest.

    Parameters
    ----------
    source : pandas.DataFrame
        Rows for a single state, with a `date` column holding YYYYMMDD values
        and `positive`, `death` and `recovered` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by `dates2` (MM/DD/YY strings) with integer `cum_*` columns,
        `new_*` columns holding the day-over-day differences (NaN in the first
        row), and `new_uninfected` initialised to 0.
    """
    parsed = pd.to_datetime(source['date'].astype(str), format='%Y%m%d')
    # Order by the real date *before* formatting it. MM/DD/YY labels sort
    # lexicographically, so sorting them puts 01/01/21 ahead of 03/05/20 and
    # the differences below would be taken across misordered rows.
    order = parsed.values.argsort(kind='stable')
    ordered = source.iloc[order]
    labels = parsed.iloc[order].dt.strftime('%m/%d/%y')

    df = pd.DataFrame(
        {
            'cum_cases': ordered['positive'].values,
            'cum_deaths': ordered['death'].values,
            'cum_recover': ordered['recovered'].values,
        },
        index=pd.Index(labels.values, name='dates2'),
    )
    # Fill NaN with 0 and convert to int.
    # NOTE(review): a gap in the middle of a cumulative series also becomes 0
    # here, which produces a negative difference; forward-filling may be more
    # appropriate -- confirm against the data.
    df = df.fillna(0).astype(int)
    df[['new_cases', 'new_deaths', 'new_recover']] = \
        df[['cum_cases', 'cum_deaths', 'cum_recover']].diff()
    df['new_uninfected'] = 0
    return df


# Create the output folder up front so to_csv does not fail on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

good = []
bad = []
for state in states:  # For each state
    source = df_raw[df_raw['state'] == state]  # Only the given state
    if not (source['recovered'].sum() > 0):  # No recovery data downloaded for that state
        bad.append(state)
        continue

    df = _state_timeseries(source)
    # States whose largest daily increase stays below the threshold are skipped.
    if not (df['new_cases'].max() >= MIN_PEAK_NEW_CASES):
        bad.append(state)
        continue

    df['new_uninfected'] = df['new_recover'] + df['new_deaths']
    df = df.fillna(0).astype(int)  # The first row of each diff is NaN
    df.to_csv(OUTPUT_DIR / ('covid_timeseries_US_%s.csv' % state))  # Overwrite old data
    good.append(state)

print("Recovery data for %s" % ','.join(good))
print("No recovery data for %s" % ','.join(bad))

Recovery data for AK,AR,DC,DE,GU,HI,IA,KY,MD,ME,MI,MN,MT,ND,NH,NJ,NM,NY,OK,RI,SD,TN,TX,VI,VT,WV,WY
No recovery data for AL,AS,AZ,CA,CO,CT,FL,GA,ID,IL,IN,KS,LA,MA,MO,MP,MS,NC,NE,NV,OH,OR,PA,PR,SC,UT,VA,WA,WI


In [71]:
# Number of states appended to `good` versus `bad`.
len(good), len(bad)

(27, 29)