In [1]:
from datetime import datetime
import pandas as pd # Pandas
import pathlib # Built-in path manipulation library
from urllib.error import HTTPError

In [2]:
# Directory, relative to the notebook's working directory, that holds the
# covidtimeseries_*.csv files listed and (re)written further down.
data_dir = pathlib.Path('../data/')

In [3]:
# URL template for the JHU CSSE time-series CSVs; filled with (kind, region).
url_template = ("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
                "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_%s_%s.csv")


def _is_date_column(name):
    """Return True when `name` parses as an m/d/yy date header (e.g. '1/22/20').

    A substring test such as `'20' in name` only works for 2020 headers: from
    2021 on it keeps just the headers that happen to contain "20" (like
    '1/20/21') and silently drops the rest. Parsing the header avoids that.
    """
    try:
        datetime.strptime(str(name), '%m/%d/%y')
    except ValueError:
        return False
    return True


# dfs[region][kind] -> DataFrame holding only the date columns,
# indexed by country name ('global') or by Province_State ('US').
dfs = {}
for region in ['global', 'US']:
    dfs[region] = {}
    for kind in ['confirmed', 'deaths', 'recovered']:
        url = url_template % (kind, region)  # Create the full data URL
        try:
            df = pd.read_csv(url)  # Download the data into a dataframe
        except HTTPError:
            # Best effort: a missing file is reported and that (region, kind) is skipped.
            print("Could not download data for %s, %s" % (kind, region))
        else:
            if region == 'global':
                # Whole countries only (rows without a Province/State), indexed by country name.
                whole_countries = df[df['Province/State'].isnull()].set_index('Country/Region')
                # China is the one country rebuilt from its province rows: sum them into a
                # single row. skipna=False keeps a NaN total when any province is NaN.
                # The sum also runs over the non-date columns; those are dropped below.
                # Any other country present only as province rows is not included.
                china_total = df[df['Country/Region'] == 'China'].sum(axis=0, skipna=False).to_frame().T
                china_total['Country/Region'] = 'China'
                china_total = china_total.set_index('Country/Region')
                df = pd.concat([whole_countries, china_total])
            elif region == 'US':
                # Use state name as index.
                # NOTE(review): rows are not aggregated here, so the index may repeat - confirm.
                df = df.set_index('Province_State')
            df = df[[x for x in df if _is_date_column(x)]]  # Use only the date (data) columns
            dfs[region][kind] = df  # Add to dictionary of dataframes

Could not download data for recovered, US


In [4]:
# Generate list of countries or states currently in the repository (ours, not JHU's).
# Files are named 'covidtimeseries_<region>.csv', so the region is recovered by
# stripping that prefix and the final extension. Splitting on '.' and '_' instead
# would truncate any region whose name itself contains one of those characters.
timeseries_prefix = 'covidtimeseries_'
countries_states = [path.stem[len(timeseries_prefix):]
                    for path in data_dir.iterdir()
                    if path.name.startswith(timeseries_prefix)]  # match the file name, not the whole path
# Just the countries (names longer than two characters)
countries = [x for x in countries_states if len(x)>2]
# Just the states (two-character names)
# NOTE(review): 'US' is two characters long, so a covidtimeseries_US.csv file would be
# listed here as a state rather than as a country - confirm this is intended.
states = [x for x in countries_states if len(x)==2]

In [5]:
# Select the countries usable downstream: each must pass all three checks below.
global_frames = dfs['global']
recovered_counts = global_frames['recovered']
confirmed_counts = global_frames['confirmed']
death_counts = global_frames['deaths']

# Recovered count is above zero on at least one date.
has_recoveries = recovered_counts.loc[recovered_counts.max(axis=1) > 0].index

# Largest date-to-date increase in confirmed cases is at least 5.
confirmed_steps = confirmed_counts.diff(axis=1)
enough_cases = confirmed_counts.loc[confirmed_steps.max(axis=1) >= 5].index

# Death count is above zero on at least one date.
reports_deaths = death_counts.loc[death_counts.max(axis=1) > 0].index

# Keep only the countries present in all three selections.
shared_index = has_recoveries.intersection(enough_cases)
shared_index = shared_index.intersection(reports_deaths)
good_countries = list(shared_index)
print(len(good_countries))

151


In [6]:
# Build one time-series table per selected country and write it to
# data_dir / 'covidtimeseries_<country>.csv'.
source = dfs['global']

# The date headers are the same for every country, so normalise them once:
# parse m/d/yy and re-emit it zero-padded as mm/dd/yy.
dates2 = [datetime.strftime(datetime.strptime(x, '%m/%d/%y'), '%m/%d/%y')
          for x in source['confirmed'].columns]

# Per-country frames live in their own dict. Writing them straight into `dfs`
# lets the country 'US' overwrite dfs['US'], the dict of per-kind frames
# created by the download cell.
country_dfs = {}

for country in good_countries:  # For each country
    if country in source['confirmed'].index:  # If we have data in the downloaded JHU files for that country
        df = pd.DataFrame(columns=['dates2', 'cum_cases', 'cum_deaths', 'cum_recover',
                                   'new_cases', 'new_deaths', 'new_recover', 'new_uninfected'])
        df['dates2'] = dates2
        # Values are copied positionally, so the three source frames must share
        # the same date columns; a length mismatch raises here.
        df['cum_cases'] = source['confirmed'].loc[country].values
        df['cum_deaths'] = source['deaths'].loc[country].values
        df['cum_recover'] = source['recovered'].loc[country].values
        # Date-to-date differences of the cumulative counts (first row is NaN until filled below).
        df[['new_cases', 'new_deaths', 'new_recover']] = \
            df[['cum_cases', 'cum_deaths', 'cum_recover']].diff()
        df['new_uninfected'] = df['new_recover'] + df['new_deaths']
        country_df = df.set_index('dates2').fillna(0).astype(int)  # Fill NaN with 0 and convert to int
        country_dfs[country] = country_df
        # Backward compatibility: still expose the frame as dfs[country], except where
        # that key already holds a region's dict of frames (e.g. 'US').
        if not isinstance(dfs.get(country), dict):
            dfs[country] = country_df
        country_df.to_csv(data_dir /('covidtimeseries_%s.csv' % country))  # Overwrite old data
    else:
        print("No data for %s" % country)