In [1]:
import pandas as pd # Pandas
import pathlib # Built-in path manipulation library
from urllib.error import HTTPError

In [2]:
# URL template for the JHU CSSE time-series CSV files; filled in with (kind, region).
url_template = ("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
                "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_%s_%s.csv")
# Data columns are headed with M/D/YY dates (e.g. "1/22/20"). Matching the whole
# header, instead of testing for the substring '20', keeps columns dated 2021 and
# later and cannot pick up a non-date column that merely contains "20".
date_column_pattern = r'^\d{1,2}/\d{1,2}/\d{2}$'

# dfs[region][kind] -> dataframe of counts (rows: country or state, columns: dates)
dfs = {}
for region in ['global', 'US']:
    dfs[region] = {}
    for kind in ['confirmed', 'deaths', 'recovered']:
        url = url_template % (kind, region) # Create the full data URL
        try:
            df = pd.read_csv(url) # Download the data into a dataframe
        except HTTPError:
            # The file for this (kind, region) pair could not be fetched;
            # report it and leave dfs[region] without an entry for this kind.
            print("Could not download data for %s, %s" % (kind, region))
        else:
            if region == 'global':
                # Whole countries only (no province), indexed by country name
                whole_countries = (df[df['Province/State'].isnull()]
                                   .set_index('Country/Region')
                                   .filter(regex=date_column_pattern))
                # China is listed by province only, so add up its provinces into a
                # single row. Restricting to the date columns first means only the
                # counts are summed. skipna=False keeps a missing value visible
                # in the total instead of silently treating it as zero.
                china = (df[df['Country/Region']=='China']
                         .filter(regex=date_column_pattern)
                         .sum(axis=0, skipna=False)
                         .to_frame().T)
                china.index = pd.Index(['China'], name='Country/Region')
                df = pd.concat([whole_countries, china])
            elif region == 'US':
                # Use state name as index, keep only the data columns
                df = df.set_index('Province_State').filter(regex=date_column_pattern)
            dfs[region][kind] = df # Add to dictionary of dataframes

Could not download data for recovered, US


In [3]:
# Build the list of countries/states that already have a timeseries file in our
# own repository (not JHU's). The name is taken from the file name: everything
# before the first '.', then the part after the last '_'.
countries_states = []
for entry in pathlib.Path('data').iterdir():
    if 'covid_timeseries_' not in str(entry):
        continue  # Not one of our timeseries files
    stem = entry.name.split('.')[0]
    countries_states.append(stem.split('_')[-1])

# Names longer than two characters are treated as countries ...
countries = list(filter(lambda name: len(name) > 2, countries_states))
# ... and names of exactly two characters as states
states = list(filter(lambda name: len(name) == 2, countries_states))

In [10]:
# Inspect the global "recovered" table (one row per country, one column per date)
global_tables = dfs['global']
global_tables['recovered']

Unnamed: 0_level_0,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,...,4/2/20,4/3/20,4/4/20,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,10,10,10,15,18,18,29,32,32,32
Albania,0,0,0,0,0,0,0,0,0,0,...,76,89,99,104,116,131,154,165,182,197
Algeria,0,0,0,0,0,0,0,0,0,0,...,61,62,90,90,90,113,237,347,405,460
Andorra,0,0,0,0,0,0,0,0,0,0,...,10,16,21,26,31,39,52,58,71,71
Angola,0,0,0,0,0,0,0,0,0,0,...,1,1,2,2,2,2,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
South Sudan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Western Sahara,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Sao Tome and Principe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Yemen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Per-country totals across all date columns of each global table
recovered_totals = dfs['global']['recovered'].sum(axis=1)
confirmed_totals = dfs['global']['confirmed'].sum(axis=1)
death_totals = dfs['global']['deaths'].sum(axis=1)

# Countries whose recovered total is positive
has_recoveries = recovered_totals[recovered_totals > 0].index
# Countries whose confirmed total is at least 10
# NOTE(review): this adds up the values of every date column; if the columns hold
# cumulative counts, the threshold is not "10 cases" -- confirm the intent.
enough_cases = confirmed_totals[confirmed_totals >= 10].index
# Countries whose deaths total is positive
reports_deaths = death_totals[death_totals > 0].index

# Keep only the countries that satisfy all three conditions
with_recoveries_and_cases = has_recoveries.intersection(enough_cases)
good_countries = with_recoveries_and_cases.intersection(reports_deaths).tolist()

In [16]:
source = dfs['global']
# Per-country frames are collected in their own dictionary. `dfs` is keyed by
# region ('global', 'US'), so storing a country under the same name as a region
# (e.g. a country called 'US') would replace that region's tables.
country_dfs = {}
for country in good_countries:  # For each country
    if country not in source['confirmed'].index:
        # No row for that country in the downloaded JHU files
        print("No data for %s" % country)
        continue
    df = pd.DataFrame(columns=['dates2', 'cum_cases', 'cum_deaths', 'cum_recover',
                               'new_cases', 'new_deaths', 'new_recover', 'new_uninfected'])
    df['dates2'] = source['confirmed'].columns
    df['cum_cases'] = source['confirmed'].loc[country].values
    df['cum_deaths'] = source['deaths'].loc[country].values
    df['cum_recover'] = source['recovered'].loc[country].values
    # Day-over-day change of each series (the first row has no predecessor -> NaN)
    df[['new_cases', 'new_deaths', 'new_recover']] = \
        df[['cum_cases', 'cum_deaths', 'cum_recover']].diff()
    df['new_uninfected'] = df['new_recover'] + df['new_deaths']
    # Fill NaN with 0 and convert to int
    country_dfs[country] = df.set_index('dates2').fillna(0).astype(int)
    # Still expose the frame as dfs[country] for existing code, but never
    # overwrite a region entry (those hold a dict of tables, not a dataframe).
    if not isinstance(dfs.get(country), dict):
        dfs[country] = country_dfs[country]
    country_dfs[country].to_csv('data/covid_timeseries_%s.csv' % country)  # Overwrite old data