In [2]:
import pandas as pd # Pandas
import pathlib # Built-in path manipulation library
from urllib.error import HTTPError

In [3]:
# Download the JHU CSSE time-series CSVs into dfs[region][kind], keeping only the
# per-date columns and indexing each frame by country (global) or state (US) name.
url_template = ("https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
                "csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_%s_%s.csv")
# Header of a date column. Assumes JHU date headers are M/D/YY (e.g. 1/22/20) -- TODO confirm
# against the CSVs. A full-header match is used instead of the old `'20' in x` substring test,
# which only kept headers containing "20" and so dropped most dates after 2020 (e.g. 1/1/21).
date_column_pattern = r'^\d{1,2}/\d{1,2}/\d{2}$'

dfs = {}
for region in ['global', 'US']:
    dfs[region] = {}
    for kind in ['confirmed', 'deaths', 'recovered']:
        url = url_template % (kind, region)  # Create the full data URL
        try:
            df = pd.read_csv(url)  # Download the data into a dataframe
        except HTTPError:
            # Best effort: a series that cannot be downloaded is simply absent from dfs[region]
            print("Could not download data for %s, %s" % (kind, region))
            continue
        if region == 'global':
            # Whole countries only (rows without a province/state), use country name as index
            df = df[df['Province/State'].isnull()].set_index('Country/Region')
        elif region == 'US':
            # Use state name as index
            # NOTE(review): if the US files hold several rows per state this index is not
            # unique -- confirm before using .loc[state] downstream
            df = df.set_index('Province_State')
        is_date_column = df.columns.str.match(date_column_pattern)
        dfs[region][kind] = df.loc[:, is_date_column]  # Use only data columns; add to dictionary

Could not download data for recovered, US


In [4]:
# Files under data/ whose path mentions 'covid_timeseries_' are the time series kept in
# this repository (ours, not JHU's).
timeseries_files = [path for path in pathlib.Path('data').iterdir()
                    if 'covid_timeseries_' in str(path)]
# The country/state label is the part of the file name between the last underscore
# and the first dot.
countries_states = [path.name.partition('.')[0].rpartition('_')[2]
                    for path in timeseries_files]
# Labels longer than two characters are treated as countries
countries = list(filter(lambda label: len(label) > 2, countries_states))
# Labels of exactly two characters are treated as states
states = list(filter(lambda label: len(label) == 2, countries_states))

In [5]:
# Rebuild data/covid_timeseries_<country>.csv for every tracked country from the
# downloaded global frames, and keep each resulting frame in dfs[country].
source = dfs['global']
# Downloaded series name -> cumulative column in our CSV (dict order fixes the CSV column order)
cumulative_columns = {'confirmed': 'cum_cases', 'deaths': 'cum_deaths', 'recovered': 'cum_recover'}
# Cumulative column -> matching day-over-day column
daily_columns = {'cum_cases': 'new_cases', 'cum_deaths': 'new_deaths', 'cum_recover': 'new_recover'}

for country in countries:  # For each country
    # Series that were not downloaded at all, or that have no row for this country
    missing = [kind for kind in cumulative_columns
               if kind not in source or country not in source[kind].index]
    if 'confirmed' in missing:
        print("No data for %s" % country)
        continue
    if missing:
        # This used to raise KeyError mid-loop; skip instead so the existing CSV
        # is not overwritten with partial data.
        print("Incomplete data for %s (missing: %s)" % (country, ', '.join(missing)))
        continue

    dates = source['confirmed'].columns
    # Align each series on the confirmed dates by label rather than by position. A date
    # absent from a series carries the previous cumulative value forward.
    cumulative = pd.DataFrame(
        {column: source[kind].loc[country].reindex(dates).ffill().values
         for kind, column in cumulative_columns.items()},
        index=pd.Index(dates, name='dates2'))
    daily = cumulative.diff().rename(columns=daily_columns)
    daily['new_uninfected'] = daily['new_recover'] + daily['new_deaths']
    # The first diff row (and any leading gap) is NaN: fill NaN with 0 and convert to int
    dfs[country] = pd.concat([cumulative, daily], axis=1).fillna(0).astype(int)
    dfs[country].to_csv('data/covid_timeseries_%s.csv' % country)  # Overwrite old data

No data for China
