# SUMMARY 
This notebook is responsible for the basic data collection and preparation. The dataset we use comes from the ECDC and provides day-by-day accounts of cases and deaths for each affected country. This data file is updated each dat and so needs to be downloaded to keep it up to date.

We do some basic data processing to convert daily numbers into cumulative numbers etc. The processed dataframe is then saved to file for later use.

In [68]:
from datetime import date

import pandas as pd
import numpy as np


# Download the Raw Dataset

Switched to using a daily dataset from https://www.ecdc.europa.eu/en/novel-coronavirus-china
The main reason for this is that this dataset goes back to 31/12/2019 whereas the JH dataset starts about a month later when China already had 500+ cases and 17 deaths.

In [69]:
# The download file template (encodes the current day's date)
ecdc_dataset_raw = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide{}.xlsx'

# Save the processed data to here.
ecdc_dataset_processed = '../data/processed/ecdc_dataset.csv'

In [70]:
# Create the filename for tpday's dataset and download it.
todays_date = "-{}-{:02}-{:02}".format(date.today().year, date.today().month, date.today().day)

ecdc_dataset_raw.format(todays_date)

'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-03-26.xlsx'

In [71]:
df = pd.read_excel(ecdc_dataset_raw.format(todays_date))
df.shape

(6931, 10)

In [72]:
# Change the column names.
df.columns = ['date', 'day', 'month', 'year', 'cases', 'deaths', 'country', 'code', 'pop']

# Add a number of days since records began column.
df['day'] = (df['date'] - df['date'].min()).map(lambda d: d.days)

# Sort by day, oldest first
df = df.sort_values(by='day', ascending=True).set_index('day')


ValueError: Length mismatch: Expected axis has 10 elements, new values have 9 elements

In [50]:
df.country.unique(), df.country.nunique()

(array(['Indonesia', 'United_Arab_Emirates', 'Austria', 'Switzerland',
        'Nepal', 'Brazil', 'India', 'Lebanon', 'Malaysia',
        'Cases_on_an_international_conveyance_Japan', 'Azerbaijan',
        'Estonia', 'Iceland', 'Russia', 'Netherlands', 'Thailand',
        'Finland', 'Afghanistan', 'United_States_of_America', 'Bahrain',
        'New_Zealand', 'Philippines', 'United_Kingdom', 'Singapore',
        'China', 'Kuwait', 'Egypt', 'Japan', 'Denmark', 'Pakistan',
        'Armenia', 'Italy', 'South_Korea', 'Mexico', 'Taiwan', 'Algeria',
        'Romania', 'Iran', 'Monaco', 'Cambodia', 'Israel', 'Ireland',
        'Australia', 'Ecuador', 'Canada', 'Czech_Republic', 'Oman', 'Iraq',
        'Croatia', 'Dominican_Republic', 'Qatar', 'Norway', 'Germany',
        'France', 'San_Marino', 'Sri_Lanka', 'Spain', 'North_Macedonia',
        'Georgia', 'Sweden', 'Nigeria', 'Greece', 'Belarus', 'Luxembourg',
        'Lithuania', 'Vietnam', 'Belgium', 'Portugal', 'Jordan', 'Latvia',
        'An

In [51]:
# Canada is represented in title and Upper case. Looks like a data entry error.
df['country'] = np.where(df['country']=='CANADA', 'Canada', df['country'])

df.country.nunique()

188

In [52]:
df.head()

Unnamed: 0_level_0,date,month,year,cases,deaths,country,code,pop
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2019-12-31,12,2019,0,0,Indonesia,ID,267663435.0
0,2019-12-31,12,2019,0,0,United_Arab_Emirates,AE,9630959.0
0,2019-12-31,12,2019,0,0,Austria,AT,8847037.0
0,2019-12-31,12,2019,0,0,Switzerland,CH,8516543.0
0,2019-12-31,12,2019,0,0,Nepal,NP,28087871.0


# Basic Data Processing
We will add a bunch of new columns to capture various features that we will need for future processing.

## Add Cumulative Totals for Cases and Deaths

In [53]:
df['cum_cases'] = df.groupby('country')['cases'].transform('cumsum')
df['cum_deaths'] = df.groupby('country')['deaths'].transform('cumsum')

## Calculate Daily %Change for Each Country

In [54]:
cum_cases_pct_change = df.groupby('country').apply(lambda g: g['cum_cases'].pct_change())
cum_deaths_pct_change = df.groupby('country').apply(lambda g: g['cum_deaths'].pct_change())

df = df.reset_index()\
        .set_index(['country', 'day'])\
        .join(cum_cases_pct_change, rsuffix='_pct_change')\
        .join(cum_deaths_pct_change, rsuffix='_pct_change')

df.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Kenya,79,2020-03-19,3,2020,3,0,KE,51393010.0,7,0,0.75,


In [55]:
df['cum_cases_pct_change'] = df['cum_cases_pct_change'].replace(np.inf, np.nan)
df['cum_deaths_pct_change'] = df['cum_deaths_pct_change'].replace(np.inf, np.nan)

df.shape

(6738, 11)

## Calculate Log10 Values for Cases and Deaths

In [56]:
df = df.reset_index().set_index('day')
df.sample()

Unnamed: 0_level_0,country,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
31,Bahrain,2020-01-31,1,2020,0,0,BH,1569439.0,0,0,,


In [57]:
cum_cases_log10 = df.groupby('country').apply(lambda g: g['cum_cases'].map(lambda n: np.log10(n) if n>0 else np.nan))
cum_deaths_log10 = df.groupby('country').apply(lambda g: g['cum_deaths'].map(lambda n: np.log10(n) if n>0 else np.nan))

df = df.reset_index()\
    .set_index(['country', 'day'])\
    .join(cum_cases_log10, rsuffix='_log10')\
    .join(cum_deaths_log10, rsuffix='_log10')

df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Indonesia,0,2019-12-31,12,2019,0,0,ID,267663435.0,0,0,,,,
United_Arab_Emirates,0,2019-12-31,12,2019,0,0,AE,9630959.0,0,0,,,,
Austria,0,2019-12-31,12,2019,0,0,AT,8847037.0,0,0,,,,
Switzerland,0,2019-12-31,12,2019,0,0,CH,8516543.0,0,0,,,,
Nepal,0,2019-12-31,12,2019,0,0,NP,28087871.0,0,0,,,,


## Calculate Doubling Time
The doubling time represents the number of days it takes for cases/deaths to double. 

In [58]:
df = df.reset_index().set_index('day')
df.sample()

Unnamed: 0_level_0,country,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
80,Czech_Republic,2020-03-20,3,2020,172,0,CZ,10625695.0,694,0,0.329502,,2.841359,


In [59]:
def doubling_time(s, d=5): return d*(np.log(2)/np.log(s/s.shift(d)))

cum_cases_dt = df.groupby('country').apply(lambda g: doubling_time(g['cum_cases']))

df = df.reset_index()\
    .set_index(['country', 'day'])\
    .join(cum_cases_dt, rsuffix='_dt')

df.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Lebanon,53,2020-02-22,2,2020,1,0,LB,6848925.0,1,0,,,0.0,,0.0


# Calculate Day Zero Days
To align the cases/deaths across different countries we choose a staring day based on when countries reach 100/10 cases/deaths.

In [60]:
df = df.reset_index().set_index('day')
df.sample()

Unnamed: 0_level_0,country,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
48,Canada,2020-02-17,2,2020,0,0,CA,37058856.0,8,0,0.0,,0.90309,,25.954465


In [61]:
def day_with_n_at_least_k(g, k): 
        
    # The zero_day offset
    d = g[g>=k].index.values[0] if g.max()>=k else np.nan
        
    return pd.Series(g.index.values-d, name='day', index=g.index.values)


min_cases, min_deaths = 100, 10


day_zero_for_cases = df.groupby('country').apply(
    lambda g: day_with_n_at_least_k(g['cum_cases'], min_cases)).reset_index()
day_zero_for_cases.columns = ['country', 'day', 'day_zero_cases']

day_zero_for_deaths = df.groupby('country').apply(
    lambda g: day_with_n_at_least_k(g['cum_deaths'], min_deaths)).reset_index()
day_zero_for_deaths.columns = ['country', 'day', 'day_zero_deaths']

df = df.reset_index()\
    .set_index(['country', 'day'])\
    .join(day_zero_for_cases.set_index(['country', 'day']))\
    .join(day_zero_for_deaths.set_index(['country', 'day']))\

    
df.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt,day_zero_cases,day_zero_deaths
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Sweden,53,2020-02-22,2,2020,0,0,SE,10183175.0,1,0,0.0,,0.0,,inf,-14.0,-26.0


# Save Processed Datasets

In [62]:
df.reset_index().to_csv(ecdc_dataset_processed, index=False)

df.shape


(6738, 16)