In [1]:
from datetime import date

import pandas as pd
import numpy as np

from matplotlib.pylab import plt

import seaborn as sns

from scipy import stats

%matplotlib inline

sns.set_style("white")
sns.set_context('talk')

%matplotlib inline

# Setup

## Raw Dataset
Switched to the daily dataset published by the ECDC: https://www.ecdc.europa.eu/en/novel-coronavirus-china
The main reason is that this dataset goes back to 31/12/2019, whereas the JH dataset starts about a month later, when China already had 500+ cases and 17 deaths.

In [2]:
# URL template for the ECDC spreadsheet; the `{}` placeholder is filled with a
# "-YYYY-MM-DD" date suffix before the file is read.
# NOTE(review): "disbtribution" looks misspelled but is part of the remote file name
# as written here - confirm against the ECDC site before "correcting" it.
ecdc_dataset_raw = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide{}.xlsx'

# Destination of the processed CSV, relative to the notebook's working directory.
ecdc_dataset_processed = '../data/processed/ecdc_dataset.csv'

In [3]:
# Build the "-YYYY-MM-DD" suffix from a single date.today() call so that the year,
# month and day can never come from two different calendar days (e.g. when the cell
# runs across midnight).
todays_date = date.today().strftime("-%Y-%m-%d")

# NOTE(review): presumably the file for today's date must already be published for
# this download to succeed - confirm the publishing schedule.
df = pd.read_excel(ecdc_dataset_raw.format(todays_date))

# Positional rename: assumes the sheet has exactly these nine columns, in this order.
df.columns = ['date', 'day', 'month', 'year', 'cases', 'deaths', 'country', 'code', 'pop']

# Overwrite the sheet's `day` column with the number of days since the earliest record.
df['day'] = (df['date'] - df['date'].min()).dt.days

# Sort by day, oldest first, and key the rows by that day offset.
df = df.sort_values(by='day', ascending=True).set_index('day')

df.shape
    

(6738, 8)

In [4]:
# Distinct country labels, in order of first appearance.
df['country'].unique()

array(['Indonesia', 'United_Arab_Emirates', 'Austria', 'Switzerland',
       'Nepal', 'Brazil', 'India', 'Lebanon', 'Malaysia',
       'Cases_on_an_international_conveyance_Japan', 'Azerbaijan',
       'Estonia', 'Iceland', 'Russia', 'Netherlands', 'Thailand',
       'Finland', 'Afghanistan', 'United_States_of_America', 'Bahrain',
       'New_Zealand', 'Philippines', 'United_Kingdom', 'Singapore',
       'China', 'Kuwait', 'Egypt', 'Japan', 'Denmark', 'Pakistan',
       'Armenia', 'Italy', 'South_Korea', 'Mexico', 'Taiwan', 'Algeria',
       'Romania', 'Iran', 'Monaco', 'Cambodia', 'Israel', 'Ireland',
       'Australia', 'Ecuador', 'Canada', 'Czech_Republic', 'Oman', 'Iraq',
       'Croatia', 'Dominican_Republic', 'Qatar', 'Norway', 'Germany',
       'France', 'San_Marino', 'Sri_Lanka', 'Spain', 'North_Macedonia',
       'Georgia', 'Sweden', 'Nigeria', 'Greece', 'Belarus', 'Luxembourg',
       'Lithuania', 'Vietnam', 'Belgium', 'Portugal', 'Jordan', 'Latvia',
       'Andorra', 'Saudi_

In [5]:
# Preview the first rows to check the renamed columns and the `day` index.
df.head()

Unnamed: 0_level_0,date,month,year,cases,deaths,country,code,pop
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2019-12-31,12,2019,0,0,Indonesia,ID,267663435.0
0,2019-12-31,12,2019,0,0,United_Arab_Emirates,AE,9630959.0
0,2019-12-31,12,2019,0,0,Austria,AT,8847037.0
0,2019-12-31,12,2019,0,0,Switzerland,CH,8516543.0
0,2019-12-31,12,2019,0,0,Nepal,NP,28087871.0


# Processing

## Add Cumulative Totals

In [6]:
# Running totals per country, accumulated in the frame's current row order.
df['cum_cases'] = df.groupby('country')['cases'].cumsum()
df['cum_deaths'] = df.groupby('country')['deaths'].cumsum()

In [7]:
# Row-over-row relative change of the cumulative totals, computed within each country.
# GroupBy.pct_change returns a Series on the frame's own row index, so the result can
# be attached directly. The earlier `groupby().apply(...)` + join changed shape (a wide
# frame instead of a stacked Series) whenever every country shared the same set of days.
cum_cases_pct_change = df.groupby('country')['cum_cases'].pct_change()
cum_deaths_pct_change = df.groupby('country')['cum_deaths'].pct_change()

# Attach the new columns, then key the rows by (country, day).
df = (
    df.assign(
        cum_cases_pct_change=cum_cases_pct_change,
        cum_deaths_pct_change=cum_deaths_pct_change,
    )
    .reset_index()
    .set_index(['country', 'day'])
)

df.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Australia,26,2020-01-26,1,2020,3,0,AU,24992369.0,4,0,3.0,


In [8]:
# A change measured from a zero total comes out as +inf; store it as missing instead.
for pct_col in ('cum_cases_pct_change', 'cum_deaths_pct_change'):
    df[pct_col] = df[pct_col].replace(np.inf, np.nan)

df.shape

(6738, 11)

## Calculate Log10 Values

In [9]:
# Move `country` back into the columns and key the rows by `day` alone,
# ready for the next per-country calculation.
df = df.reset_index()
df = df.set_index('day')
df.sample()

Unnamed: 0_level_0,country,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
30,Cases_on_an_international_conveyance_Japan,2020-01-30,1,2020,0,0,JPG11668,,0,0,,


In [10]:

# log10 of the cumulative totals. The operation is purely elementwise, so no
# per-country grouping is needed: non-positive totals are masked to NaN first
# (log10 is undefined there) and the rest is computed in one vectorised call.
cum_cases_log10 = np.log10(df['cum_cases'].where(df['cum_cases'] > 0))
cum_deaths_log10 = np.log10(df['cum_deaths'].where(df['cum_deaths'] > 0))

# Attach the new columns, then key the rows by (country, day).
df = (
    df.assign(
        cum_cases_log10=cum_cases_log10,
        cum_deaths_log10=cum_deaths_log10,
    )
    .reset_index()
    .set_index(['country', 'day'])
)

df.head()


Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Indonesia,0,2019-12-31,12,2019,0,0,ID,267663435.0,0,0,,,,
United_Arab_Emirates,0,2019-12-31,12,2019,0,0,AE,9630959.0,0,0,,,,
Austria,0,2019-12-31,12,2019,0,0,AT,8847037.0,0,0,,,,
Switzerland,0,2019-12-31,12,2019,0,0,CH,8516543.0,0,0,,,,
Nepal,0,2019-12-31,12,2019,0,0,NP,28087871.0,0,0,,,,


## Calculate Doubling Time 

In [11]:
# Flatten the index and key the rows by `day` again so the doubling-time
# step can group on the `country` column.
df = df.reset_index()
df = df.set_index('day')
df.sample()

Unnamed: 0_level_0,country,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
12,Austria,2020-01-12,1,2020,0,0,AT,8847037.0,0,0,,,,


In [12]:
def doubling_time(s, d=5):
    """Estimate the doubling time of a series from its growth over ``d`` rows.

    Parameters
    ----------
    s : pandas.Series
        Values to measure, one row per time step.
    d : int, default 5
        Number of rows to look back when measuring growth.

    Returns
    -------
    pandas.Series
        ``d * ln(2) / ln(s / s.shift(d))`` for every row. The first ``d`` rows
        are NaN because there is no earlier value to compare with. Zero or
        non-growing values get no special handling; the result follows NumPy's
        floating-point rules for division and logarithms.
    """
    growth_ratio = s / s.shift(d)
    return d * (np.log(2) / np.log(growth_ratio))

# Doubling time of the cumulative case count, computed separately for each country.
# The apply works on whole per-country frames so that the result is indexed by
# (country, day) and can be joined back on that key.
cum_cases_dt = df.groupby('country').apply(
    lambda country_rows: doubling_time(country_rows['cum_cases'])
)

# The joined Series is named `cum_cases`; the suffix turns it into `cum_cases_dt`.
df = (
    df.reset_index()
    .set_index(['country', 'day'])
    .join(cum_cases_dt, rsuffix='_dt')
)

df.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
United_Arab_Emirates,11,2020-01-11,1,2020,0,0,AE,9630959.0,0,0,,,,,


# Calculate Day Zero Days
To align the cases and deaths across different countries, we choose a starting day ("day zero") for each country: the day it first reaches 100 cumulative cases (for cases) or 10 cumulative deaths (for deaths).

In [13]:
# Back to a plain `day` index with `country` as a column before working out
# each country's day-zero offsets.
df = df.reset_index()
df = df.set_index('day')
df.sample()

Unnamed: 0_level_0,country,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
66,Taiwan,2020-03-06,3,2020,2,0,TW,,44,1,0.047619,0.0,1.643453,0.0,28.730778


In [14]:
def day_with_n_at_least_k(g, k):
    """Re-base the index of ``g`` on the first row whose value reaches ``k``.

    Parameters
    ----------
    g : pandas.Series
        Values indexed by day number.
    k : number
        Threshold that defines day zero.

    Returns
    -------
    pandas.Series
        Named ``'day'`` and indexed like ``g``: each entry is the row's index
        label minus the label of the first row with ``g >= k``. If ``g`` never
        reaches ``k`` every entry is NaN.
    """
    day_labels = g.index.values

    # The zero-day offset: label of the first row at or above the threshold.
    if g.max() >= k:
        zero_day = day_labels[(g >= k).values][0]
    else:
        zero_day = np.nan

    return pd.Series(day_labels - zero_day, name='day', index=day_labels)


# Thresholds that define "day zero": cumulative cases / cumulative deaths.
min_cases, min_deaths = 100, 10

# Per-country day offsets relative to the first day with >= min_cases cumulative cases.
# reset_index() yields three columns (country, original day label, offset), which are
# then named explicitly so the frame can be joined back on (country, day).
day_zero_for_cases = df.groupby('country').apply(
    lambda g: day_with_n_at_least_k(g['cum_cases'], min_cases)).reset_index()
day_zero_for_cases.columns = ['country', 'day', 'day_zero_cases']

# Same again for deaths, relative to the first day with >= min_deaths cumulative deaths.
day_zero_for_deaths = df.groupby('country').apply(
    lambda g: day_with_n_at_least_k(g['cum_deaths'], min_deaths)).reset_index()
day_zero_for_deaths.columns = ['country', 'day', 'day_zero_deaths']

# Parenthesised chain instead of backslash continuations: the previous version ended
# in a dangling `\` that only parsed because a blank line happened to follow it.
df = (
    df.reset_index()
    .set_index(['country', 'day'])
    .join(day_zero_for_cases.set_index(['country', 'day']))
    .join(day_zero_for_deaths.set_index(['country', 'day']))
)

df.sample()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt,day_zero_cases,day_zero_deaths
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Egypt,20,2020-01-20,1,2020,0,0,EG,98423595.0,0,0,,,,,,-56.0,-63.0


# Save Processed Datasets

In [15]:
# Flatten the index into ordinary columns so it is written as data;
# no separate index column is emitted.
df.reset_index().to_csv(path_or_buf=ecdc_dataset_processed, index=False)

df.shape


(6738, 16)

In [16]:
# Display the processed frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt,day_zero_cases,day_zero_deaths
country,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Indonesia,0,2019-12-31,12,2019,0,0,ID,267663435.0,0,0,,,,,,-76.0,-80.0
United_Arab_Emirates,0,2019-12-31,12,2019,0,0,AE,9630959.0,0,0,,,,,,-78.0,
Austria,0,2019-12-31,12,2019,0,0,AT,8847037.0,0,0,,,,,,-69.0,-83.0
Switzerland,0,2019-12-31,12,2019,0,0,CH,8516543.0,0,0,,,,,,-67.0,-75.0
Nepal,0,2019-12-31,12,2019,0,0,NP,28087871.0,0,0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
New_Caledonia,85,2020-03-25,3,2020,2,0,NC,284060.0,9,0,0.285714,,0.954243,,,,
Haiti,85,2020-03-25,3,2020,2,0,HT,11123176.0,7,0,0.400000,,0.845098,,2.766474,,
Cambodia,85,2020-03-25,3,2020,4,0,KH,16249798.0,91,0,0.045977,,1.959041,,2.600331,,
Rwanda,85,2020-03-25,3,2020,4,0,RW,12301939.0,40,0,0.111111,,1.602060,,2.684569,,


In [17]:
# Spot check: Spain's rows from five days before its deaths day zero onwards.
(
    df.xs('Spain', level='country', drop_level=False)
    .reset_index()
    .set_index('day_zero_deaths')
    .loc[-5:]
)

Unnamed: 0_level_0,country,day,date,month,year,cases,deaths,code,pop,cum_cases,cum_deaths,cum_cases_pct_change,cum_deaths_pct_change,cum_cases_log10,cum_deaths_log10,cum_cases_dt,day_zero_cases
day_zero_deaths,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
-5.0,Spain,65,2020-03-05,3,2020,49,1,ES,46723749.0,200,1,0.324503,,2.30103,0.0,1.955881,2.0
-4.0,Spain,66,2020-03-06,3,2020,61,2,ES,46723749.0,261,3,0.305,2.0,2.416641,0.477121,2.520781,3.0
-3.0,Spain,67,2020-03-07,3,2020,113,2,ES,46723749.0,374,5,0.43295,0.666667,2.572872,0.69897,2.302179,4.0
-2.0,Spain,68,2020-03-08,3,2020,56,0,ES,46723749.0,430,5,0.149733,0.0,2.633468,0.69897,2.610553,5.0
-1.0,Spain,69,2020-03-09,3,2020,159,0,ES,46723749.0,589,5,0.369767,0.0,2.770115,0.69897,2.546189,6.0
0.0,Spain,70,2020-03-10,3,2020,615,23,ES,46723749.0,1204,28,1.044143,4.6,3.080626,1.447158,1.930678,7.0
1.0,Spain,71,2020-03-11,3,2020,435,7,ES,46723749.0,1639,35,0.361296,0.25,3.214579,1.544068,1.886298,8.0
2.0,Spain,72,2020-03-12,3,2020,501,12,ES,46723749.0,2140,47,0.305674,0.342857,3.330414,1.672098,1.986886,9.0
3.0,Spain,73,2020-03-13,3,2020,864,37,ES,46723749.0,3004,84,0.403738,0.787234,3.4777,1.924279,1.782864,10.0
4.0,Spain,74,2020-03-14,3,2020,1227,37,ES,46723749.0,4231,121,0.408455,0.440476,3.626443,2.082785,1.75768,11.0
