In [24]:
from datetime import date

import pandas as pd
import numpy as np

from loguru import logger

%matplotlib inline

# Setup

In [35]:
# Params

# CSV read by the load step and CSV written by the save step.
input_dataset = '../data/processed/000_jh_dataset.csv'
output_dataset = '../data/processed/010_jh_dataset.csv'

# Count columns that are parsed as integers when the input is loaded.
num_cols = [
    'num_cases',
    'num_deaths',
    'num_recovered',
]

In [26]:
# Load dataset

# Every column listed in `num_cols` is parsed as an integer.
# NOTE(review): the displayed head contains an 'Unnamed: 0' column, which looks
# like a row index saved into the input file -- confirm whether it should be kept.
num_dtypes = dict.fromkeys(num_cols, int)

df = pd.read_csv(input_dataset, dtype=num_dtypes)
df.shape

(11375, 6)

In [27]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,country,date,day,num_cases,num_deaths,num_recovered
0,Afghanistan,1/22/20,0,0,0,0
1,Albania,1/22/20,0,0,0,0
2,Algeria,1/22/20,0,0,0,0
3,Andorra,1/22/20,0,0,0,0
4,Angola,1/22/20,0,0,0,0


# Add Additional Columns

## Cumulative Totals 

In [28]:
daily_nums_cols = df.filter(like='num_').columns

# Running per-country sum of every num_* column, renamed num_* -> total_*.
total_names = {col: col.replace('num_', 'total_') for col in daily_nums_cols}
country_totals = df.groupby(['country'])[daily_nums_cols].cumsum()
country_totals = country_totals.rename(columns=total_names)

# Append the total_* columns next to the existing ones.
df = pd.concat([df, country_totals], axis=1)

df.shape

(11375, 9)

## Log Totals

In [29]:
daily_totals_cols = df.filter(like='total_').columns

# log10 of every total_* column, renamed total_* -> log10_total_*.
# Values that are not strictly positive have no logarithm: they are masked to
# NaN first, so the log is taken in one vectorised call instead of a Python
# lambda per cell (DataFrame.applymap is deprecated in recent pandas).
totals = df[daily_totals_cols]
log_totals = np.log10(totals.where(totals > 0)).rename(
    columns={col: col.replace('total_', 'log10_total_') for col in daily_totals_cols})

df = pd.concat([df, log_totals], axis=1)

df.shape

(11375, 12)

## Doubling Times

In [30]:
dt_interval = 5


def doubling_time(s, d=dt_interval):
    """Estimate the doubling time of `s` from its growth over `d` rows.

    Computes d * ln(2) / ln(s / s.shift(d)), element-wise.

    Parameters
    ----------
    s : pandas Series or DataFrame of values, in row order.
    d : number of rows to look back (defaults to `dt_interval`).

    Returns
    -------
    Object shaped like `s`. The first `d` rows are NaN because they have no
    value `d` rows earlier. A ratio of exactly 1 makes the denominator 0 and
    yields inf.
    """
    # NOTE(review): when the earlier value is 0 and the current one is positive
    # the ratio is inf, ln(inf) is inf and the result is 0 -- confirm intended.
    growth = s / s.shift(d)
    return d * (np.log(2) / np.log(growth))

# Per-country doubling time of every total_* column, renamed total_* -> dt_total_*.
# `transform` always returns a frame indexed like `df`, so the axis=1 concat
# below lines up row-for-row; `apply` only does so when pandas does not prepend
# the group keys to the result index.
dt_totals = (
    df.groupby('country')[daily_totals_cols]
    .transform(doubling_time)
    .rename(columns={col: col.replace('total_', 'dt_total_') for col in daily_totals_cols})
    # No growth over the interval gives ln(1) = 0 in the denominator, i.e. +inf:
    # store it as missing instead.
    .replace(np.inf, np.nan)
)

df = pd.concat([df, dt_totals], axis=1)



df.shape

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


(11375, 15)

In [31]:
# Preview the frame after the total_*, log10_total_* and dt_total_* columns were appended.
df.head()

Unnamed: 0,country,date,day,num_cases,num_deaths,num_recovered,total_cases,total_deaths,total_recovered,log10_total_cases,log10_total_deaths,log10_total_recovered,dt_total_cases,dt_total_deaths,dt_total_recovered
0,Afghanistan,1/22/20,0,0,0,0,0,0,0,,,,,,
1,Albania,1/22/20,0,0,0,0,0,0,0,,,,,,
2,Algeria,1/22/20,0,0,0,0,0,0,0,,,,,,
3,Andorra,1/22/20,0,0,0,0,0,0,0,,,,,,
4,Angola,1/22/20,0,0,0,0,0,0,0,,,,,,


## Day-Zero for Cases & Deaths

In [32]:
# Thresholds that define "day zero" for cases and for deaths.
min_cases = 100
min_deaths = 10


def day_with_n_at_least_k(g, col, k):
    """Return the first `day` in `g` on which `g[col]` is at least `k`.

    Parameters
    ----------
    g : DataFrame with a 'day' column and the column named by `col`.
    col : name of the column compared against the threshold.
    k : threshold value.

    Returns
    -------
    The 'day' value of the first row (in row order) where `g[col] >= k`,
    or NaN when no row reaches the threshold.
    """
    qualifying_days = g.loc[g[col] >= k, 'day']
    if qualifying_days.empty:
        return np.nan
    return qualifying_days.iloc[0]


In [33]:
# For each country, the first `day` on which total_cases reaches `min_cases`
# (NaN for countries that never reach it).
day_zero_cases = pd.DataFrame(df.groupby('country').apply(
    lambda g: day_with_n_at_least_k(g, 'total_cases', min_cases)), columns=['day_zero_cases'])

# Same for deaths, using the deaths threshold `min_deaths`. Passing `min_cases`
# here would silently apply the cases threshold to deaths.
day_zero_deaths = pd.DataFrame(df.groupby('country').apply(
    lambda g: day_with_n_at_least_k(g, 'total_deaths', min_deaths)), columns=['day_zero_deaths'])

# Attach each country's day-zero values to all of its rows.
df = df.set_index('country').join(day_zero_cases).join(day_zero_deaths).reset_index()

# Re-express both columns as days relative to day zero: negative before the
# threshold was reached, 0 on the day itself, positive afterwards.
df['day_zero_cases'] = df['day']-df['day_zero_cases']
df['day_zero_deaths'] = df['day']-df['day_zero_deaths']


df.head()

Unnamed: 0,country,date,day,num_cases,num_deaths,num_recovered,total_cases,total_deaths,total_recovered,log10_total_cases,log10_total_deaths,log10_total_recovered,dt_total_cases,dt_total_deaths,dt_total_recovered,day_zero_cases,day_zero_deaths
0,Afghanistan,1/22/20,0,0,0,0,0,0,0,,,,,,,,
1,Afghanistan,1/23/20,1,0,0,0,0,0,0,,,,,,,,
2,Afghanistan,1/24/20,2,0,0,0,0,0,0,,,,,,,,
3,Afghanistan,1/25/20,3,0,0,0,0,0,0,,,,,,,,
4,Afghanistan,1/26/20,4,0,0,0,0,0,0,,,,,,,,


In [34]:
# Spot check: Ireland's total_cases from one day before day zero onwards.
(
    df.set_index('country')
    .loc['Ireland']
    .reset_index()
    .set_index('day_zero_cases')
    .loc[-1:]
    ['total_cases']
)

day_zero_cases
-1.0       90
 0.0      129
 1.0      129
 2.0      169
 3.0      223
 4.0      292
 5.0      557
 6.0      683
 7.0      785
 8.0      906
 9.0     1125
 10.0    1329
 11.0    1564
 12.0    1819
Name: total_cases, dtype: int64

# Save 

In [36]:
# Write the enriched frame to `output_dataset` without the row index, then show its final shape.
df.to_csv(output_dataset, index=False)
df.shape

(11375, 17)