# SUMMARY 
In this notebook we prep the Johns Hopkins COVID-19 datasets for use. These datasets start on Jan 22 and include recovered counts as well as cases and deaths. Each dataset is converted into a common intermediate format, so that different data sources can share one representation for ease of future processing.

In [1]:
from datetime import date, datetime
from pathlib import Path

import pandas as pd
import numpy as np

from loguru import logger


# Setup

In [2]:
# Params

# URL template for the Johns Hopkins CSSE global time-series CSVs. The `{}`
# placeholder is filled with the series name through `str.format`.
raw_jh_datasets = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_{}_global.csv'
# Relative path the combined tidy dataset is written to.
processed_jh_datasets = '../data/processed/jh_dataset.csv'

# Download the Raw Datasets
Instead of pulling the entire GitHub repo, we just grab the files we need; these are the raw data files.

In [3]:
logger.info('Loading raw datasets.')

# One download per series; the series name fills the URL template's placeholder.
confirmed, deaths, recovered = (
    pd.read_csv(raw_jh_datasets.format(series_name))
    for series_name in ('confirmed', 'deaths', 'recovered')
)

(confirmed.shape, deaths.shape, recovered.shape)


2020-03-27 15:45:42.990 | INFO     | __main__:<module>:1 - Loading raw datasets.


((248, 69), (248, 69), (234, 69))

# Reformat the datasets into a 'tidy' format and combine.

In [4]:
# Log the start of the reshape-and-combine stage.
logger.info('Reformating and combining datasets.')

def reformat_dataset(df, label):
    """Convert a wide, cumulative JH time-series frame into tidy daily counts.

    The input has one row per province/country and one column per date, with
    the date columns named like ``1/22/20`` and holding cumulative totals.
    Rows are summed per country, reshaped to one row per (country, date), and
    the cumulative totals are replaced with day-over-day increments.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with a ``Country/Region`` column and ``m/d/yy`` date columns.
        Other columns (province, coordinates) are ignored.
    label : str
        Name of the value column in the result, e.g. ``'num_cases'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``date`` (the original column label, as a string),
        ``label`` (new counts for that day, as floats) and ``day`` (whole days
        since the earliest date in the frame).
    """
    df = df.rename(columns={'Country/Region': 'country', 'Province/State': 'province'})

    # Keep the country key plus the date columns (the only ones whose names
    # start with a digit), then collapse provinces into one row per country.
    df_by_country = df.filter(regex=r'^\d|country').groupby('country').sum().reset_index()

    tidy_df = df_by_country.melt(id_vars=['country'], var_name='date', value_name=label)

    dts = pd.to_datetime(tidy_df['date'], format='%m/%d/%y')
    tidy_df['day'] = (dts - dts.min()).dt.days

    # The JH data is cumulative. Replace with diffs. This relies on the date
    # columns being in chronological order, which `melt` preserves per country.
    cumulative = tidy_df[label]
    daily = cumulative.groupby(tidy_df['country']).diff()
    # The first date has no predecessor, so its diff is NaN. Its increment is
    # the cumulative value itself; filling with 0 would drop counts that
    # already existed on the first day of the series.
    tidy_df[label] = daily.fillna(cumulative)

    return tidy_df

# Reshape each raw series the same way and index it by (country, date, day).
tidy_cases, tidy_deaths, tidy_recovered = (
    reformat_dataset(raw_frame, value_label).set_index(['country', 'date', 'day'])
    for raw_frame, value_label in (
        (confirmed, 'num_cases'),
        (deaths, 'num_deaths'),
        (recovered, 'num_recovered'),
    )
)

(tidy_cases.shape, tidy_deaths.shape, tidy_recovered.shape)

2020-03-27 15:45:43.231 | INFO     | __main__:<module>:1 - Reformating and combining datasets.


((11375, 1), (11375, 1), (11375, 1))

In [5]:
# Preview the first rows of the tidy cases frame (indexed by country, date, day).
tidy_cases.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_cases
country,date,day,Unnamed: 3_level_1
Afghanistan,1/22/20,0,0.0
Albania,1/22/20,0,0.0
Algeria,1/22/20,0,0.0
Andorra,1/22/20,0,0.0
Angola,1/22/20,0,0.0


In [6]:
# Combine the three series on their shared (country, date, day) index, then
# turn the index levels back into ordinary columns.
# NOTE: `DataFrame.join` is a left join by default, so the rows are driven by
# `tidy_cases`; index entries missing from deaths/recovered would become NaN.
tidy_df = tidy_cases.join(tidy_deaths).join(tidy_recovered).reset_index()
tidy_df.head()

Unnamed: 0,country,date,day,num_cases,num_deaths,num_recovered
0,Afghanistan,1/22/20,0,0.0,0.0,0.0
1,Albania,1/22/20,0,0.0,0.0,0.0
2,Algeria,1/22/20,0,0.0,0.0,0.0
3,Andorra,1/22/20,0,0.0,0.0,0.0
4,Angola,1/22/20,0,0.0,0.0,0.0


In [7]:
# A quick check: eyeball one country's daily series, indexed by day number.
tidy_df.set_index('country').loc['Ireland'].set_index('day')[['num_cases', 'num_deaths', 'num_recovered']]

Unnamed: 0_level_0,num_cases,num_deaths,num_recovered
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0
5,0.0,0.0,0.0
6,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0


# Save the processed data

In [8]:
logger.info('Saving processed dataset to %s' % processed_jh_datasets)

# Create the output directory first: `to_csv` does not create missing parent
# directories, so on a fresh checkout the write would otherwise fail.
Path(processed_jh_datasets).parent.mkdir(parents=True, exist_ok=True)

# `index=False` leaves the default integer row index out of the file.
tidy_df.to_csv(processed_jh_datasets, index=False)
tidy_df.shape

2020-03-27 15:45:44.153 | INFO     | __main__:<module>:1 - Saving processed dataset to ../data/processed/jh_dataset.csv


(11375, 6)

In [9]:
# Mark the end of the notebook run in the log.
logger.info('Fin')



2020-03-27 15:45:44.240 | INFO     | __main__:<module>:1 - Fin
