# SUMMARY 
In this notebook we prep the Johns Hopkins COVID-19 datasets for use. These datasets start on Jan 22 and include recoveries as well as cases and deaths. Each dataset is converted into a common intermediate format so that data from different sources can share one representation, for ease of future processing.

In [1]:
from datetime import date, datetime
from pathlib import Path

import numpy as np
import pandas as pd


# Setup

In [2]:
# URL template for the Johns Hopkins CSSE global time-series CSVs; the `{}`
# placeholder is filled in with the series name via str.format.
raw_jh_datasets = (
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
    'csse_covid_19_data/csse_covid_19_time_series/'
    'time_series_covid19_{}_global.csv'
)
# Path of the processed CSV, relative to the notebook's working directory.
processed_jh_datasets = '../data/processed/jh_dataset.csv'

# Download the Raw Datasets
Instead of pulling the entire GitHub repo we just grab the files we need; these are the raw datafiles.

In [3]:
# Fetch the three raw series in one pass; each name fills the `{}` slot of the
# URL template.
confirmed, deaths, recovered = (
    pd.read_csv(raw_jh_datasets.format(series))
    for series in ('confirmed', 'deaths', 'recovered')
)

confirmed.shape, deaths.shape, recovered.shape


((248, 69), (248, 69), (234, 69))

# Reformat the datasets into a 'tidy' format and combine.

In [4]:
def reformat_dataset(df, label):
    """Convert a wide time-series frame into a tidy, per-country frame.

    The input is expected to hold a ``Country/Region`` column plus one column
    per date whose header starts with a digit and parses as ``%m/%d/%y``
    (e.g. ``1/22/20``). Rows sharing a country are summed together and the
    date columns are melted into rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with a ``Country/Region`` column and date-named columns.
    label : str
        Name given to the melted value column (e.g. ``'cases'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``date`` (the original header string), ``label``
        and ``day`` (whole days elapsed since the earliest date in ``df``).
    """
    df = df.rename(columns={'Country/Region': 'country', 'Province/State': 'province'})

    # Keep only the country key and the date columns (headers starting with a
    # digit); every other column, such as province, is dropped before the
    # per-country sum. Raw string avoids the invalid-escape warning for `\d`.
    df_by_country = df.filter(regex=r'^\d|country').groupby('country').sum().reset_index()

    tidy_df = df_by_country.melt(id_vars=['country'], var_name='date', value_name=label)

    # Vectorised parse; the 'date' column itself is left as the original string.
    dts = pd.to_datetime(tidy_df['date'], format='%m/%d/%y')
    tidy_df['day'] = (dts - dts.min()).dt.days

    return tidy_df

# Tidy each series and index it by (country, date, day) so the three frames
# can be joined on that shared key.
tidy_cases, tidy_deaths, tidy_recovered = (
    reformat_dataset(wide_frame, value_label).set_index(['country', 'date', 'day'])
    for wide_frame, value_label in (
        (confirmed, 'cases'),
        (deaths, 'deaths'),
        (recovered, 'recovered'),
    )
)

tidy_cases.shape, tidy_deaths.shape, tidy_recovered.shape

((11375, 1), (11375, 1), (11375, 1))

In [5]:
# Index-aligned joins: cases is the left frame, deaths and recovered are
# attached as extra columns, then the index is flattened back into columns.
tidy_df = (
    tidy_cases
    .join(tidy_deaths)
    .join(tidy_recovered)
    .reset_index()
)
tidy_df.head()

Unnamed: 0,country,date,day,cases,deaths,recovered
0,Afghanistan,1/22/20,0,0,0,0
1,Albania,1/22/20,0,0,0,0
2,Algeria,1/22/20,0,0,0,0
3,Andorra,1/22/20,0,0,0,0
4,Angola,1/22/20,0,0,0,0


In [6]:
# Sanity check: the US rows, re-indexed by day number.
(
    tidy_df
    .set_index('country')
    .loc['US']
    .set_index('day')
    .loc[:, ['cases', 'deaths', 'recovered']]
)

Unnamed: 0_level_0,cases,deaths,recovered
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,0,0
1,1,0,0
2,2,0,0
3,2,0,0
4,5,0,0
...,...,...,...
60,33276,417,178
61,43847,557,178
62,53740,706,348
63,65778,942,361


# Save the processed data

In [7]:
# Create the output directory first so that the write does not fail when the
# target folder is missing (e.g. on a fresh checkout).
Path(processed_jh_datasets).parent.mkdir(parents=True, exist_ok=True)
tidy_df.to_csv(processed_jh_datasets, index=False)
tidy_df.shape

(11375, 6)