# SUMMARY 
This notebook is responsible for the basic data collection and preparation. The dataset we use comes from the ECDC and provides day-by-day accounts of cases and deaths for each affected country. This data file is updated each dat and so needs to be downloaded to keep it up to date.

We do some basic data processing to convert daily numbers into cumulative numbers etc. The processed dataframe is then saved to file for later use.

In [23]:
from datetime import date

import pandas as pd
import numpy as np


# Setup 

In [24]:
# Params

# The download file template (encodes the current day's date)
raw_ecdc_dataset = 'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide{}.xlsx'

# Save the processed data to here.
processed_ecdc_dataset = '../data/processed/ecdc_dataset.csv'

# Download the Raw Dataset

Using the dataset from https://www.ecdc.europa.eu/en/novel-coronavirus-china which dates back to 31/12/2019 and includes useful additional information such as common country codes and population information. 


In [25]:
# Create the filename for tpday's dataset and download it.
todays_date = "-{}-{:02}-{:02}".format(date.today().year, date.today().month, date.today().day-1)

raw_ecdc_dataset.format(todays_date)

'https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide-2020-03-26.xlsx'

In [26]:
df = pd.read_excel(raw_ecdc_dataset.format(todays_date))
df.shape

(6931, 10)

In [27]:
df.head()

Unnamed: 0,DateRep,Day,Month,Year,Cases,Deaths,Countries and territories,GeoId,Country Code,Pop_Data.2018
0,2020-03-26,26,3,2020,33,0,Afghanistan,AF,AFG,37172386.0
1,2020-03-25,25,3,2020,2,0,Afghanistan,AF,AFG,37172386.0
2,2020-03-24,24,3,2020,6,1,Afghanistan,AF,AFG,37172386.0
3,2020-03-23,23,3,2020,10,0,Afghanistan,AF,AFG,37172386.0
4,2020-03-22,22,3,2020,0,0,Afghanistan,AF,AFG,37172386.0


# Reformat the datasets into a 'tidy' format.

In [30]:
def reformat_ecdc_dataset(df):
    
    df.columns = ['date', 'day', 'month', 'year', 'num_cases', 'num_deaths', 'country', 'id', 'code', 'pop']
    
    # Add a number of days since records began column.
    df['day'] = (df['date'] - df['date'].min()).map(lambda d: d.days)

    # Sort by day, oldest first
    df = df.sort_values(by='day', ascending=True)[['date', 'day', 'country', 'num_cases', 'num_deaths', 'id', 'code', 'pop']]

    return df

In [31]:
df = reformat_ecdc_dataset(df)
df.head()

Unnamed: 0,date,day,country,num_cases,num_deaths,id,code,pop
4621,2019-12-31,0,New_Zealand,0,0,NZ,NZL,4885500.0
5198,2019-12-31,0,Philippines,0,0,PH,PHL,106651900.0
3278,2019-12-31,0,Ireland,0,0,IE,IRL,4853506.0
2941,2019-12-31,0,India,0,0,IN,IND,1352617000.0
5607,2019-12-31,0,San_Marino,0,0,SM,SMR,33785.0


In [33]:
# A quick check
df.set_index('country').loc['Ireland'].set_index('day')[['num_cases', 'num_deaths']]

Unnamed: 0_level_0,num_cases,num_deaths
day,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


# Save Processed Datasets

In [34]:
df.to_csv(processed_ecdc_dataset, index=False)

df.shape


(6931, 8)