# Downloading, reading and cleaning global COVID-19 data

This notebook implements a function that downloads the 2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE, which can be found here: https://github.com/CSSEGISandData/COVID-19.
It also reads the files, cleans the data and returns a suitable pandas dataframe that indexes the numbers by countries and dates. 

Written by Bruno Abreu, April 2020
as part of the Racionalidade Cientifica initiative
https://sites.google.com/view/racionalidadecientifica/home

Code is public, and the author appreciates proper credit
if it is used somewhere else.

In [6]:
import sys
import os
import pandas as pd
from datetime import datetime 

In [7]:
def fix_countries_names(countries):
    """
    Normalize the spelling of country names coming from the daily reports.

    The same country shows up under different spellings depending on the day of the report.
    As only country-level numbers matter here, the known variants are patched by hand through
    a small table of substring substitutions. Extend the table when new variants show up.

    Takes an iterable of name strings and returns a new list, of the same length and in the
    same order, with the patched names.
    """
    # (old, new) substring pairs; every pair is applied, in this order, to every name
    substitutions = (
        ('Mainland ', ''),
        ('United Kingdom', 'UK'),
        (' Azerbaijan', 'Azerbaijan'),
        ('Viet Nam', 'Vietnam'),
    )

    def _patch(name):
        for old, new in substitutions:
            name = name.replace(old, new)
        return name

    return [_patch(name) for name in countries]

In [8]:
def _read_daily_report(path):
    """
    Read one daily report file and return its totals per country.

    The result is a dataframe indexed by the (fixed) country name with the columns
    'Confirmed', 'Deaths' and 'Recovered', each summed over all rows of that country.
    """
    df_temp = pd.read_csv(path)
    # little workaround cause they changed columns names ('Country/Region' -> 'Country_Region')
    df_temp.columns = [col.replace('/', '_') for col in df_temp.columns]
    df_temp['Country_Region'] = fix_countries_names(list(df_temp['Country_Region']))
    # only sum the columns we keep, so text columns never take part in the aggregation
    return df_temp.groupby('Country_Region')[['Confirmed', 'Deaths', 'Recovered']].sum()


def get_global_data():
    """
    This function does the whole magic of updating your local repository, reading the files and cleaning
    the dataframe. It was intended to be used in association with data analysis techniques.

    Returns a pandas.DataFrame indexed by a (Country, Date) MultiIndex holding every combination of
    country and report date, with the columns 'Confirmed', 'Deaths' and 'Recovered'. Combinations
    that do not appear in the reports are filled with zeros.

    Side effects: runs `git pull` inside the local repository and leaves the current working
    directory set to the folder with the daily reports.
    """
    ## DOWNLOAD/UPDATE

    # this is the repository that you created when you first cloned from JHU's covid-19 github
    # (made absolute so that the second chdir below also works for a relative path)
    repo_path = os.path.abspath("add_path_to_your_repo_here")
    pull = "git pull https://github.com/CSSEGISandData/COVID-19.git"
    os.chdir(repo_path)      # changes to your local repository
    if os.system(pull) != 0: # pulls it from github
        # keep going with whatever is available locally, but do not hide the failure
        print("git pull failed, using the local copy of the repository as it is", file=sys.stderr)


    ## READ

    dd_path = os.path.join(repo_path, "csse_covid_19_data", "csse_covid_19_daily_reports")
    os.chdir(dd_path)         # change to the folder with the files that we want

    # pair every csv file with the date encoded in its name (MM-DD-YYYY.csv); anything else
    # in the folder (README.md, .gitignore, ...) is ignored
    reports = []
    for filename in os.listdir(dd_path):
        stem, extension = os.path.splitext(filename)
        if extension == '.csv':
            reports.append((datetime.strptime(stem, '%m-%d-%Y'), filename))
    # sort by date: sorting the MM-DD-YYYY names as text goes out of order across years
    reports.sort()

    # read every file once, keeping its date right next to its per-country totals
    daily_totals = [(date, _read_daily_report(filename)) for date, filename in reports]

    # all the countries that show up in at least one file, after fixing their names
    ind1 = sorted(set().union(*(totals.index for _, totals in daily_totals)))
    ind2 = [date for date, _ in daily_totals]

    # double index with product country*date
    double_ind = pd.MultiIndex.from_product([ind1, ind2], names=['Country', 'Date'])
    # create double indexed dataframe filled with zeros
    df_dd = pd.DataFrame(0.0, index=double_ind, columns=['Confirmed', 'Deaths', 'Recovered'])

    # write the totals of each day into the rows (country, date) of the big dataframe
    for date, totals in daily_totals:
        totals.index = pd.MultiIndex.from_product([list(totals.index), [date]])
        df_dd.loc[totals.index] = totals

    return df_dd

In [9]:
# update the local repository and build the dataframe indexed by country and date
df = get_global_data()
# display 10 randomly sampled rows of the result
df.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered
Country,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tunisia,2020-02-15,0.0,0.0,0.0
Oman,2020-01-24,0.0,0.0,0.0
Lebanon,2020-02-20,0.0,0.0,0.0
Italy,2020-04-01,110574.0,13155.0,16847.0
Saint Kitts and Nevis,2020-03-10,0.0,0.0,0.0
Maldives,2020-03-20,13.0,0.0,0.0
Tunisia,2020-02-23,0.0,0.0,0.0
France,2020-03-14,4480.0,91.0,12.0
Bolivia,2020-03-19,12.0,0.0,0.0
Laos,2020-03-12,0.0,0.0,0.0
