# This Notebook scrapes Wikipedia for daily updates on new cases of Covid-19

https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_outbreak_data/WHO_situation_reports

In [None]:
import pandas as pd
import numpy as np
import datetime as dt
# Accumulator for the cleaned per-table DataFrames; the scraping loop appends to it.
df_list = []

## Loop through the five tables (indices 1–5) and clean

In [None]:
# Download and parse the page once. The original called pd.read_html inside
# the loop, which re-fetched and re-parsed the same page on every iteration.
who_tables = pd.read_html('https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_outbreak_data/WHO_situation_reports')

# Start from an empty list so that re-running this cell does not append the
# same tables a second time.
df_list = []

for table in range(1, 6):
    df = who_tables[table]
    # Drop the rows with index labels 0-4
    # (presumably header/summary rows -- confirm against the live page).
    df = df.drop(range(0, 5))
    if table == 1:
        # The first table additionally loses its last two columns and last two rows.
        df = df.drop([df.columns[-1], df.columns[-2]], axis='columns')
        df = df.head(-2)
    else:
        # Every other table only loses its last row.
        df = df.head(-1)
    df_list.append(df)

## Concat dataframes and shape

In [None]:
# Reshape every table from wide to long and stack them. pd.melt keeps the
# id_vars columns and moves the remaining column headers into 'variable'
# and the cell contents into 'value'.
df_tot = pd.concat([pd.melt(df, id_vars=['Date','First reported case']) for df in df_list])

# The scraped column labelled 'Date' is renamed to 'Country'; the melted
# column headers become the new 'Date' and the cell values become 'Cases'.
df_tot.rename(columns={'Date':'Country','variable':'Date', 'value':'Cases'}, inplace=True)

# Strip bracketed numeric markers such as "[12]" and treat missing cells as 0.
# Raw string: '\[' and '\d' are not valid Python string escapes, so the
# non-raw literal triggers an invalid-escape warning on recent Python versions.
df_tot['Cases'] = df_tot['Cases'].replace(to_replace=r'\[\d+\]', value='', regex=True).fillna(0)

df_tot['Cases'] = pd.to_numeric(df_tot['Cases'])

df_tot['First reported case'] = df_tot['First reported case'].astype('datetime64[ns]')

# '-2020' is appended to each date label before it is parsed as a datetime
# (presumably the column headers carry no year -- confirm against the page).
df_tot['Date'] = df_tot['Date'] + '-2020'

df_tot['Date'] = df_tot['Date'].astype('datetime64[ns]')

# Whole days between the country's first reported case and the report date.
df_tot['Days from start'] = (df_tot['Date'] - df_tot['First reported case']).dt.days

#view output
df_tot

## Load Country Populations from Wikipedia
### https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)

In [None]:
# Parse all tables on the UN population page and keep the one at index 3.
population_tables = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)')
df_pop = population_tables[3]

In [None]:
# Strip bracketed single-lowercase-letter markers such as "[a]" from the
# names, then replace missing names with 0.
# Raw string: '\[' and '\]' are not valid Python string escapes, so the
# non-raw literal triggers an invalid-escape warning on recent Python versions.
df_pop['Country or area'] = df_pop['Country or area'].replace(to_replace=r'\[[a-z]\]', value='', regex=True).fillna(0)

In [None]:
# Population-table name -> name used in the case data.
cty_dict = {
    'United States': 'USA',
    'United Arab Emirates': 'UAE',
    'Macau': 'Macao',
    'Czech Republic': 'Czechia',
    'United Kingdom': 'UK',
    'Guernsey and Jersey': 'Jersey',
    'DR Congo': 'DRC',
    'Ivory Coast': "Cote d'Ivoire",
    'Réunion': 'Reunion',
}

# Translate the names that have a remapping; every other value is kept as-is.
df_pop['Country or area'] = df_pop['Country or area'].map(
    lambda name: cty_dict.get(name, name)
)

In [None]:
# view the population table after the country-name remapping
df_pop

## Join to Virus Data

In [None]:
# Inner join: keep only the case rows whose country has a population entry.
df_tot_pop = df_tot.merge(
    df_pop,
    how='inner',
    left_on='Country',
    right_on='Country or area',
)

In [None]:
# view the joined case + population data
df_tot_pop

## Find unmatched countries here (add to mapping)
Currently, only Saint Barthélemy and Saint Martin do not map, as they lack population data.

In [None]:
# Left join: every case row is kept; rows without a population match get
# nulls in the population columns.
df_missing_country = df_tot.merge(
    df_pop,
    how='left',
    left_on='Country',
    right_on='Country or area',
)

In [None]:
pd.set_option('display.max_rows', 500)

# Rows whose 'Country or area' is null found no match in the population table.
unmatched_rows = df_missing_country[df_missing_country['Country or area'].isnull()]

# Call unique(): the original referenced the method without parentheses, so
# the cell displayed the bound method instead of the distinct country names.
unmatched_rows['Country'].unique()

## Output to csv in current directory

In [None]:
# Write the joined data as a semicolon-separated file without the index column.
df_tot_pop.to_csv(
    'covid19.csv',
    index=False,
    sep=';',
)