# This notebook scrapes the Wikipedia page with daily updates on new cases of COVID-19

In [84]:
import pandas as pd
import numpy as np
import datetime as dt
# Accumulator for the cleaned per-table DataFrames appended by the scraping loop below.
df_list = []

## Loop through the four tables and clean

In [85]:
# Source page: Wikipedia template holding the WHO situation-report tables.
WHO_REPORTS_URL = 'https://en.wikipedia.org/wiki/Template:2019%E2%80%9320_coronavirus_outbreak_data/WHO_situation_reports'

# Download and parse the page once; read_html returns every table on the page,
# so there is no need to re-fetch it for each table index.
who_tables = pd.read_html(WHO_REPORTS_URL)

# Start from an empty list so re-running this cell does not append duplicates.
df_list = []

# Tables at positions 3..6 of the page are the four tables of interest.
for table in range(3, 7):
    df = who_tables[table]
    # Drop the rows labelled 1..5 (presumably extra header/summary rows -- TODO confirm
    # against the live page); drop() returns a new frame, so who_tables is untouched.
    df = df.drop(range(1, 6))
    if table == 3:
        # The first of the four tables carries two extra trailing columns; remove them.
        df = df.drop([df.columns[-1], df.columns[-2]], axis='columns')
    # Promote the first remaining row to be the column header...
    df.columns = df.iloc[0]
    # ...then discard that header row and the last row of the table.
    df = df.tail(-1).head(-1)
    df_list.append(df)

## Concatenate the DataFrames and reshape to long format

In [86]:
# Melt each wide table (one column per report date) into long form and stack the results.
df_tot = pd.concat([pd.melt(df, id_vars=['Date','First reported case']) for df in df_list])

# The id column headed 'Date' actually holds the country names, while the melted
# header column is named 0 (melt falls back to the name of the columns index, which
# here appears to come from the promoted header row). Rename both, plus the value column.
df_tot = df_tot.rename(columns={'Date':'Country',0:'Date', 'value':'Cases'})

# Strip footnote markers such as "[1]" or "[12]" from the counts; missing cells count as 0.
# A raw string avoids invalid-escape warnings, and \d+ also covers multi-digit markers.
df_tot['Cases'] = df_tot['Cases'].replace(to_replace=r'\[\d+\]', value='', regex=True).fillna(0)

df_tot['Cases'] = pd.to_numeric(df_tot['Cases'])

df_tot['First reported case'] = df_tot['First reported case'].astype('datetime64[ns]')

# The report-date headers carry no year; append the year so they parse as full dates.
df_tot['Date'] = df_tot['Date'] + '-2020'

df_tot['Date'] = df_tot['Date'].astype('datetime64[ns]')

# Whole days between the report date and the country's first reported case
# (negative when the report predates the first case).
df_tot['Days from start'] = (df_tot['Date'] - df_tot['First reported case']).dt.days

In [87]:
# Display the final long-format frame (one row per country and report date) as the cell output.
df_tot

Unnamed: 0,Country,First reported case,Date,Cases,Days from start
0,China,2020-01-21,2020-01-21,278,0
1,Japan,2020-01-21,2020-01-21,1,0
2,Thailand,2020-01-21,2020-01-21,2,0
3,Singapore,2020-01-23,2020-01-21,0,-2
4,Hong Kong,2020-01-23,2020-01-21,0,-2
...,...,...,...,...,...
931,Serbia,2020-03-06,2020-03-09,1,3
932,Colombia,2020-03-07,2020-03-09,1,2
933,Togo,2020-03-07,2020-03-09,1,2
934,Vatican City,2020-03-07,2020-03-09,1,2


## Output to csv in current directory

In [88]:
# Save the combined frame as a semicolon-delimited CSV, leaving out the index column.
df_tot.to_csv(
    path_or_buf=r'covid19.csv',
    sep=';',
    index=False,
)