In [82]:
from selenium import webdriver
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import numpy as np

# Analysis of Covid-19 Development via Worldometer Data

## Utils

In [94]:
def get_day_string(now=None):
    """Return a date tag of the form ``<day>_<month>_<year>``, e.g. ``28_5_2020``.

    Day and month are not zero-padded. The tag is used to build the
    per-day CSV file names in this notebook.

    Parameters
    ----------
    now : datetime.datetime, optional
        Point in time to format. Defaults to the current local time.

    Returns
    -------
    str
        The formatted date tag.
    """
    # Take a single timestamp so day, month and year always belong to the
    # same instant (two separate now() calls can straddle midnight).
    if now is None:
        now = datetime.now()
    # Use the real year instead of a hard-coded "2020" so that runs in later
    # years do not produce mislabelled (and colliding) file names.
    return "{}_{}_{}".format(now.day, now.month, now.year)


x = get_day_string()
print("Today is {}".format(x))

Today is 28_5_2020


## Getting raw data from Worldometer

In [91]:
# Open the Worldometer page in Chrome and capture the rendered text of the
# first <tbody> of the "main_table_countries_today" table. Only `txt` is
# needed by the processing cell below, which treats each line as one table row.
driver = webdriver.Chrome()
try:
    driver.get("https://www.worldometers.info/coronavirus/#countries")
    # find_element("xpath", ...) replaces the deprecated find_element_by_xpath
    # helper (removed in current Selenium 4 releases); the "xpath" locator
    # string is the value of By.XPATH.
    table = driver.find_element("xpath", '//*[@id="main_table_countries_today"]/tbody[1]')
    # Read the text while the browser session is still alive.
    txt = table.text
finally:
    # Always close the browser, even if loading the page or locating the
    # table fails; otherwise every run leaves a Chrome process behind.
    driver.quit()

## Process data into dataframe

In [92]:
# Split the scraped table text (`txt`, from the scraping cell) into one token
# list per line. The first space-separated token of every line is dropped
# (presumably the row number, or the "World" label on the first line -- confirm
# against the live page).
raw_countries = [line.split(" ")[1:] for line in txt.split("\n")]

# The first line is treated as the world total; after dropping its first token
# it has no name left, so it is labelled "world" explicitly.
world = raw_countries[0]
raw_countries[0] = ["world"] + world

# Re-join country names that were split on spaces and drop the "+..." tokens
# ("new cases" / "new deaths").
countries = []
for row in raw_countries:
    if not row:
        # Blank line in the scraped text: nothing to parse.
        continue
    # A name may span several tokens. Keep merging the second token into the
    # name (at most 4 times) until the second token starts with a digit,
    # i.e. until the first numeric column is reached.
    for _ in range(4):
        if len(row) < 2 or not row[1] or row[1][0].isdigit():
            break
        row[0] = row[0] + " " + row[1]
        del row[1]

    countries.append([elem for elem in row if not elem.startswith("+")])

# NOTE(review): the frame has 11 columns but the threshold here is 10, so rows
# with exactly one missing value are not counted -- confirm this is intended.
weird_countries = [c[0] for c in countries if len(c) < 10]
print("number of countries with missing values: {}".format(len(weird_countries)))
# Rows shorter than the longest row are padded with None on the right.
df = pd.DataFrame(countries)
# Name the columns; this raises if the parsed table does not have exactly 11 columns.
df.columns = ["country","total_cases","total_deaths","total_recovered","active_cases","seriuos_cases","tot_cases_per_1m","death_per_1m","total_tests","total_tests_per_1m","pop"]
print("number of countries: {}".format(len(df)))

# Convert every column except the name to numbers: strip thousands separators
# first; anything that still is not numeric becomes NaN.
for column in df:
    if column != "country":
        df[column] = pd.to_numeric(df[column].str.replace(",",""), errors="coerce")

# Keep the world-total row separately, then index the frame by country name.
world_df = df[df["country"]=="world"]
df = df.set_index("country")

# Clean up China (has no test data currently): the value stored under
# "total_tests" is moved to "pop" when "pop" is missing.
# df.loc[row, col] yields a scalar here, which has no .isnull() method, so the
# check must go through pd.isna(); also skip the fix if China is absent.
if "China" in df.index and pd.isna(df.loc["China","pop"]):
    df.loc["China","pop"] = df.loc["China","total_tests"]
    df.loc["China","total_tests"] = np.nan

# Persist today's snapshot; to_csv does not create the "data/" directory, so it
# must already exist.
df.to_csv("data/{}_worldometer_raw.csv".format(get_day_string()))
df.head()

number of countries with missing values: 53
number of countruies: 216


Unnamed: 0_level_0,total_cases,total_deaths,total_recovered,active_cases,seriuos_cases,tot_cases_per_1m,death_per_1m,total_tests,total_tests_per_1m,pop
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
world,5788782,357425,2497593.0,2933764.0,52975.0,743.0,45.9,,,
USA,1745803,102107,490130.0,1153566.0,17166.0,5277.0,309.0,15875473.0,47988.0,330822304.0
Brazil,414661,25697,166647.0,222317.0,8318.0,1952.0,121.0,871839.0,4104.0,212418030.0
Russia,370680,3968,142208.0,224504.0,2300.0,2540.0,27.0,9415992.0,64525.0,145928826.0
Spain,283849,27118,196958.0,59773.0,854.0,6071.0,580.0,3556567.0,76071.0,46753147.0


## Read data

In [96]:
# Reload the snapshot saved under today's date tag. No index column is
# specified, so pandas assigns a fresh default index on read.
todays_csv = "data/{}_worldometer_raw.csv".format(get_day_string())
df = pd.read_csv(todays_csv)
df.head()

Unnamed: 0,country,total_cases,total_deaths,total_recovered,active_cases,seriuos_cases,tot_cases_per_1m,death_per_1m,total_tests,total_tests_per_1m,pop
0,world,5788782,357425,2497593.0,2933764.0,52975.0,743.0,45.9,,,
1,USA,1745803,102107,490130.0,1153566.0,17166.0,5277.0,309.0,15875473.0,47988.0,330822304.0
2,Brazil,414661,25697,166647.0,222317.0,8318.0,1952.0,121.0,871839.0,4104.0,212418030.0
3,Russia,370680,3968,142208.0,224504.0,2300.0,2540.0,27.0,9415992.0,64525.0,145928826.0
4,Spain,283849,27118,196958.0,59773.0,854.0,6071.0,580.0,3556567.0,76071.0,46753147.0
