In [1]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np
import country_converter as coco

### Build df from Url

In [2]:
url='https://www.worldometers.info/coronavirus/'

# Scraping the Url
page = requests.get(url)
doc = lh.fromstring(page.content)

# Parse data
th_elements = doc.xpath('//th') # header
td_elements = doc.xpath('//td') # cells content

headers = [th_element.text_content() for th_element in th_elements]
content = [td_element.text_content() for td_element in td_elements]
rows_content = np.array(content).reshape(int(len(content)/len(headers)),len(headers)).tolist()

df = pd.DataFrame(rows_content)
df.columns = headers
df = df[:-1] # drop Total row

In [4]:
df

Unnamed: 0,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,"Country,Other.1",TotalCases.1,NewCases.1,TotalDeaths.1,NewDeaths.1,TotalRecovered.1,ActiveCases.1,"Serious,Critical.1",Tot Cases/1M pop.1
0,China,80928,+34,3245,+8,70420,7263,2274,56,Italy,41035,+5322,3405,+427,4440,33190,2498,679
1,Iran,18407,+1046,1284,+149,5979,11144,,219,Spain,17395,+2626,803,+165,1107,15485,800,372
2,Germany,15309,+2982,44,+16,114,15151,2,183,USA,11413,+2154,171,+21,108,11134,64,34
3,France,10995,+1861,372,+108,602,10021,931,168,S. Korea,8565,+152,91,+7,1947,6527,59,167
4,Switzerland,3944,+829,41,+8,15,3888,,456,UK,3269,+643,144,+40,65,3060,20,48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,Djibouti,1,+1,,,,1,,1,Gambia,1,,,,,1,,0.4
172,Guinea,1,,,,,1,,0.08,Vatican City,1,,,,,1,,1248
173,Montserrat,1,+1,,,,1,,200,St. Vincent Grenadines,1,,,,,1,,9
174,Sint Maarten,1,+1,,,,1,,23,Somalia,1,,,,,1,,0.06


In [3]:
# Convert values to float
for i,col_name in enumerate(df.columns):
    if i!=0:
        df[col_name] = pd.to_numeric(df[col_name].apply(lambda x:x.replace(",","")),errors='coerce')
        
# Convert country to ISO codes
countries_list = df["Country,Other"]\
.apply(lambda x: x.strip()).replace({'UK': 'Great Britain', 'UAE': 'United Arab Emirates'}).values.tolist()
df["iso_alpha"] = pd.Series(coco.convert(names=countries_list, to='ISO3', not_found=None))

# Rename comma seperated cols
df = df.rename(columns={'Country,Other': 'Country',
                  'Serious,Critical': 'Critical'})

df = df.fillna(0)

# Create text that will be display on hover
df["text"] = df['Country'].apply(lambda x: x.strip()) + '<br>' + \
    'Active Cases ' + df['ActiveCases'].astype(int).astype(str) + \
    '<br>' + 'Total Deaths ' + df['TotalDeaths'].astype(int).astype(str)

TypeError: arg must be a list, tuple, 1-d array, or Series

In [9]:
# Export Dataframe
df.to_csv("static/data/corona.csv",index=False,sep=",")

### Visualize df using Plotly (Optional)

In [None]:
# import plotly.express as px
# import datetime
# today_date = datetime.datetime.today().date().strftime("%d-%m-%Y")
# fig = px.choropleth(df, locations="iso_alpha",
#                     color="TotalCases",
#                     hover_name="Country",
#                     color_continuous_scale=px.colors.diverging.Portland,
#                    title='Daily Coronavirus Cases in the Word [{}]'.format(today_date)\
#                     +' Source: <a https://www.worldometers.info/coronavirus/">Worldometers</a>',
#                    height=600,
#                    range_color=[0,1000],
#                    labels={'TotalCases':'Min Number of cases'})
# fig.show()