In [5]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np
import country_converter as coco

### Build df from Url

In [20]:
url='https://www.worldometers.info/coronavirus/'

# Scraping the Url
page = requests.get(url)
doc = lh.fromstring(page.content)

# Parse data
th_elements = doc.xpath('//th') # header
td_elements = doc.xpath('//td') # cells content

headers = [th_element.text_content() for th_element in th_elements][:9]
content = [td_element.text_content() for td_element in td_elements]
rows_content = np.array(content).reshape(int(len(content)/len(headers)),len(headers)).tolist()

df = pd.DataFrame(rows_content)
df.columns = headers
df = df[:-1] # drop Total row

In [18]:
len(th_elements)

18

In [21]:
# Convert values to float
for i,col_name in enumerate(df.columns):
    print
    if i!=0:
        df[col_name] = pd.to_numeric(df[col_name].apply(lambda x:x.replace(",","")),errors='coerce')
        
# Convert country to ISO codes
countries_list = df["Country,Other"]\
.apply(lambda x: x.strip()).replace({'UK': 'Great Britain', 'UAE': 'United Arab Emirates'}).values.tolist()
df["iso_alpha"] = pd.Series(coco.convert(names=countries_list, to='ISO3', not_found=None))

# Rename comma seperated cols
df = df.rename(columns={'Country,Other': 'Country',
                  'Serious,Critical': 'Critical'})

df = df.fillna(0)

# Create text that will be display on hover
df["text"] = df['Country'].apply(lambda x: x.strip()) + '<br>' + \
    'Active Cases ' + df['ActiveCases'].astype(int).astype(str) + \
    '<br>' + 'Total Deaths ' + df['TotalDeaths'].astype(int).astype(str)



In [22]:
df[df["Country"]] = ""

Unnamed: 0,Country,TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,ActiveCases,Critical,Tot Cases/1M pop,iso_alpha,text
0,China,80928,34.0,3245.0,8.0,70420.0,7263,2274.0,56.00,CHN,China<br>Active Cases 7263<br>Total Deaths 3245
1,Italy,41035,5322.0,3405.0,427.0,4440.0,33190,2498.0,679.00,ITA,Italy<br>Active Cases 33190<br>Total Deaths 3405
2,Iran,18407,1046.0,1284.0,149.0,5979.0,11144,0.0,219.00,IRN,Iran<br>Active Cases 11144<br>Total Deaths 1284
3,Spain,17395,2626.0,803.0,165.0,1107.0,15485,800.0,372.00,ESP,Spain<br>Active Cases 15485<br>Total Deaths 803
4,Germany,15309,2982.0,44.0,16.0,114.0,15151,2.0,183.00,DEU,Germany<br>Active Cases 15151<br>Total Deaths 44
...,...,...,...,...,...,...,...,...,...,...,...
348,Sint Maarten,1,1.0,0.0,0.0,0.0,1,0.0,23.00,SXM,Sint Maarten<br>Active Cases 1<br>Total Deaths 0
349,Somalia,1,0.0,0.0,0.0,0.0,1,0.0,0.06,SOM,Somalia<br>Active Cases 1<br>Total Deaths 0
350,Suriname,1,0.0,0.0,0.0,0.0,1,0.0,2.00,SUR,Suriname<br>Active Cases 1<br>Total Deaths 0
351,Eswatini,1,0.0,0.0,0.0,0.0,1,0.0,0.90,SWZ,Eswatini<br>Active Cases 1<br>Total Deaths 0


In [9]:
# Export Dataframe
df.to_csv("static/data/corona.csv",index=False,sep=",")

### Visualize df using Plotly (Optional)

In [None]:
# import plotly.express as px
# import datetime
# today_date = datetime.datetime.today().date().strftime("%d-%m-%Y")
# fig = px.choropleth(df, locations="iso_alpha",
#                     color="TotalCases",
#                     hover_name="Country",
#                     color_continuous_scale=px.colors.diverging.Portland,
#                    title='Daily Coronavirus Cases in the Word [{}]'.format(today_date)\
#                     +' Source: <a https://www.worldometers.info/coronavirus/">Worldometers</a>',
#                    height=600,
#                    range_color=[0,1000],
#                    labels={'TotalCases':'Min Number of cases'})
# fig.show()