In [1]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np
import country_converter as coco
import itertools

### Build df from Url

In [11]:
url='https://www.worldometers.info/coronavirus/'

# Scraping the Url
page = requests.get(url)
doc = lh.fromstring(page.content)
todaydoc = doc.get_element_by_id("main_table_countries_today")

# Parse data
th_elements = todaydoc.xpath('//th') # header
td_elements = todaydoc.xpath('//td') # cells content

headers = [th_element.text_content() for th_element in th_elements]
headers = headers[:len(headers)//2]
headers = ['Country,Other',
  'TotalCases',
  'NewCases',
  'TotalDeaths',
  'NewDeaths',
  'TotalRecovered',
  'NewRecovered',
  'ActiveCases',
  'Serious,Critical',
  'Tot\xa0Cases/1M pop',
  'Deaths/1M pop',
  'TotalTests',
  'Tests/\n1M pop\n',
  'Population']

content = [td_element.text_content() for td_element in td_elements]
rows_content = np.array(content).reshape(int(len(content)/len(headers)),len(headers)).tolist()[:-1] #rm Today row
rows_content = list(k for k,_ in itertools.groupby(rows_content))

df = pd.DataFrame(rows_content)
df.columns = headers
df = df.drop_duplicates(subset= 'Country,Other', keep='first')

ValueError: cannot reshape array of size 13167 into shape (940,14)

In [12]:
content

['',
 '\nNorth America\n',
 '7,441,201',
 '+31,361',
 '274,154',
 '+1,491',
 '4,309,721',
 '+24,252',
 '2,857,326',
 '19,400',
 '',
 '',
 '',
 '',
 '',
 'North America',
 '\n',
 '',
 '',
 '',
 '\nSouth America\n',
 '6,400,791',
 '+12,178',
 '205,053',
 '+458',
 '5,030,288',
 '+10,939',
 '1,165,450',
 '15,224',
 '',
 '',
 '',
 '',
 '',
 'South America',
 '\n',
 '',
 '',
 '',
 '\nAsia\n',
 '7,295,935',
 '+107,446',
 '145,640',
 '+1,553',
 '5,854,866',
 '+86,756',
 '1,295,429',
 '18,841',
 '',
 '',
 '',
 '',
 '',
 'Asia',
 '\n',
 '',
 '',
 '',
 '\nEurope\n',
 '3,645,546',
 '+34,231',
 '208,555',
 '+381',
 '2,097,475',
 '+10,717',
 '1,339,516',
 '5,868',
 '',
 '',
 '',
 '',
 '',
 'Europe',
 '\n',
 '',
 '',
 '',
 '\nAfrica\n',
 '1,268,608',
 '+4,191',
 '30,177',
 '+73',
 '1,004,741',
 '+2,187',
 '233,690',
 '1,278',
 '',
 '',
 '',
 '',
 '',
 'Africa',
 '\n',
 '',
 '',
 '',
 '\nOceania\n',
 '28,765',
 '+109',
 '692',
 '+6',
 '23,779',
 '+195',
 '4,294',
 '35',
 '',
 '',
 '',
 '',
 '',
 'Aust

In [None]:
# Convert values to float
for i,col_name in enumerate(df.columns):
    print
    if i!=0:
        df[col_name] = pd.to_numeric(df[col_name].apply(lambda x:x.replace(",","")),errors='coerce')
        
# Convert country to ISO codes
countries_list = df["Country,Other"]\
.apply(lambda x: x.strip()).replace({'UK': 'Great Britain', 'UAE': 'United Arab Emirates'}).values.tolist()
df["iso_alpha"] = pd.Series(coco.convert(names=countries_list, to='ISO3', not_found=None))

# Rename comma seperated cols
df = df.rename(columns={'Country,Other': 'Country',
                  'Serious,Critical': 'Critical'})

df = df.fillna(0)

# Create text that will be display on hover
df["text"] = df['Country'].apply(lambda x: x.strip()) + '<br>' + \
    'Active Cases ' + df['ActiveCases'].astype(int).astype(str) + \
    '<br>' + 'Total Deaths ' + df['TotalDeaths'].astype(int).astype(str)

In [None]:
# Export Dataframe
df.to_csv("static/data/corona.csv",index=False,sep=",")

### Visualize df using Plotly (Optional)

In [15]:
# import plotly.express as px
# import datetime
# today_date = datetime.datetime.today().date().strftime("%d-%m-%Y")
# fig = px.choropleth(df, locations="iso_alpha",
#                     color="TotalCases",
#                     hover_name="Country",
#                     color_continuous_scale=px.colors.diverging.Portland,
#                    title='Daily Coronavirus Cases in the Word [{}]'.format(today_date)\
#                     +' Source: <a https://www.worldometers.info/coronavirus/">Worldometers</a>',
#                    height=600,
#                    range_color=[0,1000],
#                    labels={'TotalCases':'Min Number of cases'})
# fig.show()