In [1]:
#Scrape Wikipedia for the start and end dates of COVID-19 lockdowns
import urllib.request
from bs4 import BeautifulSoup

In [97]:
#URL for the wikipedia page
url = "https://en.wikipedia.org/wiki/COVID-19_pandemic_lockdowns"
#Open the webpage inside a context manager so the HTTP response is closed
#as soon as the HTML has been parsed, instead of being left open
with urllib.request.urlopen(url) as page:
    #Read and extract the HTML from the page using the lxml parser
    soup = BeautifulSoup(page, "lxml")

In [100]:
#Show the text of the page's <title> tag as a quick check that the expected page was parsed
soup.find("title").string

'COVID-19 pandemic lockdowns - Wikipedia'

In [279]:
#Find the correct table with all the details needed.
#Passing the full string to class_ matches the exact value of the class attribute.
right_table = soup.find("table", class_="wikitable sortable mw-collapsible")
#find() returns None when no table matches; stop here with a clear message
#rather than letting a later cell fail with an AttributeError on None
if right_table is None:
    raise ValueError(
        "No <table> with class 'wikitable sortable mw-collapsible' was found on the page"
    )

In [297]:
#Go through the <tr> rows of the table and collect the country name, start date and end date of lockdowns
countries = []
startDate = []
endDate = []


def _first_text(tag):
    """Return the first text node inside ``tag`` as a plain ``str`` ("" if there is none)."""
    node = tag.find(string=True)
    #str() turns the NavigableString into an ordinary string so the stored value
    #does not keep a reference to the whole parse tree; a missing node becomes ""
    #so that later .str operations on the column never meet None
    return "" if node is None else str(node)


#Extract information required from the tags
for row in right_table.find_all('tr'):
    #Information stored in <td> tags
    cells = row.find_all('td')
    #Only rows with exactly 3 <td> tags are read; every other row is skipped
    if len(cells) != 3:
        continue
    #The country name is the first child of the first link in the first cell
    link = cells[0].find('a')
    if link is not None and link.contents:
        countries.append(str(link.contents[0]))
    else:
        #No usable link in the cell: fall back to the cell's first text node
        #instead of raising IndexError, so the three lists stay aligned
        countries.append(_first_text(cells[0]))
    startDate.append(_first_text(cells[1]))
    endDate.append(_first_text(cells[2]))

In [303]:
import pandas as pd
#Assemble the three scraped lists into one dataframe: country name, start date, end date
df = pd.DataFrame({
    'Country': countries,
    'Start Date': startDate,
    'End Date': endDate,
})
#Show at most 50 rows before pandas truncates the displayed frame
pd.set_option('display.max_rows', 50)
df

Unnamed: 0,Country,Start Date,End Date
0,Armenia,2020-03-24,2020-05-04
1,Australia,2020-03-23,
2,Austria,2020-03-16,2020-04-13
3,Azerbaijan,2020-03-31,2020-04-20
4,Bangladesh,2020-03-26,2020-05-16
5,Barbados,2020-03-28,2020-05-03
6,Belgium,2020-03-18,2020-05-04
7,Bermuda,2020-04-04,2020-05-02
8,Bolivia,2020-03-22,2020-07-31
9,Botswana,2020-04-02,2020-04-30


In [245]:
#Check whether any cell of the dataframe is null (None/NaN) and needs cleaning.
#Empty strings are not counted as null by pandas, so blank dates do not show up here.
df.isna().values.any()

False

In [299]:
#Blank out the anomalous 'End Date' values that contain the strings "Area" and "City".
#This has only occurred in the Saudi Arabia data.
#NOTE(review): the row labels 45 and 46 are hard-coded - presumably they match the
#table as it was when scraped; confirm they still point at the Saudi Arabia rows.
for row_label in (45, 46):
    df.at[row_label, 'End Date'] = ""
df

Unnamed: 0,Country,Start Date,End Date
0,Armenia,2020-03-24,2020-05-04
1,Australia,2020-03-23,
2,Austria,2020-03-16,2020-04-13
3,Azerbaijan,2020-03-31,2020-04-20
4,Bangladesh,2020-03-26,2020-05-16
5,Barbados,2020-03-28,2020-05-03
6,Belgium,2020-03-18,2020-05-04
7,Bermuda,2020-04-04,2020-05-02
8,Bolivia,2020-03-22,2020-07-31
9,Botswana,2020-04-02,2020-04-30


In [300]:
#Report how many rows have a start date containing '2020-03', out of all rows in the dataframe
start_dates = df['Start Date']
march_rows = df.loc[start_dates.str.contains('2020-03')]
print(f"Number of countries in lockdown beginning March: {len(march_rows)}/{len(start_dates)}")

Number of countries in lockdown beginning March: 57/63


In [313]:
#Report the running total of rows whose end date falls in April, May, June and July 2020
end_dates = df['End Date']
total = len(end_dates)
#Rows per month, found by matching the year-month prefix inside the end date string
easeApril, easeMay, easeJune, easeJuly = (
    len(df.loc[end_dates.str.contains(prefix)])
    for prefix in ('2020-04', '2020-05', '2020-06', '2020-07')
)
#Each printed figure is cumulative: it adds the month's count to all earlier months
running = 0
for label, count in zip(("April", "May", "June", "July"), (easeApril, easeMay, easeJune, easeJuly)):
    running += count
    print(f"Number of countries out of lockdown by {label}: {running}/{total}")

Number of countries out of lockdown by April: 23/63
Number of countries out of lockdown by May: 40/63
Number of countries out of lockdown by June: 48/63
Number of countries out of lockdown by July: 49/63
