## Lab | Web Scraping Multiple Pages

Antonio Montilla

In [1]:
#importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

### (1) Display the top 10 languages by number of native speakers stored in a pandas dataframe

https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers

In [2]:
#copying url
url = "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers"

In [3]:
#downloading html
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [4]:
#parseing html with BeatifulSoup
soup = BeautifulSoup(response.content, "html.parser")

In [7]:
#selecting the first table
speakers = soup.select("table")[0]

In [9]:
rows = speakers.find_all('tr')[1:11]
languages = []
native_speakers = []

for row in rows:
    columns = row.find_all('td')    
    language = columns[0].text.strip()
    native_speaker = columns[1].text.strip()    
    languages.append(language)
    native_speakers.append(native_speaker)

top10_languages = pd.DataFrame({"language":languages, "native_speakers":native_speakers})

In [10]:
top10_languages

Unnamed: 0,language,native_speakers
0,Mandarin Chinese,939.0
1,Spanish,485.0
2,English,380.0
3,Hindi,345.0
4,Portuguese,236.0
5,Bengali,234.0
6,Russian,147.0
7,Japanese,123.0
8,Yue Chinese,86.1
9,Vietnamese,85.0


### (2) Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC

https://www.emsc-csem.org/Earthquake/

In [11]:
url = 'https://www.emsc-csem.org/Earthquake/'

In [12]:
#downloading html
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [13]:
#parseing html with BeatifulSoup
soup = BeautifulSoup(response.content, "html.parser")

In [34]:
#body > div.content > div.htab > table > tbody
earthquake_table = soup.find('table', {'class': 'eqs'})
date_times = []
latitudes = []
longitudes = []
regions = []
for row in earthquake_table.find('tbody').find_all('tr'):#[:20]:  # to select only latest 20
    columns = row.find_all('td')    
    date_time = columns[3].text.strip()
    latitude = float(columns[4].text.strip())
    longitude = float(columns[5].text.strip())
    region = columns[8].text.strip()
    date_times.append(date_time)
    regions.append(region)
    latitudes.append(latitude)
    longitudes.append(longitude)

latest20_earthquakes = pd.DataFrame({"date_time":date_times, "latitude":latitudes, "longitude":longitudes, "region":regions})
#did not really delivers the data with this function

### (3) A list with the different kind of datasets available in url = 'https://data.gov.uk/'

In [36]:
url = 'https://data.gov.uk/'

In [37]:
#downloading html
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [38]:
#parseing html with BeatifulSoup
soup = BeautifulSoup(response.content, "html.parser")

In [85]:
websites = soup.select("#main-content > div:nth-child(3) > div > ul > li")
#websites

In [61]:
names = []
descriptions = []
links = []
for li in websites:
    name = li.find('h3', {'class': 'govuk-heading-s dgu-topics__heading'}).text.strip()
    description = li.find('p', {'class': 'govuk-body'}).text.strip()
    link = "https://www.data.gov.uk/"+li.find('a', {'class': 'govuk-link'})['href']
    names.append(name)
    descriptions.append(description)
    links.append(link)
gov_websites = pd.DataFrame({"website":names, "description":descriptions, "url":links})

In [62]:
gov_websites

Unnamed: 0,website,description,url
0,Business and economy,"Small businesses, industry, imports, exports a...",https://www.data.gov.uk//search?filters%5Btopi...
1,Crime and justice,"Courts, police, prison, offenders, borders and...",https://www.data.gov.uk//search?filters%5Btopi...
2,Defence,"Armed forces, health and safety, search and re...",https://www.data.gov.uk//search?filters%5Btopi...
3,Education,"Students, training, qualifications and the Nat...",https://www.data.gov.uk//search?filters%5Btopi...
4,Environment,"Weather, flooding, rivers, air quality, geolog...",https://www.data.gov.uk//search?filters%5Btopi...
5,Government,"Staff numbers and pay, local councillors and d...",https://www.data.gov.uk//search?filters%5Btopi...
6,Government spending,Includes all payments by government department...,https://www.data.gov.uk//search?filters%5Btopi...
7,Health,"Includes smoking, drugs, alcohol, medicine per...",https://www.data.gov.uk//search?filters%5Btopi...
8,Mapping,"Addresses, boundaries, land ownership, aerial ...",https://www.data.gov.uk//search?filters%5Btopi...
9,Society,"Employment, benefits, household finances, pove...",https://www.data.gov.uk//search?filters%5Btopi...


### (4) List all language names and number of related articles in the order they appear in wikipedia.org: 

url = 'https://www.wikipedia.org/'

In [65]:
url = 'https://www.wikipedia.org/'

In [66]:
#downloading html
response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [67]:
#parseing html with BeatifulSoup
soup = BeautifulSoup(response.content, "html.parser")


In [80]:
#wiki = soup.select("div.central-featured")
##www-wikipedia-org > div.central-featured
wiki = soup.find_all('div', class_='central-featured-lang')

In [83]:
languages = []
articles = []

for div in wiki:
    language = div.find('strong').text.strip()
    articles_num = div.find('bdi').text.strip()
    languages.append(language)
    articles.append(articles_num)

wiki_articles = pd.DataFrame({"language":languages, "number_articles":articles})

In [84]:
wiki_articles

Unnamed: 0,language,number_articles
0,English,6 744 000+
1,Español,1 906 000+
2,Русский,1 947 000+
3,日本語,1 392 000+
4,Deutsch,2 852 000+
5,Français,2 567 000+
6,Italiano,1 835 000+
7,中文,1 387 000+
8,العربية,العربية
9,Português,1 113 000+
