In [17]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [18]:
def get_value(cell):

    """
        Extract the text directly inside an element and convert it to a number when possible.

        Only the element's own text nodes are read (recursive=False), so text held
        in child tags is not included.

        Take a cell tag (or None when the expected tag is missing).
        Return the field value:
            int   for whole numbers, optionally negative      ("18 129" -> 18129)
            float for decimals with exactly one decimal point ("-4.5" -> -4.5)
            str   for everything else; strings made only of digits, "." and "/"
                  are returned with their inner spaces removed, and a missing
                  cell gives the empty string.
    """

    # A missing tag (e.g. a cell without the expected child) has no text to read.
    if cell is None:
        return ''

    text = ''.join(cell.find_all(text=True, recursive=False)).strip().replace(u'\xa0', u'')
    compact = text.replace(" ", "")

    # Test the digits without the sign so that negative figures are recognised too.
    unsigned = compact[1:] if compact.startswith("-") else compact

    # isdecimal() only accepts characters int()/float() can parse, whereas
    # isnumeric() also accepts e.g. superscripts and fractions that make int() raise.
    if unsigned.isdecimal():
        return int(compact)

    # Require exactly one decimal point: "1.2.3" must not reach float().
    if unsigned.count(".") == 1 and unsigned.replace(".", "").isdecimal():
        return float(compact)

    # Digit groups joined by "." or "/" (e.g. "2005/2010") stay text, minus the spaces.
    if compact.replace(".", "").replace("/", "").isnumeric():
        return compact

    return text

In [19]:
def get_url(country, mapping_path="data_csv.csv"):

    """
        Find the two-letter country code for a country name, based on ISO 3166-1,
        and build the URL of the country's UNdata profile page.

        Take a country name (matched case-insensitively, surrounding spaces ignored)
           & optionally the path of the CSV file holding the "Name" and "Code" columns.
        Return the url linked to the country's profile.
        Raise KeyError when the country name is not present in the mapping file.
    """

    # keep_default_na=False: with the default NA handling pandas reads the
    # code "NA" (Namibia) as a missing value instead of a string.
    file = pd.read_csv(mapping_path, keep_default_na=False)

    # Compare case-folded names rather than using str.title(), which would turn
    # "Bosnia and Herzegovina" into "Bosnia And Herzegovina" and miss the entry.
    mapping = {
        str(name).strip().casefold(): str(code).strip().lower()
        for name, code in zip(file["Name"], file["Code"])
    }

    key = country.strip().casefold()
    if key not in mapping:
        raise KeyError("Unknown country name: {!r}".format(country))

    url = "http://data.un.org/en/iso/" + mapping[key] + ".html"

    return url

In [20]:
def general_info(soup):

    """
        Extract General Information from the UNdata database.

        Reads the second <table> of the page: for every row, the first <td> gives
        the label and the last <td> gives the value. Rows without any <td>
        (e.g. header-only rows) are skipped.

        Take a bs4.BeautifulSoup object generated from querying the country's profile page.
        Return a dataframe containing the country's General Information, with the
           columns 'Index' and 2019 (the value column label is fixed to 2019).
    """

    tables = soup.find_all("table")

    index = []
    value = []

    for row in tables[1].find_all("tr"):
        # Parse the row once and reuse the cells for both columns.
        cells = row.find_all("td")

        # A row with no <td> has neither label nor value; indexing it would raise.
        if not cells:
            continue

        index.append(get_value(cells[0]))
        value.append(get_value(cells[-1]))

    df = pd.DataFrame({'Index': index, 2019: value})

    return df

In [21]:
def indicator(soup, indicator):

    """
        Extract Indicators information from the UNdata database.

        The indicator name selects which <table> of the page is read. The first row
        of that table supplies the year column labels; in every following row the
        first <td> holds the indicator label (its own text) and the unit (inside
        <small>), and the remaining <td> hold the yearly values (inside <small>).
        A cell that has no <small> child gives an empty string.

        Take a bs4.BeautifulSoup object generated from querying the country's profile page
           & the name of the indicator: 'Economic indicators', 'Social indicators'
           or 'Environment and infrastructure indicators'.
        Return a dataframe containing the country's indicators data (3 years),
           with the columns 'Index', 'Unit' and one column per year.
        Raise ValueError when the indicator name is not one of the supported names.
    """

    # Position of each indicator table among the <table> elements of the page.
    table_position = {
        'Economic indicators': 2,
        'Social indicators': 3,
        'Environment and infrastructure indicators': 4,
    }

    # Fail with an explicit message instead of an unbound local variable further down.
    if indicator not in table_position:
        raise ValueError(
            "Unknown indicator {!r}; expected one of {}".format(indicator, sorted(table_position))
        )

    def small_value(cell):
        # Values and units live in a <small> child; tolerate cells that lack one.
        small = cell.small
        return get_value(small) if small is not None else ''

    tables = soup.find_all("table")
    rows = tables[table_position[indicator]].find_all("tr")

    header_cells = rows[0].find_all("td")
    col_num = len(header_cells)

    # The first header cell sits above the label column, so it is not a year.
    year = [get_value(cell) for cell in header_cells[1:]]

    col_names = ['Index', 'Unit'] + year

    # Parse each data row once instead of once per extracted column.
    body = [r.find_all("td") for r in rows[1:]]

    index = [get_value(cells[0]) for cells in body]

    unit = [small_value(cells[0]) for cells in body]

    # Negative offsets pick the value cells counted from the end of each row.
    value = [[small_value(cells[i]) for cells in body] for i in range(-col_num + 1, 0)]

    data = dict(zip(col_names, [index, unit] + value))

    df = pd.DataFrame(data)

    return df

In [22]:
def get_profile(url, timeout=30):

    """
        Download a country's profile page and parse it into dataframes.

        The text of every <summary> element is collected as a section name; the
        first one is skipped and each remaining name is passed to indicator().

        Take a url that is a country's profile page
           & optionally the number of seconds to wait for the server.
        Return a list of two dataframes,
               which contains the country's General Information and
               Indicators data (3 years) respectively.
               The indicators dataframe is keyed by section name.
        Raise requests.HTTPError when the server answers with an error status.
    """

    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)

    # Stop here on 4xx/5xx instead of trying to parse an error page as a profile.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')

    section = [i.get_text() for i in soup.find_all("summary")]

    GI_df = general_info(soup)

    # section[0] is handled by general_info(); the rest are indicator tables.
    Indicators_df = pd.concat([indicator(soup, i) for i in section[1:]], keys=section[1:])

    return [GI_df, Indicators_df]

In [23]:
# Example run: resolve the profile URL for one country, then download and parse it.
country = "Anguilla"
url = get_url(country)
# profile is the two-element list returned by get_profile():
# [general information dataframe, indicators dataframe].
profile = get_profile(url)

In [24]:
# First element returned by get_profile(): the General Information dataframe.
profile[0]

Unnamed: 0,Index,2019
0,Region,Caribbean
1,"Population(000, 2019)",15
2,"Pop. density(per km2, 2019)",165.2
3,Capital city,The Valley
4,"Capital city pop.(000, 2019)",1.4
5,Surface area(km2),91
6,Sex ratio(m per 100 f),97.6
7,National currency,E. Caribbean Dollar (XCD)
8,Exchange rate(per US$),2.7


In [25]:
# Second element returned by get_profile(): the indicators dataframe, keyed by section name.
profile[1]

Unnamed: 0,Unnamed: 1,Index,Unit,2005,2010,2019
Economic indicators,0,GDP: Gross domestic product,(million current US$),229,268,281
Economic indicators,1,GDP growth rate,"(annual %, const. 2010 prices)",13.1,-4.5,-7.7
Economic indicators,2,GDP per capita,(current US$),18129,19459,18860.7
Economic indicators,3,Economy: Agriculture,(% of Gross Value Added),2.7,2,2.4
Economic indicators,4,Economy: Industry,(% of Gross Value Added),19.3,15.8,11
Economic indicators,5,Economy: Services and other activity,(% of GVA),78,82.2,86.6
Economic indicators,6,Employment: Agriculture,(% of employed),2.9,...,...
Economic indicators,7,Employment: Industry,(% of employed),18.9,...,...
Economic indicators,8,Employment: Services,(% employed),76.7,...,...
Economic indicators,9,Unemployment,(% of labour force),7.8,...,...
