In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [2]:
def get_value(cell):
    
    """ 
        Extract the text held directly by an element and convert it into a number when possible. 
        Take a cell tag (only its own text is read: find_all is called with recursive=False).
        Return the field value:
            - an int   when the text is made of digits only, spaces ignored   ("1 433 784" -> 1433784)
            - a float  when the text is made of digits and dots               ("152.7" -> 152.7)
            - a string without spaces when it is made of digits, dots and slashes
                                                                              ("50.2 / 49.8" -> "50.2/49.8")
            - the stripped text, non-breaking spaces removed, otherwise.
        A single leading "-" is accepted for ints and floats ("-2.5" -> -2.5).
        Text that looks numeric but cannot be parsed (e.g. "1.2.3") is returned as a string
        instead of raising ValueError.
    """
    
    text = ''.join(cell.find_all(text=True, recursive=False)).strip().replace(u'\xa0', u'')
    compact = text.replace(" ", "")
    
    # Drop one leading minus sign for the digit checks only, so that negative
    # figures are parsed as numbers rather than returned as strings.
    unsigned = compact[1:] if compact.startswith("-") else compact
    
    if unsigned.isnumeric():
        try:
            return int(compact)
        except ValueError:
            # str.isnumeric() accepts characters int() rejects (e.g. superscript digits).
            pass
    
    if unsigned.replace(".", "").isnumeric():
        try:
            return float(compact)
        except ValueError:
            # More than one dot (e.g. "1.2.3"): not a float, fall through to the text cases.
            pass
    
    if compact.replace(".", "").replace("/", "").isnumeric():
        return compact
    
    return text

In [3]:
def get_url(country, csv_path="data_csv.csv"):
    
    """ 
        Find the two-letter country code using a country name, based on ISO 3166-1.
        Take a country name (matched case-insensitively, surrounding whitespace ignored)
           & optionally the path of the CSV file providing the "Name" and "Code" columns.
        Return the url linked to the country's profile.
        Raise KeyError when the name is not listed in the CSV file.
    """
    
    # keep_default_na=False: by default pandas reads the text "NA" as a missing
    # value, so a code such as "NA" (Namibia in ISO 3166-1) would become NaN and
    # .lower() would fail on a float. dtype=str keeps every code a string.
    table = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    
    # Key on the case-folded name: str.title() capitalises every word, which can
    # never match a listed name containing lower-case words
    # (e.g. "Bosnia and Herzegovina").
    mapping = {
        name.strip().casefold(): code
        for name, code in zip(table["Name"], table["Code"])
    }
    
    key = country.strip().casefold()
    if key not in mapping:
        raise KeyError(f"Unknown country name: {country!r}")
    
    iso_code = mapping[key].lower()
    url = "http://data.un.org/en/iso/" + iso_code + ".html"

    return url

In [4]:
def general_info(soup):
    
    """ 
        Build the General Information table of a UNdata country profile.
        Take the bs4.BeautifulSoup object parsed from the country's profile page.
        Return a dataframe with one row per field: the field label in the 'Index'
               column and its value in the 2019 column.
    """
    
    # The second <table> of the page is the one read here.
    info_table = soup.find_all("table")[1]
    
    # Collect the <td> cells of every row once, then read label and value from them.
    cells_per_row = [row.find_all("td") for row in info_table.find_all("tr")]
    
    # First cell of a row holds the label, last cell holds the value.
    labels = [get_value(cells[0]) for cells in cells_per_row]
    values = [get_value(cells[-1]) for cells in cells_per_row]
    
    return pd.DataFrame({'Index': labels, 2019: values})

In [5]:
def indicator(soup, indicator):
    
    """ 
        Extract Indicators information from the UNdata database.
        Take a bs4.BeautifulSoup object generated from querying the country's profile page
           & the name of the indicator section: 'Economic indicators', 'Social indicators'
             or 'Environment and infrastructure indicators' (surrounding whitespace ignored).
        Return a dataframe with the columns 'Index', 'Unit' and one column per year
               found in the header row of the section's table.
        Raise ValueError when the section name is not one of the three above.
    """
    
    # Position of each section's <table> within the page.
    table_positions = {
        'Economic indicators': 2,
        'Social indicators': 3,
        'Environment and infrastructure indicators': 4,
    }
    
    section = indicator.strip()
    if section not in table_positions:
        # An unknown name used to fall through the if/elif chain and only fail
        # later with an UnboundLocalError on `table`.
        raise ValueError(
            f"Unknown indicator section {indicator!r}; expected one of: "
            + ", ".join(repr(name) for name in table_positions)
        )
    
    table = soup.find_all("table")[table_positions[section]]
    
    rows = table.find_all("tr")
    header_cells = rows[0].find_all("td")
    col_num = len(header_cells)
    
    # Header row: the first cell is dropped, the remaining ones are the year labels.
    year = [get_value(cell) for cell in header_cells][1:]
    
    col_names = ['Index', 'Unit'] + year
    
    # Look up the cells of each data row once instead of once per column.
    body_cells = [r.find_all("td") for r in rows[1:]]
    
    # The first cell carries the indicator name as its own text and the unit in a <small> tag.
    index = [get_value(cells[0]) for cells in body_cells]
    unit = [get_value(cells[0].small) for cells in body_cells]
    
    # The figures are read from the <small> tag of the last (col_num - 1) cells of
    # each row; negative indices count from the end of the row.
    value = [[get_value(cells[i].small) for cells in body_cells] for i in range(1 - col_num, 0)]
    
    data = dict(zip(col_names, [index, unit] + value))
    
    df = pd.DataFrame(data)
    
    return df

In [6]:
def get_profile(url, timeout=30):
    
    """ 
        Take a url that is a country's profile page
           & optionally the number of seconds to wait for the server (default 30).
        Return a list of two dataframes, 
               which contains the country's General Information and 
               Indicators data (3 years) respectively. 
        Raise requests.exceptions.HTTPError when the server answers with an error status,
              requests.exceptions.Timeout when it does not answer in time.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    
    # Stop on 4xx/5xx here rather than parsing an error page and failing later
    # on a missing table.
    response.raise_for_status()
    
    soup = BeautifulSoup(response.content,'lxml')

    # Titles of the page's <summary> tags. The first one is skipped below
    # (presumably the General Information section - confirm against the page);
    # the others are passed to indicator() as section names.
    section = [i.get_text() for i in soup.find_all("summary")]

    GI_df = general_info(soup)

    # One dataframe per indicator section, stacked under the section title as outer index level.
    Indicators_df = pd.concat([indicator(soup, i) for i in section[1:]], keys=section[1:])

    return [GI_df, Indicators_df]

In [7]:
# Country to look up; get_url resolves the name to its UNdata profile url.
country = "CHINA"
url = get_url(country)
# profile is the list returned by get_profile:
# [General Information dataframe, Indicators dataframe].
profile = get_profile(url)

In [8]:
# General Information dataframe (first element of the list returned by get_profile).
profile[0]

Unnamed: 0,Index,2019
0,Region,Eastern Asia
1,"Population(000, 2019)",1433784
2,"Pop. density(per km2, 2019)",152.7
3,Capital city,Beijing
4,"Capital city pop.(000, 2019)",20035.5
5,UN membership date,24-Oct-45
6,Surface area(km2),9600000
7,Sex ratio(m per 100 f),105.4
8,National currency,Yuan Renminbi (CNY)
9,Exchange rate(per US$),6.9


In [9]:
# Indicators dataframe (second element of the list returned by get_profile),
# indexed by section title.
profile[1]

Unnamed: 0,Unnamed: 1,Index,Unit,2005,2010,2019
Economic indicators,0,GDP: Gross domestic product,(million current US$),2285960,6100648,12237782
Economic indicators,1,GDP growth rate,"(annual %, const. 2010 prices)",11.4,10.6,6.9
Economic indicators,2,GDP per capita,(current US$),1729.7,4486.6,8682.2
Economic indicators,3,Economy: Agriculture,(% of Gross Value Added),12,9.8,8.4
Economic indicators,4,Economy: Industry,(% of Gross Value Added),47.2,46.6,41.3
Economic indicators,5,Economy: Services and other activity,(% of GVA),40.9,43.6,50.4
Economic indicators,6,Employment: Agriculture,(% of employed),35.8,26.2,15.5
Economic indicators,7,Employment: Industry,(% of employed),29.6,30.2,26
Economic indicators,8,Employment: Services,(% employed),34.6,43.6,58.5
Economic indicators,9,Unemployment,(% of labour force),4.1,4.2,4.8
