# Data Extraction from World Bank API 

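Looks up the IDs of a list of World Bank indicators by scraping the World Bank data website, then downloads the data for those indicators through the World Bank API.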

Needed input:
* `indicators`: List of Indicators
* `countries`: List of countries
* `years`: List of years 

In [1]:
!pip install -q world_bank_data
 
# Install Chromium's chromedriver and Selenium.
# Refresh the apt package index first.
!apt update

# NOTE (Priyam Mehta): handle the next command with care. It requires the Chromium
# browser (Chromium, not Google Chrome). I tried it on a system without Chromium and
# was left with a broken package. I might be wrong -- Rosana, can you update this? Thanks.
!apt install -q chromium-chromedriver


!pip install -q selenium


Reading package lists... Done
W: chmod 0700 of directory /var/lib/apt/lists/partial failed - SetupAPTPartialDirectory (1: Operation not permitted)
E: Could not open lock file /var/lib/apt/lists/lock - open (13: Permission denied)
E: Unable to lock directory /var/lib/apt/lists/
W: Problem unlinking the file /var/cache/apt/pkgcache.bin - RemoveCaches (13: Permission denied)
W: Problem unlinking the file /var/cache/apt/srcpkgcache.bin - RemoveCaches (13: Permission denied)
E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


In [2]:
# Copy the chromedriver binary into /usr/bin (presumably so it can be found on PATH -- confirm).
!cp /usr/lib/chromium-browser/chromedriver /usr/bin


cp: '/usr/lib/chromium-browser/chromedriver' and '/usr/bin/chromedriver' are the same file


In [3]:

import sys
# NOTE(review): sys.path is Python's module search path, so adding the path of the
# chromedriver binary here presumably does not help Selenium locate the driver --
# confirm whether this cell is still needed.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

In [4]:

import world_bank_data as wb
import pandas as pd
import re
import time
#from google.colab import files

## Auxiliary functions

* `wb_interface_search(indicator)`: Returns indicator's ID  
* `indicators_dict(indicators)`: Returns dictionary of indicators and respective ID
* `build_wb_df(ind_dict)`: Returns dataframe with all selected indicators 
* `filter_wb(df, years, countries)`: Returns filtered dataframe for specific `years` and `countries`


In [5]:
#Search 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import time


def wb_interface_search(indicator):
  '''Returns indicator's ID, or '' when no ID could be extracted.

  `indicator`: world bank indicator as string
  Example: 'Access to electricity, rural (% of rural population)'

  Opens a headless Chrome session, types the indicator title into the search
  box of the World Bank indicator listing and parses the ID out of the link of
  the first result. The browser session is always closed before returning,
  including when an exception is raised.
  '''
  # Imported here so the function is self-contained. find_element(By.XPATH, ...)
  # replaces the find_element_by_xpath helpers that Selenium 4 removed.
  from selenium.webdriver.common.by import By

  t_start = time.time()
  options = webdriver.ChromeOptions()
  options.add_argument('--headless')
  options.add_argument('--no-sandbox')
  options.add_argument('--disable-dev-shm-usage')
  options.add_argument("--disable-notifications")

  weblink = 'https://data.worldbank.org/indicator?tab=all'
  # No explicit driver path: Selenium looks up 'chromedriver' itself, and newer
  # Selenium 4 releases no longer accept the path as a positional argument.
  wd = webdriver.Chrome(options=options)

  try:
    t_end = time.time()
    print('Initialized web driver in {} s'.format(t_end - t_start))

    t_start = time.time()
    wd.get(weblink)
    t_end = time.time()
    print('Downloaded website in {} s'.format(t_end - t_start))

    t_start = time.time()
    search_key = wd.find_element(By.XPATH, "//input[@id='overviewSearch']")
    time.sleep(5)    #Need to let the page load completely, otherwise the search result wont be narrowed down
    search_key.send_keys(indicator + Keys.RETURN + Keys.ENTER)
    t_end = time.time()
    print('Searched key in {} s'.format(t_end - t_start))

    t_start = time.time()
    all_results = wd.find_elements(By.XPATH, '//section[@class="nav-item"]/*/li/a')
    if not all_results:
      return ''   # nothing matched the search text

    try:
      # Assume first result is the most relevant one, since we copied exact title
      link = all_results[0].get_attribute("href")
    except Exception as exc:
      # Best effort, as before: a failing element lookup yields '' instead of
      # aborting the caller's loop, but Ctrl-C is no longer swallowed.
      print('Could not read search result for {!r}: {}'.format(indicator, exc))
      return ''

    # Greedy match from the first capital letter up to the last '?', then drop
    # the '?' itself, leaving the ID portion of the link.
    match = re.search(r'([A-Z].*)\?', link or '')
    if match is None:
      return ''
    code = match[0].replace('?', '')
    t_end = time.time()
    print('Found result in {} s'.format(t_end - t_start))

    return code
  finally:
    # Always shut the browser down; otherwise every call leaves a Chrome
    # process running.
    wd.quit()

def indicators_dict(indicators):
  '''Maps every indicator title to the ID found for it.
  `indicators`: list of world bank indicator as string
  Example: ['Access to electricity, rural (% of rural population)', 'Arable land (% of land area)']

  Each title is looked up with `wb_interface_search`, in the order given; the
  total elapsed time is printed before the dictionary is returned.
  '''
  started = time.time()

  # Pair each title with the ID returned for it, one lookup at a time.
  ind_dict = dict(zip(indicators, map(wb_interface_search, indicators)))

  elapsed = time.time() - started
  print('Did everything in {} s'.format(elapsed))

  return ind_dict



In [6]:
# Indicator titles to look up. Each title is used both as the search text and
# as the key of `ind_dict`, so it must not carry stray whitespace.
indicators = ['GDP (current US$)', 'GDP growth (annual %)', 'Population growth (annual %)',
             'Population in largest city','Population, total', 'Population density (people per sq. km of land area)',
              'Rural population (% of total population)', 'Rural population',
              'Urban population growth (annual %)']

ind_dict= indicators_dict(indicators)

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 90
Current browser version is 87.0.4280.141 with binary path /usr/bin/google-chrome
Stacktrace:
#0 0x56060c66c9b9 <unknown>


In [7]:
# Extract data

def build_wb_df(ind_dict):
  '''
   Returns dataframe with all selected indicators
   `ind_dict`: dictionary of indicators and respective IDs
   Example: {'Access to electricity, rural (% of rural population)': 'EG.ELC.ACCS.RU.ZS', 'Arable land (% of land area)': 'AG.LND.ARBL.ZS'}

   The result has the columns 'Country' and 'Year' followed by one column per
   indicator, named after the indicator title (the dictionary key).
   Indicators whose ID is empty are skipped with a printed notice. An empty
   dataframe is returned when no indicator could be downloaded.
  '''

  merged = None

  for name, code in ind_dict.items():
    if not code:
      # An empty ID means the lookup for this title failed; there is nothing
      # to download for it.
      print('Skipping indicator {!r}: no World Bank ID available'.format(name))
      continue

    #'date' parameter seems to be able to select one year only. So currently extracting all data available
    time.sleep(1)   # short pause before each request

    df = wb.get_series(code).to_frame().reset_index()
    df = df.drop(columns=['Series'], errors='ignore')

    # reset_index() puts the former index levels first, so the values are in
    # the last column. Naming it here, per indicator, keeps every column
    # correctly labelled even when two titles resolve to the same ID.
    df = df.rename(columns={df.columns[-1]: name})

    if merged is None:
      merged = df
    else:
      merged = pd.merge(merged, df, on = ['Country','Year'], how = 'outer')

  if merged is None:
    return pd.DataFrame()
  return merged
    


In [8]:
# Download every indicator listed in `ind_dict` and combine them into one dataframe.
df1= build_wb_df(ind_dict)

NameError: name 'ind_dict' is not defined

In [9]:
# Filtering year and countries


# Years to keep: 1960 through 2020 (the end of range() is exclusive), as ints.
# NOTE(review): confirm these ints match the dtype of the 'Year' column being filtered.
years = list(range(1960,2021))
# Countries to keep, compared by exact name against the 'Country' column.
# NOTE(review): some names (e.g. 'Antarctica', 'Kashmir', 'Somaliland') may not
# appear under this spelling in the downloaded data -- verify.
countries = ['Peru', 'China', 'Costa Rica', 'Afghanistan', 'Bolivia', 'Panama', 
             'Greece', 'New Zealand', 'Haiti', 'Mexico', 'Argentina', 'Kyrgyzstan', 
             'Iran', 'Venezuela', 'Philippines', 'Japan', 'Indonesia', 'Guatemala', 
             'Papua New Guinea', 'Solomon Islands', 'Tajikistan', 'Bhutan', 'Chile', 
             'Myanmar', 'Ethiopia', 'Malawi', 'Colombia', 'Russia', 
             'Democratic Republic of the Congo', 'Taiwan', 'Ecuador', 
             'Vanuatu', 'United States of America', 'United Republic of Tanzania', 
             'India', 'Georgia', 'Albania', 'Namibia', 'Turkey', 'Pakistan', 'Iraq', 
             'Honduras', 'Kazakhstan', 'Brazil', 'Turkmenistan', 'North Macedonia', 
             'Saudi Arabia', 'Uzbekistan', 'Romania', 'Italy', 'Canada', 'South Africa', 
             'Dominican Republic', 'Nepal', 'East Timor', 'Antarctica', 'Iceland', 
             'Malaysia', 'Lebanon', 'Rwanda', 'Mozambique', 'Mongolia', 'Nicaragua', 
             'Bangladesh', 'France', 'Azerbaijan', 'Laos', 'El Salvador', 'Zambia', 
             'Uganda', 'Algeria', 'Bosnia and Herzegovina', 'Jamaica', 'Dominica', 
             'Morocco', 'Trinidad and Tobago', 'Slovenia', 'Jordan', 'Djibouti', 'Fiji', 
             'Tunisia', 'Kosovo', 'United Arab Emirates', 'Spain', 'Australia', 
             'Austria', 'Poland', 'Cyprus', 'Republic of Serbia', 'Denmark',
             'Republic of the Congo', 'Syria', 'Croatia', 'Egypt', 'Thailand', 
             'Sudan', 'Eritrea', 'Madagascar', 'Belgium', 'South Sudan', 'Burundi', 
             'Montenegro', 'Somaliland', 'Tonga', 'Germany', 'Bulgaria', 'Switzerland', 
             'Gabon', 'United Kingdom', 'Portugal', 'Zimbabwe', 'Antigua and Barbuda', 
             'Libya', 'Vietnam', 'Federated States of Micronesia', 'South Korea', 
             'Botswana', 'Kashmir', 'Kenya', 'Angola', 'Paraguay', 'Armenia', 'Hungary', 
             'Israel', 'Norway', 'Guinea', 'Yemen', 'North Korea']


def filter_wb(df, years, countries):
  '''
   Returns filtered dataframe for specific `years` and `countries`
   `df`: dataframe returned by function `build_wb_df`
   `years`: list of input years, as strings or ints. Example: ['2010', '2011'] or [2010, 2011]
   `countries`: list of countries as strings. Example: ['Algeria', 'Angola']

   A row is kept when its 'Year' is one of `years` and its 'Country' is one of
   `countries`. An empty `years` or `countries` list selects no rows. The
   result is re-indexed from 0, with the previous index kept as a column
   (default `reset_index()` behaviour).
  '''

  # Compare years as text so that int input such as range(1960, 2021) still
  # matches a 'Year' column that holds strings, and the other way round.
  wanted_years = [str(year) for year in years]
  mask_year = df['Year'].astype(str).isin(wanted_years)

  mask_country = df['Country'].isin(list(countries))

  return df[mask_year & mask_country].reset_index()

In [10]:
# Keep only the selected years and countries, then preview the first 20 rows.
df = filter_wb(df1, years, countries)

df.head(20)

NameError: name 'df1' is not defined