In [None]:
# Libraries
import pandas as pd
import requests as rq
import re as re
import pickle

In [None]:
# Collect the World Bank indicator codes referenced in the "Data sources" sheet.
# Only column C is loaded (header=None, first 3 rows skipped); it is addressed as label 2 below.
df_links = pd.read_excel("data_files/Data-sheet-STUDENTS.xlsx", sheet_name="Data sources", header=None, usecols="C", skiprows=3, na_values="")
df_links = df_links.dropna()

# Raw string so "\w" reaches the regex engine untouched (it is an invalid escape in a plain literal).
# The code is one or more word characters or dots following "indicator/".
indicator_pattern = re.compile(r"indicator/([\w.]+)")

links = []
for index, link in df_links[2].items():
    # A cell may hold a non-text value; normalise so the substring test below cannot raise
    link = str(link)

    # We don't handle other sources for now
    if "data.worldbank" not in link:
        continue

    # Match the useful indicator in the link
    match = indicator_pattern.search(link)
    if match is None:
        # A World Bank URL without an indicator segment gives us nothing to request later
        continue
    links.append(match.group(1))

In [None]:
# Load the countries table from the default sheet of the workbook
# (header=2 selects the header row, index_col=0 makes the first column the index)
df = pd.read_excel(
    "data_files/Data-sheet-STUDENTS.xlsx",
    index_col=0,
    header=2,
)

In [None]:
# Lookup table: country name as written in the sheet -> its ISO codes
iso_codes = dict()

In [None]:
# Resolve every country name of the sheet to its ISO codes through the REST Countries API.
# Each result is stored as iso_codes[name] = {"iso3", "iso2", "other"}, keyed by the raw
# spreadsheet name; "other" keeps the name that was actually sent to the API.
# NOTE(review): confirm the restcountries.eu v2 endpoint is still reachable before relying on this cell.

# Raw spreadsheet spelling -> name to send to the API instead
name_overrides = {
    "Thailiand": "Thailand",
    "The United Arab Emirates": "United Arab Emirates",
    "Brunei": "Brunei Darussalam",
    "Myanmar": "Burma",
    "East Timor": "Timor-Leste",
    "Kyrgyzstan": "Kyrgyz Republic",
    "United States of America": "United States",
}

for index, name in df['Country (according to CN Gov):'].items():

    # Empty cells are read as NaN (a float): nothing to look up, and .split() would raise
    if not isinstance(name, str):
        continue

    # We don't want to redo the request if the name already exists
    if name in iso_codes:
        continue

    # Clean the data: drop any parenthesised suffix, unless the raw name has an explicit override
    cleaned_name = name_overrides.get(name, name.split('(')[0].strip())

    # Request part (the timeout keeps one stalled connection from hanging the whole cell)
    apiURL = "https://restcountries.eu/rest/v2/name/" + cleaned_name
    try:
        resp = rq.get(apiURL, timeout=30)
    except rq.RequestException as err:
        print("REQUEST ERROR", cleaned_name, err)
        continue

    # Error handler
    if resp.status_code != 200:
        print("STATUS_CODE ERROR", resp.status_code, resp.text)
        continue

    # Response handler: the first entry of the returned list is used
    data = resp.json()
    if not data:
        print("EMPTY RESPONSE", cleaned_name)
        continue
    alpha3Code = data[0]['alpha3Code']
    alpha2Code = data[0]['alpha2Code']
    if alpha3Code == "IOT":
        # IOT is remapped to IND (presumably the name search returned another territory first);
        # the 2-letter code must follow, otherwise iso3 and iso2 describe different countries.
        alpha3Code = "IND"
        alpha2Code = "IN"
    iso_codes[name] = {"iso3" : alpha3Code, "iso2" : alpha2Code, "other" : cleaned_name}

In [None]:
# Display the collected country -> ISO-code mapping for a visual check
iso_codes

In [None]:
# Serialise the iso_codes mapping to disk for later use
with open('objects_saved/isocodes.pkl', 'wb') as pkl_file:
    pickle.dump(iso_codes, pkl_file)

In [None]:
# Per-country store of fetched values: country name -> {indicator code -> value}
countries_inds = dict()

In [None]:
# World Bank indicator code -> name of the df column that receives its value.
# Declared as (code, column) pairs; insertion order is preserved.
indicators = dict([
    ("GC.TAX.IMPT.ZS", "IMF WB\nCustoms and other import duties (% of tax revenue) 2016"),
    ("FB.AST.NPER.ZS", "WB IMF Non-performing loans to total gross loans ratio (%)"),
    ("IS.RRS.GOOD.MT.K6", "UIC Rail freight (mn ton x km travelled)"),
    ("IS.AIR.GOOD.MT.K1", "ICAO air freight (mn ton x km travelled)"),
    ("NE.CON.PRVT.PP.CD", "WB Household final consumption expenditure, PPP bn USD"),
    ("GB.XPD.RSDV.GD.ZS", "WB UNESCO R&D Expenditure % of GDP"),
    ("SP.POP.SCIE.RD.P6", "WB UNESCO Researchers p. mn inhabitants"),
    ("IC.REG.COST.PC.ZS", "WB Cost of Starting a Business"),
    ("GC.DOD.TOTL.GD.ZS", "WB Central Government Debt"),
    ("SH.XPD.CHEX.PP.CD", "WHO Current Health Expenditure p.c., PPP USD"),
    ("IP.PAT.RESD", "WIPO Patent Applications"),
    ("IP.TMK.RSCT", "WIPO Trademark applications"),
    ("BX.KLT.DINV.WD.GD.ZS", "IFDI Net Inflows (% of GDP)"),
])

In [None]:
# Fetch one value (mrv=1) of every indicator in `links` for every country in `iso_codes`
# from the World Bank API. Values accumulate in countries_inds[country][indicator];
# pairs that are already stored are not requested again, so the cell can be re-run.
for indic in links:
    for country in iso_codes:

        # If the data is already retrieved we don't make the request
        fetched = countries_inds.setdefault(country, {})
        if indic in fetched:
            continue

        iso3 = iso_codes[country]['iso3']

        # Request part (the timeout keeps one stalled connection from hanging the whole cell)
        apiURL = "http://api.worldbank.org/v2/country/"+ iso3 +"/indicator/"+ indic +"?format=json&mrv=1"
        try:
            resp = rq.get(apiURL, timeout=30)
        except rq.RequestException as err:
            print("REQUEST ERROR", err)
            print("Indicator : " + indic + " - country : " + iso3)
            print("URL : " + apiURL)
            continue

        # Error handler
        if resp.status_code != 200:
            print("STATUS_CODE ERROR", resp.status_code, resp.text)
            print("Indicator : " + indic + " - country : " + iso3)
            print("URL : " + apiURL)
            continue

        # Response handler
        data = resp.json()
        # The value is read from data[1][0]; a payload without a second element
        # (e.g. an error message returned with status 200) or with a missing/empty
        # second element has nothing to read, so the pair is skipped.
        if not isinstance(data, list) or len(data) < 2 or not data[1]:
            continue
        fetched[indic] = data[1][0]['value']

In [None]:
# Display the fetched indicator values per country for a visual check
countries_inds

In [None]:
# Fill the empty cells of df with the values fetched from the World Bank.
# Existing (non-null) values in df are never overwritten.
country_column = 'Country (according to CN Gov):'
unmapped_indicators = set()

for country, fetched in countries_inds.items():
    # Locate the row(s) by country name. countries_inds is keyed by the raw names of this
    # column; a position inside countries_inds stops matching the df row order as soon as
    # one country was skipped during the ISO lookup.
    is_country = df[country_column] == country

    for indic, value in fetched.items():
        column = indicators.get(indic)
        if column is None or column not in df.columns:
            # Indicator codes come from the sheet's link list and may have no target column
            unmapped_indicators.add(indic)
            continue
        if value is None:
            # The API reported no value: nothing to fill in
            continue

        # One .loc call with a row mask and a column label writes into df itself;
        # df[column].iloc[i] = value is chained indexing and may assign to a temporary copy.
        df.loc[is_country & df[column].isna(), column] = value

if unmapped_indicators:
    print("No target column for indicators :", sorted(unmapped_indicators))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [None]:
# Display the countries table after the fill step
df

In [None]:
# Persist the completed countries table as a pickle file
df.to_pickle(path="objects_saved/data.pkl")