***The content of this file is largely drawn from the teaching material of the Data Science unit at the University of Bristol***

In [None]:
# Import tools that we will need:

# Opening web sites and web scraping: 
import requests

# OS. Sometimes need this for finding working directory:
import os

# JSON. This helps us make JSON look prettier and easier to read
import json

# Files. This is part of Colab - allows you to upload and download files
from google.colab import files

import pandas as pd

import numpy as np

# glob. Module used to return all file paths that match a specific pattern 
import glob

Data from the World Bank API can't be loaded directly into Vega because the JSON it returns is doubly nested. One alternative is an XML-to-JSON converter, but this only loads data for a single country in Vega. The two methods that can be used are a for loop with a pandas DataFrame in Python, or downloading the CSV files directly from the World Bank. Both methods mean that the data are not updated automatically and have to be amended manually over time. Nonetheless, the uploaded data is a static file, so it is unlikely to break in the way an automated API feed can.

List of indicators used in the project:
*   **NY.GDP.PCAP.PP.CD** - GDP per capita, PPP (current international $)
*   **SI.POV.GINI** - Gini index (World Bank estimate) 
*   **IC.LGL.CRED.XQ** - Strength of legal rights index
*   **GC.TAX.TOTL.GD.ZS** - Tax revenue (% of GDP)
*   **SP.DYN.IMRT.IN** - Mortality rate, infant (per 1,000 live births)
*   **IT.NET.SECR.P6** - Secure Internet servers (per 1 million people) 
*   **SE.SEC.NENR** - School enrollment, secondary (% net)



In [None]:
data=[]

# list of countries to batch download (ISO3 codes, one line per category used later on)
countries = ['RUS', 'EST', 'LVA', 'LTU', 'BLR', 'UKR', 'ARM', 'KGZ', 'GEO', 'MDA', 'KAZ', 'TJK', 'UZB', 'AZE',
             'GBR', 'MLT', 'SGP', 'IRL', 'IND', 'MYS', 'ZAF', 'FJI', 'EGY', 'ZWE', 'MMR', 'JAM', 'IRQ', 'LKA',
             'ESP', 'LUX', 'ITA', 'NLD', 'MEX', 'PER', 'MAR', 'ARG', 'GTM', 'NIC', 'COL', 'VEN', 'URY', 'PRY']

# The World Bank API pages its results (50 rows per page by default), so ask
# for a page large enough to hold a country's whole yearly series at once.
# Without this, only the first page of each series is downloaded.
perPage=1000

for country in countries:

  # put indicator in brackets {}
  url='https://api.worldbank.org/v2/country/'+country+'/indicator/{}'

  # the timeout stops one stalled request from hanging the whole cell,
  # raise_for_status() turns an HTTP error into an exception straight away
  response=requests.get(url, params={'format': 'json', 'per_page': perPage}, timeout=30)
  response.raise_for_status()
  payload=response.json()

  # a successful reply is a 2-element list: [paging info, observations];
  # anything shorter is an API error message (e.g. an invalid indicator code)
  if not isinstance(payload, list) or len(payload) < 2:
    raise ValueError('Unexpected World Bank API response for '+country+': '+str(payload))

  # getting the data which is the 2nd element in the list
  jsonData=payload[1]

  # the 2nd element is empty/null when the API has nothing for this country;
  # skip it so that every entry of `data` is a non-empty list of records
  if not jsonData:
    print('No data returned for '+country+', skipping')
    continue

  # joins all countries in one list (one list of records per country)
  data.append(jsonData)

In [None]:
# flatten the per-country lists into one list of records.
# np.array(data).flatten() only works when every country returned the same
# number of rows; this explicit flatten also copes with lists of different
# lengths and skips a country whose entry is empty or None.
flatData=[record for page in data if page for record in page]

# creates a panda data frame from the flattened list of records
pdData=pd.json_normalize(flatData)

pdData.head(3)

In [None]:
# renaming columns, name the column {} the indicator
# NOTE: the names are assigned by position, so this relies on pdData having
# exactly these ten columns in this order - check against pdData.head(3)
pdData.columns = ['countryCode','year','{}', 'unit','obsStat','decimal','indicatorCode','indicator','countryID','country']

# dropping any missing values in the specified column.
# .copy() makes actData an independent data frame, so assigning the new
# "category" column to it afterwards does not trigger SettingWithCopyWarning
actData = pdData.dropna(subset=["{}"]).copy()

In [None]:
# set the column "category" (created if it does not exist yet) to "Russian"
# for every row whose "countryCode" is one of the codes listed below
russianCodes = ["RUS", "EST", "LVA", "LTU", "BLR", "UKR", "ARM",
                "KGZ", "GEO", "MDA", "KAZ", "TJK", "UZB", "AZE"]
isRussian = actData['countryCode'].isin(russianCodes)
actData.loc[isRussian, 'category'] = 'Russian'

In [None]:
# set the column "category" to "British" for every row whose "countryCode"
# is one of the codes listed below
britishCodes = ["GBR", "MLT", "SGP", "IRL", "IND", "MYS", "ZAF",
                "FJI", "EGY", "ZWE", "MMR", "JAM", "IRQ", "LKA"]
isBritish = actData['countryCode'].isin(britishCodes)
actData.loc[isBritish, 'category'] = 'British'

In [None]:
# set the column "category" to "Spanish" for every row whose "countryCode"
# is one of the codes listed below
spanishCodes = ["ESP", "LUX", "ITA", "NLD", "MEX", "PER", "MAR",
                "ARG", "GTM", "NIC", "COL", "VEN", "URY", "PRY"]
isSpanish = actData['countryCode'].isin(spanishCodes)
actData.loc[isSpanish, 'category'] = 'Spanish'

In [None]:
# preview the first five rows (five is the default for head) to check the new "category" column
actData.head()

In [None]:
# changing the working directory
os.chdir("/content")

# downloading from panda data frame to csv, for columns country, {}, year and category
# name the file in the brackets {}
# index=False keeps the data frame's row index out of the file, so the csv
# contains only the four columns listed here and no extra unnamed first column
actData[['country','year','{}', 'category']].to_csv("{}.csv", index=False)