# Import libraries

In [47]:
from google.cloud import bigquery
import os
from google.oauth2 import service_account
import pandas as pd
import pickle
import seaborn as sns
from sklearn.impute import KNNImputer

# Obtain data

Set up client connection to bigquery api

In [4]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= '../world-bank-indicators-3ccb8c5c2658.json'
client = bigquery.Client()

Get bigquery 'World Bank WDI' dataset

In [5]:
dataset_id = client.dataset("world_bank_wdi", project="bigquery-public-data")
dataset = client.get_dataset(dataset_id)

View tables in dataset

In [6]:
tables = list(client.list_tables(dataset))
for table in tables:
    print(table.table_id)

country_series_definitions
country_summary
footnotes
indicators_data
series_summary
series_time


View indicators_data table

In [7]:
table_id = dataset_id.table("indicators_data")
table = client.get_table(table_id)
[print(item) for item in table.schema]
client.list_rows(table, max_results=5).to_dataframe()

SchemaField('country_name', 'STRING', 'NULLABLE', '', (), None)
SchemaField('country_code', 'STRING', 'NULLABLE', '', (), None)
SchemaField('indicator_name', 'STRING', 'NULLABLE', '', (), None)
SchemaField('indicator_code', 'STRING', 'NULLABLE', '', (), None)
SchemaField('value', 'FLOAT', 'NULLABLE', '', (), None)
SchemaField('year', 'INTEGER', 'NULLABLE', '', (), None)


  if not self._validate_bqstorage(bqstorage_client, create_bqstorage_client):


Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
0,Syrian Arab Republic,SYR,"PPG, IBRD (DOD, current US$)",DT.DOD.MIBR.CD,484497000.0,1991
1,Syrian Arab Republic,SYR,"PPG, IDA (DOD, current US$)",DT.DOD.MIDA.CD,16389000.0,2009
2,Syrian Arab Republic,SYR,"PPG, IDA (DOD, current US$)",DT.DOD.MIDA.CD,14052000.0,2012
3,Syrian Arab Republic,SYR,Prevalence of anemia among women of reproducti...,SH.ANM.ALLW.ZS,37.4,1998
4,Syrian Arab Republic,SYR,"Prevalence of HIV, male (% ages 15-24)",SH.HIV.1524.MA.ZS,0.1,2008


Query anomoly detection dataset from indicators_data table

In [8]:
indicators_query = """
        SELECT *
        FROM `bigquery-public-data.world_bank_wdi.indicators_data`
        WHERE indicator_name IN ('Tax revenue (% of GDP)', 
                                 'General government final consumption expenditure (% of GDP)',
                                 'Children out of school (% of primary school age)',
                                 'GDP growth (annual %)',
                                 'Net investment in nonfinancial assets (% of GDP)',
                                 'Merchandise trade (% of GDP)',
                                 'Employment to population ratio, 15+, total (%) (national estimate)',
                                 'Gross savings (% of GDP)',
                                 'Interest rate spread (lending rate minus deposit rate, %)',
                                 'Inflation, consumer prices (annual %)')"""
indicators_query_job = client.query(indicators_query)
indicators_df = indicators_query_job.to_dataframe()
indicators_df.to_csv('Excel/dataset.csv', index=False)

  "Cannot create BigQuery Storage client, the dependency "


# Scrub data

Read df with indicators data into pandas

In [38]:
df = pd.read_csv('Excel/dataset.csv')
df.head()

Unnamed: 0,country_name,country_code,indicator_name,indicator_code,value,year
0,Tajikistan,TJK,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,7.760139,2007
1,Tanzania,TZA,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,6.781586,2013
2,Lebanon,LBN,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.811023,2013
3,Lesotho,LSO,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,-0.405521,1966
4,"Macao SAR, China",MAC,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,-2.359157,1999


Remove rows with regional and other non-country country_names (which are specified in exclusions.csv) from the dataset

In [39]:
exclusions_df = pd.read_csv('Excel/exclusions.csv')
df.set_index('country_code', inplace=True)
df.drop(list(exclusions_df['country_code'].values), inplace=True)
df.reset_index(inplace=True)
df.head()

Unnamed: 0,country_code,country_name,indicator_name,indicator_code,value,year
0,TJK,Tajikistan,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,7.760139,2007
1,TZA,Tanzania,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,6.781586,2013
2,LBN,Lebanon,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.811023,2013
3,LSO,Lesotho,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,-0.405521,1966
4,STP,Sao Tome and Principe,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,2.376878,2002


View number of repeated indicator measurements for the same country across different years

In [40]:
df['country_indicator'] = df['country_name'] + ' ' + df['indicator_name']
df['country_indicator'].value_counts()

Austria Merchandise trade (% of GDP)                                                   60
Algeria General government final consumption expenditure (% of GDP)                    60
Malawi Merchandise trade (% of GDP)                                                    60
Dominican Republic General government final consumption expenditure (% of GDP)         60
Jamaica Inflation, consumer prices (annual %)                                          60
                                                                                       ..
Libya Employment to population ratio, 15+, total (%) (national estimate)                1
South Sudan Employment to population ratio, 15+, total (%) (national estimate)          1
Papua New Guinea Employment to population ratio, 15+, total (%) (national estimate)     1
Guinea-Bissau Tax revenue (% of GDP)                                                    1
Turkmenistan Employment to population ratio, 15+, total (%) (national estimate)         1
Name: coun

Filter out outdated indicator values from before 2010

In [41]:
df = df[df['year']>=2010]
print(df.shape)
df.head()

(14285, 7)


Unnamed: 0,country_code,country_name,indicator_name,indicator_code,value,year,country_indicator
1,TZA,Tanzania,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,6.781586,2013,Tanzania GDP growth (annual %)
2,LBN,Lebanon,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.811023,2013,Lebanon GDP growth (annual %)
12,USA,United States,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,2.563767,2010,United States GDP growth (annual %)
16,AUS,Australia,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,2.940334,2018,Australia GDP growth (annual %)
23,MHL,Marshall Islands,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.624753,2018,Marshall Islands GDP growth (annual %)


Create list specifying which df rows have the most recent annual measurements for their country/indicator combination

In [42]:
def to_keep(row):
    values = list(df[df['country_indicator']==row['country_indicator']]['year'].values)
    if row['year'] == max(values):
        return True
    else:
        return False
    
row_booleans = df.apply(to_keep, axis=1)
df['keep'] = row_booleans

Filter out df rows with non-most recent country indicator measurements

In [43]:
pre_pivot_df = df[df['keep']==True]
pre_pivot_df.drop(columns=['keep'], inplace=True)
print(pre_pivot_df.shape)
pre_pivot_df.head()

(1678, 7)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,country_code,country_name,indicator_name,indicator_code,value,year,country_indicator
23,MHL,Marshall Islands,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.624753,2018,Marshall Islands GDP growth (annual %)
56,BGR,Bulgaria,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,3.371616,2019,Bulgaria GDP growth (annual %)
93,DZA,Algeria,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,0.8,2019,Algeria GDP growth (annual %)
97,GNB,Guinea-Bissau,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,4.600094,2019,Guinea-Bissau GDP growth (annual %)
133,KAZ,Kazakhstan,GDP growth (annual %),NY.GDP.MKTP.KD.ZG,4.5,2019,Kazakhstan GDP growth (annual %)


Pivot indicator measurements table to countries as row indices, and indicators as column indices

In [44]:
wb_df = pre_pivot_df.pivot(index='country_name', columns='indicator_name', values='value')
wb_df.head()

indicator_name,Children out of school (% of primary school age),"Employment to population ratio, 15+, total (%) (national estimate)",GDP growth (annual %),General government final consumption expenditure (% of GDP),Gross savings (% of GDP),"Inflation, consumer prices (annual %)","Interest rate spread (lending rate minus deposit rate, %)",Merchandise trade (% of GDP),Net investment in nonfinancial assets (% of GDP),Tax revenue (% of GDP)
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,,42.014599,2.901229,13.123956,18.851059,2.302373,,43.478594,18.512123,9.250468
Albania,2.10689,53.391998,2.214063,11.531084,14.290923,1.411091,5.783333,56.374894,3.877773,18.557475
Algeria,0.39044,36.91,0.8,18.60284,37.789191,1.951768,6.25,45.743165,,
Andorra,,,1.849238,,,,,60.049625,,
Angola,18.40073,40.040001,-0.869394,10.474545,25.173194,17.14532,12.879965,51.622323,3.826951,9.223777


Perform wb_df.info() to understand what further scrubbing needed

In [45]:
wb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, Afghanistan to Zimbabwe
Data columns (total 10 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   Children out of school (% of primary school age)                    178 non-null    float64
 1   Employment to population ratio, 15+, total (%) (national estimate)  179 non-null    float64
 2   GDP growth (annual %)                                               193 non-null    float64
 3   General government final consumption expenditure (% of GDP)         174 non-null    float64
 4   Gross savings (% of GDP)                                            165 non-null    float64
 5   Inflation, consumer prices (annual %)                               183 non-null    float64
 6   Interest rate spread (lending rate minus deposit rate, %)           133 non-null    float64
 7   Merchan

Impute missing values with knnimputer

In [57]:
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
imputer.fit(wb_df)
wb_scrubbed = imputer.transform(wb_df)
wb_df_scrubbed = pd.DataFrame(wb_scrubbed, columns=wb_df.columns)
wb_df_scrubbed.index = wb_df.index
wb_df_scrubbed.head()

indicator_name,Children out of school (% of primary school age),"Employment to population ratio, 15+, total (%) (national estimate)",GDP growth (annual %),General government final consumption expenditure (% of GDP),Gross savings (% of GDP),"Inflation, consumer prices (annual %)","Interest rate spread (lending rate minus deposit rate, %)",Merchandise trade (% of GDP),Net investment in nonfinancial assets (% of GDP),Tax revenue (% of GDP)
country_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Afghanistan,8.824358,42.014599,2.901229,13.123956,18.851059,2.302373,7.204411,43.478594,18.512123,9.250468
Albania,2.10689,53.391998,2.214063,11.531084,14.290923,1.411091,5.783333,56.374894,3.877773,18.557475
Algeria,0.39044,36.91,0.8,18.60284,37.789191,1.951768,6.25,45.743165,7.51659,22.01367
Andorra,4.809204,56.970441,1.849238,21.687959,26.182784,2.863752,5.44005,60.049625,3.516514,23.959638
Angola,18.40073,40.040001,-0.869394,10.474545,25.173194,17.14532,12.879965,51.622323,3.826951,9.223777


Confirm no missing values with wb_df_scrubbed.info()

In [58]:
wb_df_scrubbed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 195 entries, Afghanistan to Zimbabwe
Data columns (total 10 columns):
 #   Column                                                              Non-Null Count  Dtype  
---  ------                                                              --------------  -----  
 0   Children out of school (% of primary school age)                    195 non-null    float64
 1   Employment to population ratio, 15+, total (%) (national estimate)  195 non-null    float64
 2   GDP growth (annual %)                                               195 non-null    float64
 3   General government final consumption expenditure (% of GDP)         195 non-null    float64
 4   Gross savings (% of GDP)                                            195 non-null    float64
 5   Inflation, consumer prices (annual %)                               195 non-null    float64
 6   Interest rate spread (lending rate minus deposit rate, %)           195 non-null    float64
 7   Merchan

Engineer 'Budget Defecit' feature from 'Tax revenue (% of GDP)' and 'General government final consumption expenditure (% of GDP)'

In [62]:
wb_df_scrubbed['Budget Deficit'] = wb_df_scrubbed['Tax revenue (% of GDP)'] -
                                   wb_df_scrubbed['General government final consumption expenditure (% of GDP)']
wb_df_scrubbed.head()

SyntaxError: invalid syntax (<ipython-input-62-594663800679>, line 1)

Confirm that data is scrubbed with wb_df.info()

save scrubbed wb_df to csv file for use in modeling file

In [102]:
wb_df.reset_index().to_csv('Scrubbed_wb_dataset.csv', index=False)