# Getting data from the World Bank API

In [1]:
import pandas as pd
from pandas_datareader import wb

## API exploration
A client for this API ships with `pandas-datareader` (the `wb` module imported above) \o/

But there are a **lot** of different variables and pieces of information to deal with: 17328 different indicators

In [2]:
# Getting the variables: the 'id' field is the indicator code that wb.download expects
wb.get_indicators()

Unnamed: 0,id,name,unit,source,sourceNote,sourceOrganization,topics
0,1.0.HCount.1.90usd,Poverty Headcount ($1.90 a day),,LAC Equity Lab,The poverty headcount index measures the propo...,b'LAC Equity Lab tabulations of SEDLAC (CEDLAS...,Poverty
1,1.0.HCount.2.5usd,Poverty Headcount ($2.50 a day),,LAC Equity Lab,The poverty headcount index measures the propo...,b'LAC Equity Lab tabulations of SEDLAC (CEDLAS...,Poverty
2,1.0.HCount.Mid10to50,Middle Class ($10-50 a day) Headcount,,LAC Equity Lab,The poverty headcount index measures the propo...,b'LAC Equity Lab tabulations of SEDLAC (CEDLAS...,Poverty
3,1.0.HCount.Ofcl,Official Moderate Poverty Rate-National,,LAC Equity Lab,The poverty headcount index measures the propo...,b'LAC Equity Lab tabulations of data from Nati...,Poverty
4,1.0.HCount.Poor4uds,Poverty Headcount ($4 a day),,LAC Equity Lab,The poverty headcount index measures the propo...,b'LAC Equity Lab tabulations of SEDLAC (CEDLAS...,Poverty
...,...,...,...,...,...,...,...
17323,per_sionl.overlap_q1_tot,Population in the 1st quintile (poorest) only ...,,The Atlas of Social Protection: Indicators of ...,,b'The Atlas of Social Protection: Indicators o...,Social Protection & Labor
17324,per_sionl.overlap_q1_urb,Population in the 1st quintile (poorest) only ...,,The Atlas of Social Protection: Indicators of ...,,b'The Atlas of Social Protection: Indicators o...,Social Protection & Labor
17325,s_loans_A1,"Outstanding loans per 1,000 adults",,G20 Financial Inclusion Indicators,,b'',
17326,s_policyholders_B2_life,"Insurance policy holders per 1,000 adults (life)",,G20 Financial Inclusion Indicators,Denotes the total number of life insurance pol...,"b'International Monetary Fund, Financial Acces...",


In [3]:
# Searching for variables: returns the rows of the indicator table that match 'alcohol'
wb.search('alcohol')

Unnamed: 0,id,name,unit,source,sourceNote,sourceOrganization,topics
10711,SH.ALC.PCAP.FE.LI,"Total alcohol consumption per capita, female (...",,World Development Indicators,Total alcohol per capita consumption is define...,"b'World Health Organization, Global Health Obs...",Health
10712,SH.ALC.PCAP.LI,Total alcohol consumption per capita (liters o...,,World Development Indicators,Total alcohol per capita consumption is define...,"b'World Health Organization, Global Health Obs...",Health
10713,SH.ALC.PCAP.MA.LI,"Total alcohol consumption per capita, male (li...",,World Development Indicators,Total alcohol per capita consumption is define...,"b'World Health Organization, Global Health Obs...",Health


In [4]:
# Getting the list of countries (only the first row is displayed, as a preview of the columns)
wb.get_countries().head(1)

Unnamed: 0,iso3c,iso2c,name,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,,High income,Not classified,Oranjestad,-70.0167,12.5167


## Getting ready to process the files

As these variables will be added to the data gathered from the [Wikipedia webscraping](wiki_data_combine.ipynb), we need to format it so it merges:

In [5]:
# Alternative country spellings -> the single spelling used when merging tables.
# Entries are grouped by the spelling they map to.
country_rename_dict = {
    # China
    "China, People's Republic of": "China",
    "People's Republic of China": "China",
    # Congo (Brazzaville)
    "Congo": "Congo (Brazzaville)",
    "Congo, Rep.": "Congo (Brazzaville)",
    "Congo, Republic of the": "Congo (Brazzaville)",
    "Congo, Republic of": "Congo (Brazzaville)",
    "Republic of the Congo": "Congo (Brazzaville)",
    # Congo (Kinshasa)
    "Congo, Dem. Rep.": "Congo (Kinshasa)",
    "Congo, Democratic Republic of the": "Congo (Kinshasa)",
    "Democratic Republic of Congo": "Congo (Kinshasa)",
    "Democratic Republic of the Congo": "Congo (Kinshasa)",
    "DR Congo": "Congo (Kinshasa)",
    # Czechia
    "Czech Republic": "Czechia",
    # Egypt
    "Egypt, Arab Rep.": "Egypt",
    # Eswatini
    "Eswatini (Swaziland)": "Eswatini",
    "Swaziland": "Eswatini",
    # Gambia
    "Gambia, The": "Gambia",
    "The Gambia": "Gambia",
    # Hong Kong
    "Hong Kong SAR, China": "Hong Kong",
    # Iran
    "Iran, Islamic Rep.": "Iran",
    # Ivory Coast
    "Cote d'Ivoire": "Ivory Coast",
    "Côte d'Ivoire": "Ivory Coast",
    # Kyrgyzstan
    "Kyrgyz Republic": "Kyrgyzstan",
    # Laos
    "Lao PDR": "Laos",
    # Myanmar
    "Burma": "Myanmar",
    # North Korea
    "Korea, North": "North Korea",
    # Russia
    "Russian Federation": "Russia",
    # Slovakia
    "Slovak Republic": "Slovakia",
    # South Korea
    "Korea, Rep.": "South Korea",
    "Korea, South": "South Korea",
    # Trinidad & Tobago
    "Trinidad and Tobago": "Trinidad & Tobago",
    # United Kingdom
    "Great Britain": "United Kingdom",
    # United States of America
    "United States": "United States of America",
    # Venezuela
    "Venezuela, RB": "Venezuela",
    # Yemen
    "Yemen, Rep.": "Yemen",
}

In [6]:
def rename_country(country_name):
    '''
    Return the harmonised spelling of a country name.

    The name is looked up in 'country_rename_dict'; a name without an entry
    there is returned unchanged.
    '''
    return country_rename_dict.get(country_name, country_name)
    
def table_mod(table):
    '''
    Prepare a table for merging into the existing DataFrame.

    The 'country' column is respelled with 'rename_country' (which relies on
    'country_rename_dict'), renamed to 'countries' and turned into the index.

    The table is modified in place and also returned, so callers may ignore the
    return value. Calling the function again on a table it has already prepared
    is a no-op (instead of raising KeyError on the missing 'country' column),
    which keeps the merge cell re-runnable in a live kernel.
    '''
    # Already indexed by 'countries' and no 'country' column left: nothing to do.
    if 'country' not in table.columns and table.index.name == 'countries':
        return table

    table['country'] = table['country'].map(rename_country)
    table.rename(columns={'country': 'countries'}, inplace=True)
    table.set_index('countries', inplace=True)
    return table

## Country list

Has some categorical information about the countries, such as low/med/high income, etc.

In [7]:
# Fetch the country metadata and call the name column 'country', the label table_mod looks for
countries = wb.get_countries().rename(columns={'name': 'country'})
countries

Unnamed: 0,iso3c,iso2c,country,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,,High income,Not classified,Oranjestad,-70.0167,12.51670
1,AFG,AF,Afghanistan,South Asia,South Asia,Low income,IDA,Kabul,69.1761,34.52280
2,AFR,A9,Africa,Aggregates,,Aggregates,Aggregates,,,
3,AGO,AO,Angola,Sub-Saharan Africa,Sub-Saharan Africa (excluding high income),Lower middle income,IBRD,Luanda,13.2420,-8.81155
4,ALB,AL,Albania,Europe & Central Asia,Europe & Central Asia (excluding high income),Upper middle income,IBRD,Tirane,19.8172,41.33170
...,...,...,...,...,...,...,...,...,...,...
299,XZN,A5,Sub-Saharan Africa excluding South Africa and ...,Aggregates,,Aggregates,Aggregates,,,
300,YEM,YE,"Yemen, Rep.",Middle East & North Africa,Middle East & North Africa (excluding high inc...,Low income,IDA,Sana'a,44.2075,15.35200
301,ZAF,ZA,South Africa,Sub-Saharan Africa,Sub-Saharan Africa (excluding high income),Upper middle income,IBRD,Pretoria,28.1871,-25.74600
302,ZMB,ZM,Zambia,Sub-Saharan Africa,Sub-Saharan Africa (excluding high income),Lower middle income,IDA,Lusaka,28.2937,-15.39820


In [8]:
# Save the country metadata; the file is read back later with index_col=0
countries.to_csv('./data/Countries.csv')

## Internet access

Has information about the population's access to the internet, including some age categories (15-34, 35-59, 60+)

In [9]:
# Indicators matching 'access to internet'; the next cell uses their 'id' and 'name' columns
df = wb.search('access to internet')

In [10]:
# Download every indicator found by the search above, for all countries, for 2017 only
internet = wb.download(indicator=list(df['id']), country='all', start=2017, end=2017)
# wb.download labels each column with its indicator id. Translate ids to readable names
# by lookup rather than by position, so the labels stay correct even if the download
# returns fewer columns than requested or returns them in another order.
internet = internet.rename(columns=dict(zip(df['id'], df['name'])))
internet

Unnamed: 0_level_0,Unnamed: 1_level_0,Access to internet (% age 15+),"Access to internet, female (% age 15+)","Access to internet, male (% age 15+)","Access to internet, income, poorest 40% (% age 15+)","Access to internet, income, richest 60% (% age 15+)",Access to internet (% ages 15-34),Access to internet (% ages 35-59),Access to internet (% age 60+)
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Afghanistan,2017,14.80909,4.882765,24.62060,9.009466,18.66901,18.33455,9.081439,7.316440
Albania,2017,47.62765,44.803910,50.62286,34.689220,56.23954,75.53036,37.041380,17.318520
Algeria,2017,56.46716,52.998200,59.92853,42.665030,65.64495,69.20241,44.924090,27.347940
American Samoa,2017,,,,,,,,
Andorra,2017,,,,,,,,
...,...,...,...,...,...,...,...,...,...
Virgin Islands (U.S.),2017,,,,,,,,
West Bank and Gaza,2017,55.03128,53.860550,56.24088,30.812310,71.13063,60.99529,50.841650,29.335640
"Yemen, Rep.",2017,,,,,,,,
Zambia,2017,25.41447,20.395380,30.64323,12.752460,33.85024,30.59592,18.029440,8.907472


In [11]:
# Save the internet-access table; 'country' and 'year' come back as ordinary columns when the file is re-read
internet.to_csv('./data/Access_to_internet.csv')

## GDP (cross-referencing with Wikipedia)

Has information about the GDP per capita, in current USD. Conveniently, it is the one used on Wikipedia

In [12]:
# Indicators matching 'GDP per capita'; used in the next cell to label the downloaded column
gdp = wb.search('GDP per capita')

In [13]:
# GDP per capita (current US$) for all countries, for 2018 and 2019
df = wb.download(indicator=['NY.GDP.PCAP.CD'], country='all', start=2018, end=2019)
# Replace the indicator id used as column label with its readable name from the search result
df = df.rename(columns=dict(zip(gdp['id'], gdp['name'])))
# Two years are requested, so a country can have two non-missing rows. Keep only the most
# recent one: GDP.csv is later indexed by country alone, and a duplicated country would
# duplicate rows of the master table in the left join.
df = (
    df.dropna()
      .sort_index(level='year')
      .groupby(level='country')
      .tail(1)
      .sort_index()
)

In [14]:
# Save the GDP per capita table for the merge step further down
df.to_csv('./data/GDP.csv')

## Suicide

Has information about suicide rates: Male, Female and Global

In [15]:
# Indicators matching 'suicide'; their 'name' values label the downloaded columns in the next cell
suicide = wb.search('suicide')

In [16]:
# Suicide mortality rates (female, male, overall) for all countries, for 2016 only
df = wb.download(
    indicator=['SH.STA.SUIC.FE.P5', 'SH.STA.SUIC.MA.P5', 'SH.STA.SUIC.P5'],
    country='all',
    start=2016,
    end=2016,
)
# wb.download labels each column with its indicator id. Translate ids to readable names
# by lookup rather than by position, so the labels stay correct even if the search
# returns a different number of indicators or lists them in another order.
df = df.rename(columns=dict(zip(suicide['id'], suicide['name'])))
df

Unnamed: 0_level_0,Unnamed: 1_level_0,"Suicide mortality rate, female (per 100,000 female population)","Suicide mortality rate, male (per 100,000 male population)","Suicide mortality rate (per 100,000 population)"
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arab World,2016,2.445834,5.953132,4.271912
Caribbean small states,2016,3.906739,14.327918,9.100475
Central Europe and the Baltics,2016,5.155971,25.854199,15.157308
Early-demographic dividend,2016,8.066567,11.719699,9.935441
East Asia & Pacific,2016,8.411334,10.562305,9.508817
...,...,...,...,...
Virgin Islands (U.S.),2016,,,
West Bank and Gaza,2016,,,
"Yemen, Rep.",2016,5.600000,11.300000,8.500000
Zambia,2016,3.500000,8.800000,6.100000


In [17]:
# Save the suicide-rate table for the merge step further down
df.to_csv('./data/Suicide.csv')

# Creating the master dataframe

Now that our files have been created, we import the data from those CSV files, drop a few columns and readjust the index so it can all be added to the DataFrame we will use for our analysis.

We went through the hassle of creating/importing CSV files as it is more robust and faster when resuming work on the project (especially when pair programming), but these steps are not required: we could have created a database instead.

In [18]:
# Importing the CSV files and discarding the columns that are not merged
combined = pd.read_csv('./data/combined_df.csv')

continents = pd.read_csv('./data/continents.csv').drop(columns='iso3c')

# The first column of this file is the saved index
countries = pd.read_csv('./data/Countries.csv', index_col=0)

gdp = pd.read_csv('./data/GDP.csv').drop(columns='year')

internet = pd.read_csv('./data/Access_to_internet.csv').drop(columns='year')

suicide = pd.read_csv('./data/Suicide.csv').drop(columns='year')

In [19]:
# Merging into a master table: each auxiliary table is re-keyed by country name with
# table_mod (in place), then left-joined onto the running result via its 'countries' column
df_combined = combined
for extra_table in (gdp, suicide, countries, internet, continents):
    table_mod(extra_table)
    df_combined = df_combined.join(extra_table, on='countries', how='left')

In [20]:
# Display the merged table to check the result of the joins
df_combined

Unnamed: 0,countries,happiness,GDP_per_capita,social_support,healthy_life_exp,freedom,generosity,corruption,alcohol_consumption,cigarette_consumption,...,latitude,Access to internet (% age 15+),"Access to internet, female (% age 15+)","Access to internet, male (% age 15+)","Access to internet, income, poorest 40% (% age 15+)","Access to internet, income, richest 60% (% age 15+)",Access to internet (% ages 15-34),Access to internet (% ages 35-59),Access to internet (% age 60+),continent
0,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393,12.3,1098.8,...,60.16080,91.492410,88.838040,94.352520,86.586430,94.761090,99.264210,97.733920,78.803990,Europe
1,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410,11.4,1298.0,...,55.67630,92.331520,89.203510,95.526910,88.256040,95.045990,99.494810,98.093170,77.035940,Europe
2,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341,7.7,552.8,...,59.91380,95.795970,95.889020,95.703790,94.078900,96.939320,100.000000,97.917570,88.694240,Europe
3,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118,7.1,848.1,...,64.13530,,,,,,,,,Europe
4,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298,9.9,1459.9,...,52.37380,91.965280,90.310570,93.652470,92.005390,91.938610,100.000000,96.294080,77.611290,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411,9.8,94.0,...,-1.95325,11.482790,8.280919,15.106390,5.251483,15.635840,16.327590,4.351039,1.870723,Africa
152,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147,7.7,181.8,...,-6.17486,18.970620,13.864560,24.301670,10.125620,24.826540,25.424040,11.978890,2.366465,Africa
153,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025,0.7,311.6,...,34.52280,14.809090,4.882765,24.620600,9.009466,18.669010,18.334550,9.081439,7.316440,Asia
154,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035,3.8,213.2,...,5.63056,7.161354,4.901110,9.594541,3.569112,9.546126,9.016120,5.594250,0.000000,Africa


## Cleaning the data

In [21]:
# Give a few columns more explicit names
df_combined = df_combined.rename(
    columns={
        'countries': 'country',
        'lifeexps': 'life_expectancy',
        'railways_per_capita': 'population_per_railways_km',
    }
)

In [22]:
# Replace missing values with 0 in the columns where that makes sense
list_of_columns = [
    'gold_medals_2016',
    'silver_medals_2016',
    'bronze_medals_2016',
    'oscar_winning',
]
df_combined[list_of_columns] = df_combined[list_of_columns].fillna(0)

In [23]:
# Delete the countries that are missing from a lot of columns
list_of_countries_to_drop = [
    'Kosovo',
    'Northern Cyprus',
    'Palestinian Territories',
    'South Sudan',
]
df_combined = df_combined[~df_combined['country'].isin(list_of_countries_to_drop)]

In [24]:
# Adjusting sectors: they are not numbers. Each value is expected to be text ending in a
# one-character unit suffix (presumably '%' — verify against the source data).
def _sector_to_float(value):
    """Convert one sector-share cell to a float.

    A string loses its surrounding whitespace and at most one trailing non-digit
    character before conversion. Anything that is not a string (NaN, or a float
    left by a previous run of this cell) is passed to ``float`` untouched, so
    missing values stay NaN and re-running the cell does not chop off a digit.
    """
    if isinstance(value, str):
        value = value.strip()
        if value and not value[-1].isdigit():
            value = value[:-1]
    return float(value)


for sector_column in ('agricultural', 'industrial', 'service'):
    df_combined[sector_column] = df_combined[sector_column].apply(_sector_to_float)

In [25]:
def remove_comma(string):
    """
    Strip every comma from the input.

    Only values whose type is exactly str are altered; anything else (a float,
    for instance) is handed back untouched.
    """
    if type(string) != str:
        return string
    return string.replace(',', '')

In [26]:
# Drop the commas from the railways figures, then cast every entry to float
df_combined['population_per_railways_km'] = [
    float(remove_comma(raw_value))
    for raw_value in df_combined['population_per_railways_km']
]

In [27]:
# Save the cleaned master table
df_combined.to_csv('./data/finaldf.csv')