## Data preprocessing

In [840]:
# Filter the warnings
import warnings

warnings.filterwarnings('ignore')

**Importing the libraries**

In [841]:
# Miscellaneous libraries
import os
import json
import numpy as np
import pandas as pd
import geopandas as gpd

# For pretty display of pandas dataframes in the notebook
from IPython.display import display, HTML
# Extend the margins of the notebook
display(HTML("<style>.container { width:98% !important; }</style>"))

Make sure you are in the correct working directory:

In [842]:
# Current working directory
print(f'Current working directory: {os.getcwd()}')

# Set the relative data path for all the files
raw_path = r'./DATA/GENDER_GAP/raw/'
processed_path = r'./DATA/GENDER_GAP/processed/'

Current working directory: /Users/berges/Desktop/3r/VI/PROJECTE2


**Reading education data**

In [843]:
# Define the colnames for the data (there are names with whitespaces)
colnames = ['Country', 'Country_Code', 'Indicator', 'Indicator_Code']
colnames.extend([str(year) for year in range(1960, 2020)])
colnames.append('extra_comma')

# Read the file
education_data = pd.read_csv(
    raw_path + 'education_data.csv', names = colnames, header = 0
)

Tere are some columns that we do not want:
- We will delete both country and indicator codes, as we already have their full names
- We will delete years from 1960 to 2000, as the majority of the values are null for those years, and thus we have a visualization of the last 20 years.
- We will delete the last column, as all of its elements are null values, because for some reason all the rows from the raw csv data are ended with a comma ','

In [844]:
# Delete some useless columns
cols_to_drop = [
    'Country_Code',       # We already have the country name
    'Indicator_Code',     # We already have the indicator name
    'extra_comma'         # Last column: all the csv file rows are ended with a comma ',' (for some reason)
]
# Append to the latter list years from 1960 to 2000
cols_to_drop.extend([str(year) for year in range(1960, 2000)])
# Append 2018 and 2019 as well as they have very few observations
cols_to_drop.extend(['2018', '2019'])

education_data = education_data.drop(columns = cols_to_drop)

We performed an exhaustive exploration of the different indicator variables:
- There are a total of 570 different indicators for 263 different countries
- We manually selected those that are somehow related with **education**

In [845]:
educationIndicators = [
    # Population related
    "Population, female (% of total)",
    # Children in employment
    "Children in employment, female (% of female children ages 7-14)",
    "Children in employment, male (% of male children ages 7-14)",
    # Out-of-school
    "Children out of school, primary, female",
    "Children out of school, primary, male",
    "Male primary school age children out-of-school (%)",
    "Female primary school age children out-of-school (%)",
    "Rate of out-of-school youth of upper secondary school age, female (%)",
    "Rate of out-of-school youth of upper secondary school age, male (%)",
    # Educational attainment
    "Educational attainment, at least Bachelor's or equivalent, population 25+, female (%) (cumulative)",
    "Educational attainment, at least Bachelor's or equivalent, population 25+, male (%) (cumulative)",
    "Educational attainment, at least completed lower secondary, population 25+, female (%) (cumulative)",
    "Educational attainment, at least completed lower secondary, population 25+, male (%) (cumulative)",
    "Educational attainment, at least completed post-secondary, population 25+, female (%) (cumulative)",
    "Educational attainment, at least completed post-secondary, population 25+, male (%) (cumulative)",
    "Educational attainment, at least completed primary, population 25+ years, female (%) (cumulative)",
    "Educational attainment, at least completed primary, population 25+ years, male (%) (cumulative)",
    "Educational attainment, at least completed short-cycle tertiary, population 25+, female (%) (cumulative)",
    "Educational attainment, at least completed short-cycle tertiary, population 25+, male (%) (cumulative)",
    "Educational attainment, at least completed upper secondary, population 25+, female (%) (cumulative)",
    "Educational attainment, at least completed upper secondary, population 25+, male (%) (cumulative)",
    "Educational attainment, at least Master's or equivalent, population 25+, female (%) (cumulative)",
    "Educational attainment, at least Master's or equivalent, population 25+, male (%) (cumulative)",
    "Educational attainment, Doctoral or equivalent, population 25+, female (%) (cumulative)",
    "Educational attainment, Doctoral or equivalent, population 25+, male (%) (cumulative)",
    # Literacy rates
    "Literacy rate, adult female (% of females ages 15 and above)",
    "Literacy rate, adult male (% of males ages 15 and above)",
    "Literacy rate, youth (ages 15-24), gender parity index (GPI)",
    "Literacy rate, youth female (% of females ages 15-24)",
    "Literacy rate, youth male (% of males ages 15-24)",
    # Completion rates
    "Lower secondary completion rate, female (% of relevant age group)",
    "Lower secondary completion rate, male (% of relevant age group)",
    "Primary completion rate, female (% of relevant age group)",
    "Primary completion rate, male (% of relevant age group)",
    # Below minimum reading proficiency at end of primary
    "Male pupils below minimum reading proficiency at end of primary (%). Low GAML threshold",
    "Female pupils below minimum reading proficiency at end of primary (%). Low GAML threshold",
    # Number of pupils and teachers
    "Primary education, pupils (% female)",
    "Primary education, teachers (% female)",
    "Secondary education, pupils (% female)",
    "Secondary education, teachers (% female)",
    # Progression to secondary school
    "Progression to secondary school, female (%)",
    "Progression to secondary school, male (%)",
    # School enrollment
    "School enrollment, primary (gross), gender parity index (GPI)",
    "School enrollment, primary and secondary (gross), gender parity index (GPI)",
    "School enrollment, primary, female (% gross)",
    "School enrollment, primary, female (% net)",
    "School enrollment, primary, male (% gross)",
    "School enrollment, primary, male (% net)",
    "School enrollment, secondary (gross), gender parity index (GPI)",
    "School enrollment, secondary, female (% gross)",
    "School enrollment, secondary, female (% net)",
    "School enrollment, secondary, male (% gross)",
    "School enrollment, secondary, male (% net)",
    "School enrollment, tertiary (gross), gender parity index (GPI)",
    "School enrollment, tertiary, female (% gross)",
    "School enrollment, tertiary, male (% gross)",
    # Female-Male students share
    "Share of female students in lower secondary education enrolled in vocational programmes (%)",
    "Share of female students in post-secondary non-tertiary education enrolled in vocational programmes (%)",
    "Share of female students in upper secondary education enrolled in vocational programmes (%)",
    "Share of male students in lower secondary education enrolled in vocational programmes (%)",
    "Share of male students in post-secondary non-tertiary education enrolled in vocational programmes (%)",
    "Share of male students in upper secondary education enrolled in vocational programmes (%)"
]

Filter the data for those education indicators:

In [846]:
education_data = education_data[education_data.Indicator.isin(educationIndicators)]

**Filter the indicators considering null values**

Some of this indicators have lots of null values. In order to ease our visualization pipeline, we will consider those with more "complete" data.

In [847]:
indicator_null_rate = {}

for indicator, indicator_data in education_data.groupby('Indicator', sort = False):
    # Hence 5260 is the total amount of items per indicator: 20 years * 263 countries = 5260
    indicator_null_rate[indicator] = indicator_data.isnull().sum().sum() / 5260

# Visualize how a lot of indicators have lots of missing values
sorted(indicator_null_rate.items(), key = lambda x: -x[1])[:5]

[('Female primary school age children out-of-school (%)', 0.8562737642585552),
 ('Female pupils below minimum reading proficiency at end of primary (%). Low GAML threshold',
  0.8562737642585552),
 ('Male primary school age children out-of-school (%)', 0.8562737642585552),
 ('Male pupils below minimum reading proficiency at end of primary (%). Low GAML threshold',
  0.8562737642585552),
 ('Educational attainment, Doctoral or equivalent, population 25+, female (%) (cumulative)',
  0.8553231939163498)]

Hence that there are a lot of indicators with a high percentage of null values, thus we will only consider the following indicators:

In [848]:
educationIndicators = [
    # Population related
    "Population, female (% of total)",
    # Out-of-school
    "Children out of school, primary, female",
    "Children out of school, primary, male",
    "Rate of out-of-school youth of upper secondary school age, female (%)",
    "Rate of out-of-school youth of upper secondary school age, male (%)",
    # Completion rates
    "Lower secondary completion rate, female (% of relevant age group)",
    "Lower secondary completion rate, male (% of relevant age group)",
    "Primary completion rate, female (% of relevant age group)",
    "Primary completion rate, male (% of relevant age group)",
    # Number of pupils and teachers
    "Primary education, pupils (% female)",
    "Primary education, teachers (% female)",
    "Secondary education, pupils (% female)",
    "Secondary education, teachers (% female)",
    # School enrollment
    "School enrollment, primary (gross), gender parity index (GPI)",
    "School enrollment, primary, female (% gross)",
    "School enrollment, primary, male (% gross)",
    "School enrollment, secondary (gross), gender parity index (GPI)",
    "School enrollment, secondary, female (% gross)",
    "School enrollment, secondary, male (% gross)",
    "School enrollment, tertiary (gross), gender parity index (GPI)",
    "School enrollment, tertiary, female (% gross)",
    "School enrollment, tertiary, male (% gross)"
]

Finally, we reduced the total amount of indicators from 570 to 21! It is important to notice that this is not the final list of indicators we will consider, but will serve as a good reference for choosing the countries to take into consideration.

In [849]:
education_data = education_data[education_data.Indicator.isin(educationIndicators)].reset_index(drop = True)

**Country selection**: we will consider those countries with fewer null values. We will also take into account the dimension of the country and the relevance (in terms of what we expect the gender gap to be) in order to choose our 20 countries:

In [850]:
country_null_rate = {}

for country, country_data in education_data.groupby('Country', sort = False):
    # Hence 378 is the total amount of items per country: 18 years * 21 indicators = 378
    country_null_rate[country] = country_data.isnull().sum().sum() / 378

# Filter the countries that have more than 25% missing observations
items = [country for country, nulls in sorted(country_null_rate.items(), key = lambda x: x[1])]
items

['Arab World',
 'Central Europe and the Baltics',
 'Early-demographic dividend',
 'East Asia & Pacific',
 'East Asia & Pacific (excluding high income)',
 'East Asia & Pacific (IDA & IBRD)',
 'Euro area',
 'Europe & Central Asia',
 'Europe & Central Asia (excluding high income)',
 'Europe & Central Asia (IDA & IBRD)',
 'European Union',
 'Heavily indebted poor countries (HIPC)',
 'High income',
 'IBRD only',
 'IDA & IBRD total',
 'IDA blend',
 'IDA only',
 'Late-demographic dividend',
 'Latin America & Caribbean',
 'Latin America & Caribbean (excluding high income)',
 'Latin America & Caribbean (IDA & IBRD)',
 'Least developed countries: UN classification',
 'Low & middle income',
 'Low income',
 'Lower middle income',
 'Middle income',
 'OECD members',
 'Other small states',
 'Pre-demographic dividend',
 'Small states',
 'South Asia',
 'South Asia (IDA & IBRD)',
 'Sub-Saharan Africa',
 'Sub-Saharan Africa (excluding high income)',
 'Sub-Saharan Africa (IDA & IBRD)',
 'Upper middle inco

Select 20 countries for the visualization:

In [851]:
# Select only the desired countries:
countries = [
    'Finland', 'Korea, Rep.', 'Indonesia', 'Malaysia', 'Italy', 'Mexico', 'Morocco', 'Argentina', 'North America', 'Poland',
    'India', 'Iceland', 'Pakistan', 'Ukraine', 'Colombia', 'United Kingdom', 'Spain', 'France', 'Niger', 'Mozambique', 'Ghana',
    'Uzbekistan', 'Norway', 'Kyrgyz Republic', 'Senegal'
]

In [852]:
education_data = education_data[education_data.Country.isin(countries)].reset_index(drop = True)

In [853]:
indicator_null_rate = {}

for indicator, indicator_data in education_data.groupby('Indicator', sort = False):
    # Hence 360 is the total amount of items per indicator: 18 years * 20 countries = 360
    indicator_null_rate[indicator] = indicator_data.isnull().sum().sum() / 360

# Visualize how a lot of indicators have lots of missing values
sorted(indicator_null_rate.items(), key = lambda x: -x[1])

[('Children out of school, primary, female', 0.41944444444444445),
 ('Children out of school, primary, male', 0.41944444444444445),
 ('Secondary education, teachers (% female)', 0.36944444444444446),
 ('Lower secondary completion rate, female (% of relevant age group)',
  0.3194444444444444),
 ('Lower secondary completion rate, male (% of relevant age group)',
  0.3194444444444444),
 ('Rate of out-of-school youth of upper secondary school age, female (%)',
  0.2916666666666667),
 ('Rate of out-of-school youth of upper secondary school age, male (%)',
  0.2916666666666667),
 ('Primary completion rate, female (% of relevant age group)',
  0.2777777777777778),
 ('Primary completion rate, male (% of relevant age group)',
  0.2777777777777778),
 ('Primary education, teachers (% female)', 0.18055555555555555),
 ('School enrollment, tertiary (gross), gender parity index (GPI)',
  0.07777777777777778),
 ('School enrollment, tertiary, female (% gross)', 0.07777777777777778),
 ('School enrollmen

**School enrollment**

We decided to only select the variable related to `School Enrollment`

In [854]:
# Select only the school enrollment indicators
school_enrollment_indicators = [
    # Population related
    "Population, female (% of total)",
    # School enrollment
    "School enrollment, primary (gross), gender parity index (GPI)",
    "School enrollment, primary, female (% gross)",
    "School enrollment, primary, male (% gross)",
    "School enrollment, secondary (gross), gender parity index (GPI)",
    "School enrollment, secondary, female (% gross)",
    "School enrollment, secondary, male (% gross)",
    "School enrollment, tertiary (gross), gender parity index (GPI)",
    "School enrollment, tertiary, female (% gross)",
    "School enrollment, tertiary, male (% gross)"
]

school_enrollment = education_data[education_data.Indicator.isin(school_enrollment_indicators)].reset_index(drop = True)

Preview the data:

In [855]:
school_enrollment

Unnamed: 0,Country,Indicator,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,North America,"Population, female (% of total)",50.715858,50.695597,50.676566,50.659215,50.644041,50.631298,50.621378,50.613915,50.607462,50.600138,50.590735,50.57877,50.564851,50.550262,50.536815,50.525791,50.517375,50.511176
1,North America,"School enrollment, primary (gross), gender par...",0.986060,0.998450,1.008120,1.001590,0.994690,0.989450,1.006280,0.996860,1.001410,1.003570,0.991250,1.00430,0.986900,0.993310,0.997900,1.000440,0.998540,0.992800
2,North America,"School enrollment, primary, female (% gross)",100.010971,101.145172,100.270309,101.008476,100.721710,100.915031,102.230247,102.565102,102.859734,102.127113,99.742730,99.50912,98.525480,99.186210,99.719530,100.504600,101.258600,101.369700
3,North America,"School enrollment, primary, male (% gross)",101.425209,101.302452,99.462753,100.848289,101.259216,101.990868,101.592422,102.888077,102.715134,101.763550,100.623600,99.08313,99.833360,99.854550,99.929660,100.460800,101.406200,102.105200
4,North America,"School enrollment, secondary (gross), gender p...",1.016110,1.016390,0.993500,1.001720,1.017080,1.014580,0.994040,1.005900,0.995690,1.011810,1.008180,1.00215,1.004370,1.002440,1.011230,1.010200,0.996000,0.996520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Uzbekistan,"School enrollment, secondary, female (% gross)",86.702630,,,95.766040,95.776850,87.534530,87.629070,89.234810,88.075690,89.692690,89.899360,89.88127,91.764010,91.354520,91.474830,91.288450,92.007150,92.736430
246,Uzbekistan,"School enrollment, secondary, male (% gross)",89.496940,,,99.440100,99.837450,90.198700,89.520400,89.979200,88.334390,89.263550,89.844490,90.53452,92.385680,92.215700,92.523830,92.379210,93.180180,93.910070
247,Uzbekistan,"School enrollment, tertiary (gross), gender pa...",0.835610,0.809240,0.799560,0.769200,0.779590,0.692450,0.696760,0.700000,0.671150,0.690560,0.680580,0.64384,0.598200,0.596470,0.620430,0.629010,0.646710,0.607920
248,Uzbekistan,"School enrollment, tertiary, female (% gross)",11.891370,11.998790,12.306160,12.172180,12.441800,8.238910,8.439110,8.389340,8.198760,8.146870,7.580180,6.89394,5.993720,6.013590,6.173770,6.317760,6.613920,6.902540


**Append GDP information**

Read the new dataframe and append the complimentary info to the `school enrollment` dataframe to complement our visualization data:

In [856]:
# In order to read the data we will use tha same column names of the latter dataframe

# Read the file
GDP_data = pd.read_csv(
    raw_path + 'GDP_data.csv', names = colnames, header = 0
)

We will take tha same approach and we will delete the same columns as before:

In [857]:
GDP_data = GDP_data.drop(columns = cols_to_drop)

In [858]:
# Select only the desired countries
GDP_data = GDP_data[GDP_data.Country.isin(countries)].reset_index(drop = True)

Now we will append this new info about the Gross Domestic Product of the 20 countries of interest to the previously created dataframe:

In [859]:
school_enrollment = school_enrollment.append(GDP_data).reset_index(drop = True)

**Append population information**

Read the new dataframe and append the complimentary info to the `school enrollment` dataframe to complement our visualization data:

In [860]:
# In order to read the data we will use tha same column names of the latter dataframe

# Read the file
population_data = pd.read_csv(
    raw_path + 'population_data.csv', names = colnames, header = 0
)

We will take tha same approach and we will delete the same columns as before:

In [861]:
population_data = population_data.drop(columns = cols_to_drop)

In [862]:
# Select only the desired countries
population_data = population_data[population_data.Country.isin(countries)].reset_index(drop = True)

Now we will append this new info about the total population of the 20 countries of interest to the previously created dataframe:

In [863]:
school_enrollment = school_enrollment.append(population_data).reset_index(drop = True)

**Fix some country names**

We know that in the other dataframes we are going to use some countries are named differently. In order to join these dataframes by country we have to normalize the names:

In [864]:
# Change the problematic country names as before
# Korea, Rep. -> South Korea
school_enrollment = school_enrollment.replace(
    to_replace = 'Korea, Rep.',
    value = 'South Korea'
)
# North America -> United States of America
school_enrollment = school_enrollment.replace(
    to_replace = 'North America',
    value = 'United States of America'
)
# Kyrgyz Republic -> Kyrgyzstan
school_enrollment = school_enrollment.replace(
    to_replace = 'Kyrgyz Republic',
    value = 'Kyrgyzstan'
)

# New countries list
countries = list(school_enrollment.Country.unique())

**Change dtypes and normalize percentages to [0-1]**

In [865]:
# Change all the dtypes of the columns:
for i in range(2000, 2018):
    year = str(i)
    school_enrollment[year] = school_enrollment[year].astype('float')

In [866]:
# non-normalized percentage columns:
non_normalized = [
    # Population related
    "Population, female (% of total)",
    # School enrollment
    "School enrollment, primary, female (% gross)",
    "School enrollment, primary, male (% gross)",
    "School enrollment, secondary, female (% gross)",
    "School enrollment, secondary, male (% gross)",
    "School enrollment, tertiary, female (% gross)",
    "School enrollment, tertiary, male (% gross)"
]

# Select the subset of non-normalized items and divide them by 100
school_enrollment.loc[school_enrollment.Indicator.isin(non_normalized), [str(i) for i in range(2000, 2018)]] = school_enrollment.loc[school_enrollment.Indicator.isin(non_normalized), [str(i) for i in range(2000, 2018)]] / 100

**Address missing values**

Hence that despite taking into account selecting countries and indicators with fewer missing values there are still some here and there. In order to ease our visualization pipeline we will impute them. We will use the `.fillna(method = 'bfill')` method of pandas dataframes, that propagates the last valid observation backward to the next valid. In fact, this is not the best option regarding the veracity of the data, but taking into account that this is a visualization project we will keep the imputation method simple.

In [867]:
# Hence that there are still some missing values
print(f'There are still {school_enrollment.isnull().sum().sum()} missing values')

There are still 198 missing values


In [868]:
# Propagate last valid observation backward to next valid
school_enrollment = school_enrollment.fillna(method = 'bfill', axis = 1)
# Just in case there where null values in the back (last observations) propagate forward to make sure there are NO missing values
school_enrollment = school_enrollment.fillna(method = 'ffill', axis = 1)

In [869]:
print(f'There are {school_enrollment.isnull().sum().sum ()} missing values now')

There are 0 missing values now


**Feature engineering**

For our main bubble plot we want to show the percentage of school enrollment that there is in every country. Hence that in our dataframe we only have the GPI (Gender Parity Index) and the separate ratios for males and females.

In [870]:
total_ratios = []

for country, country_data in school_enrollment.groupby('Country'):
    # We need a single for each school level (primary, secondary and tertiary)
    primary = {'Country' : country, 'Indicator' : 'School enrollment, primary, total (% gross)'}
    secondary = {'Country' : country, 'Indicator' : 'School enrollment, secondary, total (% gross)'}
    tertiary = {'Country' : country, 'Indicator' : 'School enrollment, tertiary, total (% gross)'}
    
    # For every year
    for i in range(2000, 2018):
        year = str(i)
        # Female percentage of total population
        female = country_data[country_data.Indicator == 'Population, female (% of total)'][year].values[0]
        male = 1 - female
        primary[year] = ((female * (country_data[country_data.Indicator == 'School enrollment, primary, female (% gross)'][year].values[0])) + 
                         (male * (country_data[country_data.Indicator == 'School enrollment, primary, male (% gross)'][year].values[0])))
        secondary[year] = ((female * (country_data[country_data.Indicator == 'School enrollment, secondary, female (% gross)'][year].values[0])) + 
                           (male * (country_data[country_data.Indicator == 'School enrollment, secondary, male (% gross)'][year].values[0])))
        tertiary[year] = ((female * (country_data[country_data.Indicator == 'School enrollment, tertiary, female (% gross)'][year].values[0])) + 
                          (male * (country_data[country_data.Indicator == 'School enrollment, tertiary, male (% gross)'][year].values[0])))
    
    # In the end append each row to the list
    total_ratios.append(primary)
    total_ratios.append(secondary)
    total_ratios.append(tertiary)

# Convert it to a pandas dataframe
total_ratios = pd.DataFrame(total_ratios)

In [871]:
# Append the new features to the school_enrollment dataframe
school_enrollment = school_enrollment.append(total_ratios).reset_index(drop = True)

In [872]:
# Display the dataframe
school_enrollment

Unnamed: 0,Country,Indicator,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,United States of America,"Population, female (% of total)",0.507159,0.506956,0.506766,0.506592,0.506440,0.506313,0.506214,0.506139,0.506075,0.506001,0.505907,0.505788,0.505649,0.505503,0.505368,0.505258,0.505174,0.505112
1,United States of America,"School enrollment, primary (gross), gender par...",0.986060,0.998450,1.008120,1.001590,0.994690,0.989450,1.006280,0.996860,1.001410,1.003570,0.991250,1.004300,0.986900,0.993310,0.997900,1.000440,0.998540,0.992800
2,United States of America,"School enrollment, primary, female (% gross)",1.000110,1.011452,1.002703,1.010085,1.007217,1.009150,1.022302,1.025651,1.028597,1.021271,0.997427,0.995091,0.985255,0.991862,0.997195,1.005046,1.012586,1.013697
3,United States of America,"School enrollment, primary, male (% gross)",1.014252,1.013025,0.994628,1.008483,1.012592,1.019909,1.015924,1.028881,1.027151,1.017635,1.006236,0.990831,0.998334,0.998546,0.999297,1.004608,1.014062,1.021052
4,United States of America,"School enrollment, secondary (gross), gender p...",1.016110,1.016390,0.993500,1.001720,1.017080,1.014580,0.994040,1.005900,0.995690,1.011810,1.008180,1.002150,1.004370,1.002440,1.011230,1.010200,0.996000,0.996520
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,United States of America,"School enrollment, secondary, total (% gross)",0.948123,0.957980,0.950412,0.956530,0.957224,0.956513,0.953512,0.957074,0.956439,0.959263,0.955433,0.966568,0.974948,0.973775,0.980401,0.987748,0.999381,1.002038
371,United States of America,"School enrollment, tertiary, total (% gross)",0.673185,0.676303,0.766387,0.787949,0.797951,0.794350,0.817397,0.823606,0.835175,0.858099,0.901920,0.915361,0.909392,0.869795,0.868654,0.869790,0.870621,0.867233
372,Uzbekistan,"School enrollment, primary, total (% gross)",0.996998,0.994879,0.984359,0.973030,0.968177,0.970438,0.961939,0.945716,0.934925,0.927960,0.939091,0.945112,0.962931,0.976126,0.984525,0.999113,1.015519,1.036327
373,Uzbekistan,"School enrollment, secondary, total (% gross)",0.880921,0.975926,0.975922,0.975918,0.977944,0.888582,0.885688,0.896047,0.882043,0.894793,0.898721,0.902063,0.920734,0.917832,0.919972,0.918318,0.925917,0.933215


**DataFrame formating**

Let's restructure the latter dataframe for easier altair ploting:

In [873]:
school_enrollment = school_enrollment.set_index(['Country', 'Indicator']).sort_index().stack().unstack(level = 1)

# Remove the column indices name
del school_enrollment.columns.name

Convert the MultiIndex into two separate columns:

In [874]:
school_enrollment = school_enrollment.reset_index()

# Rename the columns
school_enrollment.columns = [
    'Country', 'YEAR', 'GDP', 'Pop-fem', 'Pop-total',
    'Primary_GPI', 'Primary_female', 'Primary_male', 'Primary_total',
    'Secondary_GPI', 'Secondary_female', 'Secondary_male', 'Secondary_total',
    'Tertiary_GPI', 'Tertiary_female', 'Tertiary_male', 'Tertiary_total'
]

In [875]:
school_enrollment

Unnamed: 0,Country,YEAR,GDP,Pop-fem,Pop-total,Primary_GPI,Primary_female,Primary_male,Primary_total,Secondary_GPI,Secondary_female,Secondary_male,Secondary_total,Tertiary_GPI,Tertiary_female,Tertiary_male,Tertiary_total
0,Argentina,2000,11916.582609,0.514664,36870787.0,0.99104,1.151662,1.162077,1.156717,1.04985,0.982887,0.936215,0.960236,1.51614,0.650086,0.428779,0.542678
1,Argentina,2001,11514.619107,0.514679,37275652.0,0.99237,1.157988,1.166889,1.162308,1.05215,0.993284,0.944051,0.969390,1.45064,0.698558,0.481551,0.593240
2,Argentina,2002,10310.129116,0.514598,37681749.0,0.99312,1.163891,1.171959,1.167807,1.05214,0.983593,0.934854,0.959935,1.46033,0.745017,0.510170,0.631022
3,Argentina,2003,11307.757984,0.514450,38087868.0,0.99286,1.140784,1.148983,1.144765,1.06097,0.978013,0.921813,0.950725,1.48568,0.779487,0.524665,0.655758
4,Argentina,2004,12527.800496,0.514280,38491972.0,0.99111,1.152805,1.163150,1.157830,1.07690,0.980483,0.910469,0.946476,1.40000,0.764508,0.546075,0.658410
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,Uzbekistan,2013,6394.541435,0.502202,30243200.0,0.98275,0.967672,0.984654,0.976126,0.99066,0.913545,0.922157,0.917832,0.59647,0.060136,0.100820,0.080389
446,Uzbekistan,2014,6866.481774,0.502040,30757700.0,0.98431,0.976773,0.992340,0.984525,0.98866,0.914748,0.925238,0.919972,0.62043,0.061738,0.099508,0.080546
447,Uzbekistan,2015,7327.832069,0.501873,31298900.0,0.98564,0.991916,1.006364,0.999113,0.98819,0.912884,0.923792,0.918318,0.62901,0.063178,0.100439,0.081739
448,Uzbekistan,2016,7723.937506,0.501703,31847900.0,0.98604,1.008406,1.022681,1.015519,0.98741,0.920071,0.931802,0.925917,0.64671,0.066139,0.102270,0.084143


Create a new dataframe useful for the bar plot later on:

In [876]:
# First we will select the necessary columns for plotting
barplot_data = school_enrollment[[
    'Country', 'YEAR',
    'Primary_GPI', 'Primary_female', 'Primary_male', 'Primary_total',
    'Secondary_GPI', 'Secondary_female', 'Secondary_male', 'Secondary_total',
    'Tertiary_GPI', 'Tertiary_female', 'Tertiary_male', 'Tertiary_total'
]].copy()

In [877]:
# Now we will convert the current column index space into a two level space:
indices, groups, others = set(), set(), set()

for c in barplot_data.columns:
    if '_' in c:
        (i, g) = c.split('_')
        c2 = pd.MultiIndex.from_tuples((i, g),)
        indices.add(i)
        groups.add(g)
    else:
        others.add(c)
        
import itertools

columns = list(itertools.product(groups, indices))
columns = pd.MultiIndex.from_tuples(columns)
ret = pd.DataFrame(columns = columns)

for c in columns:
    ret[c] = barplot_data['%s_%s' % (c[1], c[0])]
for c in others:
    ret[c] = barplot_data['%s' % c]

ret

Unnamed: 0_level_0,total,total,total,male,male,male,GPI,GPI,GPI,female,female,female,Country,YEAR
Unnamed: 0_level_1,Primary,Tertiary,Secondary,Primary,Tertiary,Secondary,Primary,Tertiary,Secondary,Primary,Tertiary,Secondary,Unnamed: 13_level_1,Unnamed: 14_level_1
0,1.156717,0.542678,0.960236,1.162077,0.428779,0.936215,0.99104,1.51614,1.04985,1.151662,0.650086,0.982887,Argentina,2000
1,1.162308,0.593240,0.969390,1.166889,0.481551,0.944051,0.99237,1.45064,1.05215,1.157988,0.698558,0.993284,Argentina,2001
2,1.167807,0.631022,0.959935,1.171959,0.510170,0.934854,0.99312,1.46033,1.05214,1.163891,0.745017,0.983593,Argentina,2002
3,1.144765,0.655758,0.950725,1.148983,0.524665,0.921813,0.99286,1.48568,1.06097,1.140784,0.779487,0.978013,Argentina,2003
4,1.157830,0.658410,0.946476,1.163150,0.546075,0.910469,0.99111,1.40000,1.07690,1.152805,0.764508,0.980483,Argentina,2004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.976126,0.080389,0.917832,0.984654,0.100820,0.922157,0.98275,0.59647,0.99066,0.967672,0.060136,0.913545,Uzbekistan,2013
446,0.984525,0.080546,0.919972,0.992340,0.099508,0.925238,0.98431,0.62043,0.98866,0.976773,0.061738,0.914748,Uzbekistan,2014
447,0.999113,0.081739,0.918318,1.006364,0.100439,0.923792,0.98564,0.62901,0.98819,0.991916,0.063178,0.912884,Uzbekistan,2015
448,1.015519,0.084143,0.925917,1.022681,0.102270,0.931802,0.98604,0.64671,0.98741,1.008406,0.066139,0.920071,Uzbekistan,2016


In [878]:
# Set Country and YEAR as indices, in order to stack the column space
ret = ret.set_index(['Country', 'YEAR']).stack(level = 0)

# Rename indices for reseting index
ret.index.names = ['Country', 'YEAR', 'Indicator']

ret = ret.reset_index()

In [879]:
# United States of America -> USA
ret = ret.replace(
    to_replace = 'United States of America',
    value = 'USA'
)

# United Kingdom -> UK
ret = ret.replace(
    to_replace = 'United Kingdom',
    value = 'UK'
)

In [880]:
ret

Unnamed: 0,Country,YEAR,Indicator,Primary,Secondary,Tertiary
0,Argentina,2000,GPI,0.991040,1.049850,1.516140
1,Argentina,2000,female,1.151662,0.982887,0.650086
2,Argentina,2000,male,1.162077,0.936215,0.428779
3,Argentina,2000,total,1.156717,0.960236,0.542678
4,Argentina,2001,GPI,0.992370,1.052150,1.450640
...,...,...,...,...,...,...
1795,Uzbekistan,2016,total,1.015519,0.925917,0.084143
1796,Uzbekistan,2017,GPI,0.985340,0.987500,0.607920
1797,Uzbekistan,2017,female,1.028698,0.927364,0.069025
1798,Uzbekistan,2017,male,1.044003,0.939101,0.113544


Export the latter dataframe into a csv file for ploting:

In [839]:
ret.to_csv(processed_path + 'barplot_data.csv', index = False)

**Modify the customized `.geojson` background map**

Load the customized world map `.geojson` file, without among others the Antarctica and a couple of extra variables:

In [736]:
# Read and parse the json file
with open(raw_path + 'custom.geo.json') as json_data:
    worldmap = json.load(json_data)

In [737]:
# Convert it into a GeoPandasDataFrame
gdf = gpd.GeoDataFrame.from_features(worldmap)

In [738]:
# Select only the desired columns
gdf = gdf[
    [
        'geometry',      # geometry of the country
        'sovereignt',    # sovereign country
        'sov_a3',        # abbreviation of the sovereign country
        'type',          # {Sovereign country, Country, Dependency, Disputed, Indeterminate}
        'admin',         # name of the administration
        'income_grp',    # level of income of the country
        'continent'      # relative continent
    ]
]

# Change column names for convenience
gdf.columns = ['geometry', 'Country', 'Code', 'Type', 'Admin', 'Income_group', 'Continent']

# Display the dataframe
gdf

Unnamed: 0,geometry,Country,Code,Type,Admin,Income_group,Continent
0,"POLYGON ((-61.20000 -51.85000, -60.00000 -51.2...",United Kingdom,GB1,Dependency,Falkland Islands,1. High income: OECD,South America
1,"POLYGON ((-57.62513 -30.21629, -56.29090 -28.8...",Brazil,BRA,Sovereign country,Brazil,3. Upper middle income,South America
2,"POLYGON ((-62.68506 -22.24903, -62.29118 -21.0...",Paraguay,PRY,Sovereign country,Paraguay,4. Lower middle income,South America
3,"POLYGON ((-80.30256 -3.40486, -79.77029 -2.657...",Ecuador,ECU,Sovereign country,Ecuador,3. Upper middle income,South America
4,"POLYGON ((-59.75828 8.36703, -59.10168 7.99920...",Guyana,GUY,Sovereign country,Guyana,4. Lower middle income,South America
...,...,...,...,...,...,...,...
171,"POLYGON ((18.85314 49.49623, 18.90957 49.43585...",Slovakia,SVK,Sovereign country,Slovakia,1. High income: OECD,Europe
172,"POLYGON ((13.80648 46.50931, 14.63247 46.43182...",Slovenia,SVN,Sovereign country,Slovenia,1. High income: OECD,Europe
173,"POLYGON ((22.18317 65.72374, 21.21352 65.02601...",Sweden,SWE,Sovereign country,Sweden,1. High income: OECD,Europe
174,"POLYGON ((31.78600 52.10168, 32.15941 52.06127...",Ukraine,UKR,Sovereign country,Ukraine,4. Lower middle income,Europe


Store the latter dataframe (it will be the background of our plot):

In [739]:
# Store the json files
with open(processed_path + 'background.geojson', 'w') as json_file:
    json.dump(gdf.to_json(), json_file)

**Select our countries of interest**

In [744]:
our_countries = gdf[gdf.Admin.isin(countries)].reset_index(drop = True).sort_values(by = ['Country']).reset_index(drop = True)

In [764]:
our_countries

Unnamed: 0,geometry,Country,Code,Type,Admin,Income_group,Continent
0,"MULTIPOLYGON (((-65.50000 -55.20000, -66.45000...",Argentina,ARG,Sovereign country,Argentina,3. Upper middle income,South America
1,"POLYGON ((-75.37322 -0.15203, -75.80147 0.0848...",Colombia,COL,Sovereign country,Colombia,3. Upper middle income,South America
2,"POLYGON ((28.59193 69.06478, 28.44594 68.36461...",Finland,FI1,Country,Finland,1. High income: OECD,Europe
3,"MULTIPOLYGON (((-52.55642 2.50471, -52.93966 2...",France,FR1,Country,France,1. High income: OECD,Europe
4,"POLYGON ((1.06012 5.92884, -0.50764 5.34347, -...",Ghana,GHA,Sovereign country,Ghana,4. Lower middle income,Africa
5,"POLYGON ((-14.50870 66.45589, -14.73964 65.808...",Iceland,ISL,Sovereign country,Iceland,1. High income: OECD,Europe
6,"POLYGON ((77.83745 35.49401, 78.91227 34.32194...",India,IND,Sovereign country,India,4. Lower middle income,Asia
7,"MULTIPOLYGON (((120.71561 -10.23958, 120.29501...",Indonesia,IDN,Sovereign country,Indonesia,4. Lower middle income,Asia
8,"MULTIPOLYGON (((15.52038 38.23116, 15.16024 37...",Italy,ITA,Sovereign country,Italy,1. High income: OECD,Europe
9,"POLYGON ((70.96231 42.26615, 71.18628 42.70429...",Kyrgyzstan,KGZ,Sovereign country,Kyrgyzstan,5. Low income,Asia


In [759]:
with open(processed_path + 'our_countries.geojson', 'w') as json_file:
    json.dump(our_countries.to_json(), json_file)

We will want to plot a point at every centroid of every selected country. In order to do so we use the in-built method in GeoPandas `centroid`. Notice that because of MULTIPOLYGONS we had to manually modify some of the centroids for better visualization, e.g. the USA has Hawaii and Alaska as well, and that messes up the center of gravity.

In [881]:
our_countries['centroid_lon'] = our_countries['geometry'].centroid.x
our_countries['centroid_lat'] = our_countries['geometry'].centroid.y

In [882]:
# Move the United States of America centroid coordinates
our_countries.loc[our_countries.Country == 'United States of America', ['centroid_lon', 'centroid_lat']] = [-98.0, 39.0]
# Move the United Kingdom centroid coordinates
our_countries.loc[our_countries.Country == 'United Kingdom', ['centroid_lon', 'centroid_lat']] = [-1, 52.0]
# Move the Finland centroid coordinates
our_countries.loc[our_countries.Country == 'Finland', ['centroid_lon', 'centroid_lat']] = [26.0, 62.5]
# Move the Norway centroid coordinates
our_countries.loc[our_countries.Country == 'Norway', ['centroid_lon', 'centroid_lat']] = [9.0, 61.5]
# Move the France centroid coordinates
our_countries.loc[our_countries.Country == 'France', ['centroid_lon', 'centroid_lat']] = [3.0, 47]
# Move the Malaysia centroid coordinates
our_countries.loc[our_countries.Country == 'Malaysia', ['centroid_lon', 'centroid_lat']] = [103.0, 3.72558]

In [883]:
centroids = our_countries[['Country', 'centroid_lon', 'centroid_lat']]

In [884]:
school_enrollment = school_enrollment.merge(centroids, on = 'Country')

In [885]:
school_enrollment

Unnamed: 0,Country,YEAR,GDP,Pop-fem,Pop-total,Primary_GPI,Primary_female,Primary_male,Primary_total,Secondary_GPI,Secondary_female,Secondary_male,Secondary_total,Tertiary_GPI,Tertiary_female,Tertiary_male,Tertiary_total,centroid_lon,centroid_lat
0,Argentina,2000,11916.582609,0.514664,36870787.0,0.99104,1.151662,1.162077,1.156717,1.04985,0.982887,0.936215,0.960236,1.51614,0.650086,0.428779,0.542678,-65.175361,-35.446821
1,Argentina,2001,11514.619107,0.514679,37275652.0,0.99237,1.157988,1.166889,1.162308,1.05215,0.993284,0.944051,0.969390,1.45064,0.698558,0.481551,0.593240,-65.175361,-35.446821
2,Argentina,2002,10310.129116,0.514598,37681749.0,0.99312,1.163891,1.171959,1.167807,1.05214,0.983593,0.934854,0.959935,1.46033,0.745017,0.510170,0.631022,-65.175361,-35.446821
3,Argentina,2003,11307.757984,0.514450,38087868.0,0.99286,1.140784,1.148983,1.144765,1.06097,0.978013,0.921813,0.950725,1.48568,0.779487,0.524665,0.655758,-65.175361,-35.446821
4,Argentina,2004,12527.800496,0.514280,38491972.0,0.99111,1.152805,1.163150,1.157830,1.07690,0.980483,0.910469,0.946476,1.40000,0.764508,0.546075,0.658410,-65.175361,-35.446821
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,Uzbekistan,2013,6394.541435,0.502202,30243200.0,0.98275,0.967672,0.984654,0.976126,0.99066,0.913545,0.922157,0.917832,0.59647,0.060136,0.100820,0.080389,63.203640,41.748603
446,Uzbekistan,2014,6866.481774,0.502040,30757700.0,0.98431,0.976773,0.992340,0.984525,0.98866,0.914748,0.925238,0.919972,0.62043,0.061738,0.099508,0.080546,63.203640,41.748603
447,Uzbekistan,2015,7327.832069,0.501873,31298900.0,0.98564,0.991916,1.006364,0.999113,0.98819,0.912884,0.923792,0.918318,0.62901,0.063178,0.100439,0.081739,63.203640,41.748603
448,Uzbekistan,2016,7723.937506,0.501703,31847900.0,0.98604,1.008406,1.022681,1.015519,0.98741,0.920071,0.931802,0.925917,0.64671,0.066139,0.102270,0.084143,63.203640,41.748603


Rename USA and UK:

In [831]:
# United States of America -> USA
school_enrollment = school_enrollment.replace(
    to_replace = 'United States of America',
    value = 'USA'
)

# United Kingdom -> UK
school_enrollment = school_enrollment.replace(
    to_replace = 'United Kingdom',
    value = 'UK'
)

In [834]:
school_enrollment.to_csv(processed_path + 'school_enrollment2.csv', index = False)