# DTSA-5304 Final
Data from world bank development indicators website.  
Obtained from https://databank.worldbank.org/reports.aspx?source=2&amp;country=WLD&amp;l=en

## Running the notebook
Since the notebook has been saved with all the charts already rendered, you can interact with the charts without having to run the notebook.

If you would like to tun the notebook, please choose `Runtime` -> `Run all` from the menu.

In [1]:
import pandas as pd
import numpy as np
import altair as alt

from vega_datasets import data as vega_data # for world map data background data

# google drive is no longer needed for data since I created a github gist for the data file
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# Hack to display each cell output without requiring scrolling.
# https://stackoverflow.com/questions/55546869/google-colaboratory-is-there-any-way-to-expand-the-height-of-the-result-cell-of
from IPython.display import Javascript
def resize_colab_cell():
  display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'))
get_ipython().events.register('pre_run_cell', resize_colab_cell)

## Get world bank data subset csv saved at github gist.

In [3]:
#data_path = "/content/drive/MyDrive/dtsa-5304-100/final/data.csv"
#data= pd.read_csv(data_path)
data_url = "https://gist.githubusercontent.com/asequeira-os/e126b64b420d72bf3a88b9b223b1411c/raw/e5231fdd0a47b6d81a81babe64127edbbd740e74/dtsa-5304-final-data.csv"
data= pd.read_csv(data_url)
print("Raw data from worldbank query saved in gist")
data.head(5)

<IPython.core.display.Javascript object>

Raw data from worldbank query saved in gist


Unnamed: 0,Country Name,Country Code,Series Name,Series Code,1960 [YR1960],1965 [YR1965],1970 [YR1970],1975 [YR1975],1980 [YR1980],1985 [YR1985],1990 [YR1990],1995 [YR1995],2000 [YR2000],2005 [YR2005],2010 [YR2010],2015 [YR2015],2020 [YR2020]
0,World,WLD,"Population, total",SP.POP.TOTL,3031564839,3328284623,3690306927,4070114517,4442440474,4850160867,5293517142.0,5726801833.0,6144322697.0,6552571570.0,6969631901.0,7404910892.0,7820981524.0
1,World,WLD,Population growth (annual %),SP.POP.GROW,..,2.07840627851978,2.09253903669646,1.85543075810469,1.75429079646716,1.75055631399105,1.74560235071817,1.50072911381007,1.35330175380903,1.2633713888996,1.22200562383263,1.1944247960642,1.01127220086579
2,World,WLD,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,50.866170951186,55.8380224044149,57.8247558223699,60.1736502416523,62.2248531940153,63.8066765931182,65.1808796624838,66.2083677233336,67.7000539909611,69.1983927865475,70.6706171214748,72.0955006895748,72.2650112702741
3,World,WLD,"Mortality rate, under-5 (per 1,000 live births)",SH.DYN.MORT,..,..,..,..,..,..,93.2,86.5,75.8,62.5,51.2,42.6,36.6
4,World,WLD,"Primary completion rate, total (% of relevant ...",SE.PRM.CMPT.ZS,..,..,74.3306884765625,75.6375885009766,78.7359313964844,81.6223983764648,81.3433532714844,81.8440399169922,81.9493408203125,85.4663619995117,88.9113082885742,88.7150192260742,90.4092102050781


In [4]:
# remove extra quotes cleanup
for i, col in enumerate(data.columns):
  data.iloc[:, i] = data.iloc[:, i].str.replace('"', '')

# https://gist.github.com/tadast/8827699
cc_geo_data = pd.read_csv("https://gist.githubusercontent.com/tadast/8827699/raw/f5cac3d42d16b78348610fc4ec301e9234f82821/countries_codes_and_coordinates.csv")

<IPython.core.display.Javascript object>

## Helpful constants to avoid duplicating strings and typos.

In [5]:
# columns
C_COUNRTY_CODE = "Country Code"
C_COUNTRY_NAME = "Country Name"
C_SERIES = "Series Code"
C_SERIES_NAME = "Series Name"
C_YEAR = "Year" # synthetic column for yeaar

# year columns
C_Y_1960 = "1960 [YR1960]"
C_Y_1965 = "1965 [YR1965]"
C_Y_1970 = "1970 [YR1970]"
C_Y_1975 = "1975 [YR1975]"
C_Y_1980 = "1980 [YR1980]"
C_Y_1985 = "1985 [YR1985]"
C_Y_1990 = "1990 [YR1990]"
C_Y_1995 = "1995 [YR1995]"
C_Y_2000 = "2000 [YR2000]"
C_Y_2005 = "2005 [YR2005]"
C_Y_2010 = "2010 [YR2010]"
C_Y_2015 = "2015 [YR2015]"
C_Y_2020 = "2020 [YR2020]"

YEAR_COLUMNS = [
  C_Y_1960,
  C_Y_1965,
  C_Y_1970,
  C_Y_1975,
  C_Y_1980,
  C_Y_1985,
  C_Y_1990,
  C_Y_1995,
  C_Y_2000,
  C_Y_2005,
  C_Y_2010,
  C_Y_2015,
  C_Y_2020,
]

YEAR_MAP = {
  C_Y_1960 : 1960,
  C_Y_1965 : 1965,
  C_Y_1970 : 1970,
  C_Y_1975 : 1975,
  C_Y_1980 : 1980,
  C_Y_1985 : 1985,
  C_Y_1990 : 1990,
  C_Y_1995 : 1995,
  C_Y_2000 : 2000,
  C_Y_2005 : 2005,
  C_Y_2010 : 2010,
  C_Y_2015 : 2015,
  C_Y_2020 : 2020,   
}

# Indicators (value of "Series code" aka C_SERIES column)
I_POP_TOTAL = "SP.POP.TOTL" # Population, total
I_POP_GROWTH = "SP.POP.GROW" # Population growth (annual %)
I_BIRTH_LIFE_EXPECTANCY = "SP.DYN.LE00.IN" # Life expectancy at birth, total (years)
I_MORT_RATE_UND5 = "SH.DYN.MORT" # Mortality rate, under-5 (per 1,000 live births)
I_PRIM_COMPLETION_RATE = "SE.PRM.CMPT.ZS" # Primary completion rate, total (% of relevant age group)
I_PRIM_ENROLL = "SE.PRM.ENRR" # School enrollment, primary (% gross)
I_SEC_ENROLL = "SE.SEC.ENRR" # School enrollment, secondary (% gross)
I_GPI = "SE.ENR.PRSC.FM.ZS" # School enrollment, primary and secondary (gross), gender parity index
I_GDP = "NY.GDP.MKTP.CD" # GDP (current US$)
I_GDP_GROWTH = "NY.GDP.MKTP.KD.ZG" # GDP growth (annual %)
I_TAX_REV = "GC.TAX.TOTL.GD.ZS" # Tax revenue (% of GDP)

BEETER_NAMES = {
  I_POP_TOTAL : "Population, total",
  I_POP_GROWTH : "Population annual growth",
  I_BIRTH_LIFE_EXPECTANCY : "Life expectancy",
  I_MORT_RATE_UND5 : "Mortality rate",
  I_PRIM_COMPLETION_RATE : "Primary completion",
  I_PRIM_ENROLL : "School enrollment primary",
  I_SEC_ENROLL : "School enrollment secondary",
  I_GPI : "Gender parity index",
  I_GDP : "GDP",
  I_GDP_GROWTH : "GDP Growth",
  I_TAX_REV : "Tax revenue"
}

ALL_INDICATORS = [
  I_POP_TOTAL,
  I_POP_GROWTH,
  I_BIRTH_LIFE_EXPECTANCY,
  I_MORT_RATE_UND5,
  I_PRIM_COMPLETION_RATE,
  I_PRIM_ENROLL,
  I_SEC_ENROLL,
  I_GPI,
  I_GDP,
  I_GDP_GROWTH,
  I_TAX_REV,
]

# World data
WORLD_CC = "WLD"  # Country code for world data
world_data = data.loc[data[C_COUNRTY_CODE] == WORLD_CC]

USA_CC = "USA" # country code for United States
NO_DATA = ".." # value of columns with missing data

<IPython.core.display.Javascript object>

In [6]:
# data uses two periods to indicate missing values - replace with NaN
# https://stackoverflow.com/questions/53668421/replace-a-string-value-with-nan-in-pandas-data-frame-python
data = data.replace(NO_DATA, np.nan) # replace ".." with NaN

<IPython.core.display.Javascript object>

In [7]:
# make each year column value into a row using 'melt'
# https://stackoverflow.com/questions/28654047/convert-columns-into-rows-with-pandas
data = data.melt(
  id_vars = [C_COUNRTY_CODE, C_COUNTRY_NAME, C_SERIES, C_SERIES_NAME],
  var_name = C_YEAR,
  value_name = "value"
)

<IPython.core.display.Javascript object>

In [8]:
data

<IPython.core.display.Javascript object>

Unnamed: 0,Country Code,Country Name,Series Code,Series Name,Year,value
0,WLD,World,SP.POP.TOTL,"Population, total",1960 [YR1960],3031564839
1,WLD,World,SP.POP.GROW,Population growth (annual %),1960 [YR1960],
2,WLD,World,SP.DYN.LE00.IN,"Life expectancy at birth, total (years)",1960 [YR1960],50.866170951186
3,WLD,World,SH.DYN.MORT,"Mortality rate, under-5 (per 1,000 live births)",1960 [YR1960],
4,WLD,World,SE.PRM.CMPT.ZS,"Primary completion rate, total (% of relevant ...",1960 [YR1960],
...,...,...,...,...,...,...
31169,ZWE,Zimbabwe,SE.SEC.ENRR,"School enrollment, secondary (% gross)",2020 [YR2020],
31170,ZWE,Zimbabwe,SE.ENR.PRSC.FM.ZS,"School enrollment, primary and secondary (gros...",2020 [YR2020],
31171,ZWE,Zimbabwe,NY.GDP.MKTP.CD,GDP (current US$),2020 [YR2020],21509698406.1116
31172,ZWE,Zimbabwe,NY.GDP.MKTP.KD.ZG,GDP growth (annual %),2020 [YR2020],-7.81695064681365


In [9]:
# create country codes, names, and lookup
COUNTRY_LOOKUP = data[[C_COUNRTY_CODE, C_COUNTRY_NAME]].drop_duplicates()
COUNTRY_CODES = COUNTRY_LOOKUP[C_COUNRTY_CODE].tolist()
COUNTRY_NAMES = COUNTRY_LOOKUP[C_COUNTRY_NAME].tolist()

<IPython.core.display.Javascript object>

In [10]:
# convert individual rows for each indicator value into a column using 'pivot'
# https://www.digitalocean.com/community/tutorials/pandas-melt-unmelt-pivot-function
data_pivot = data.pivot(index=[C_COUNRTY_CODE, C_COUNTRY_NAME, C_YEAR], columns=C_SERIES)['value'].reset_index()

# fix ugly column names for the year
data_pivot[C_YEAR] = data_pivot[C_YEAR].map(YEAR_MAP)

# fix type for numeric data columns
for indicator in ALL_INDICATORS:
  # https://datatofish.com/convert-string-to-float-dataframe/
  data_pivot[indicator] = data_pivot[indicator].astype(float)

# remove extraneous stuff
# https://stackoverflow.com/questions/29765548/remove-index-name-in-pandas
data_pivot = data_pivot.rename_axis(None, axis=1)
# data_pivot = data_pivot.dropna() # keep NaN - Altair simply skips those anyway


<IPython.core.display.Javascript object>

In [11]:
data_pivot.head(5)

<IPython.core.display.Javascript object>

Unnamed: 0,Country Code,Country Name,Year,GC.TAX.TOTL.GD.ZS,NY.GDP.MKTP.CD,NY.GDP.MKTP.KD.ZG,SE.ENR.PRSC.FM.ZS,SE.PRM.CMPT.ZS,SE.PRM.ENRR,SE.SEC.ENRR,SH.DYN.MORT,SP.DYN.LE00.IN,SP.POP.GROW,SP.POP.TOTL
0,ABW,Aruba,1960,,,,,,,,,64.152,,54608.0
1,ABW,Aruba,1965,,,,,,,,,65.502,1.032841,58782.0
2,ABW,Aruba,1970,,,,,,,,,67.583,-0.378264,59106.0
3,ABW,Aruba,1975,,,,,,,,,69.762,1.137966,60715.0
4,ABW,Aruba,1980,,,,,,,,,71.066,0.420044,62267.0


## Rename data columns so they are descriptive but not too long.

In [12]:
# altair -> vega -> javascript => causes issues with period in field names
# https://altair-viz.github.io/user_guide/troubleshooting.html#encodings-with-special-characters
for indicator in ALL_INDICATORS:
  # data_pivot = data_pivot.rename(columns={indicator: indicator.replace('.', '_')})
  data_pivot = data_pivot.rename(columns=BEETER_NAMES)
# reset 'constants' to match
# I_POP_TOTAL = I_POP_TOTAL.replace(".", "_")
# I_POP_GROWTH = I_POP_GROWTH.replace(".", "_")
# I_BIRTH_LIFE_EXPECTANCY = I_BIRTH_LIFE_EXPECTANCY.replace(".", "_")
# I_MORT_RATE_UND5 = I_MORT_RATE_UND5.replace(".", "_")
# I_PRIM_COMPLETION_RATE = I_PRIM_COMPLETION_RATE.replace(".", "_")
# I_PRIM_ENROLL = I_PRIM_ENROLL.replace(".", "_")
# I_SEC_ENROLL = I_SEC_ENROLL.replace(".", "_")
# I_GPI = I_GPI.replace(".", "_")
# I_GDP = I_GDP.replace(".", "_")
# I_GDP_GROWTH = I_GDP_GROWTH.replace(".", "_")
# I_TAX_REV = I_TAX_REV.replace(".", "_")

I_POP_TOTAL = BEETER_NAMES[I_POP_TOTAL]
I_POP_GROWTH = BEETER_NAMES[I_POP_GROWTH]
I_BIRTH_LIFE_EXPECTANCY = BEETER_NAMES[I_BIRTH_LIFE_EXPECTANCY]
I_MORT_RATE_UND5 = BEETER_NAMES[I_MORT_RATE_UND5]
I_PRIM_COMPLETION_RATE = BEETER_NAMES[I_PRIM_COMPLETION_RATE]
I_PRIM_ENROLL = BEETER_NAMES[I_PRIM_ENROLL]
I_SEC_ENROLL = BEETER_NAMES[I_SEC_ENROLL]
I_GPI = BEETER_NAMES[I_GPI]
I_GDP = BEETER_NAMES[I_GDP]
I_GDP_GROWTH = BEETER_NAMES[I_GDP_GROWTH]
I_TAX_REV = BEETER_NAMES[I_TAX_REV]


<IPython.core.display.Javascript object>

In [13]:
data_pivot.head(5)

<IPython.core.display.Javascript object>

Unnamed: 0,Country Code,Country Name,Year,Tax revenue,GDP,GDP Growth,Gender parity index,Primary completion,School enrollment primary,School enrollment secondary,Mortality rate,Life expectancy,Population annual growth,"Population, total"
0,ABW,Aruba,1960,,,,,,,,,64.152,,54608.0
1,ABW,Aruba,1965,,,,,,,,,65.502,1.032841,58782.0
2,ABW,Aruba,1970,,,,,,,,,67.583,-0.378264,59106.0
3,ABW,Aruba,1975,,,,,,,,,69.762,1.137966,60715.0
4,ABW,Aruba,1980,,,,,,,,,71.066,0.420044,62267.0


In [14]:
data_pivot.info() # helps check data counts and types

<IPython.core.display.Javascript object>

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2834 entries, 0 to 2833
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country Code                 2834 non-null   object 
 1   Country Name                 2834 non-null   object 
 2   Year                         2834 non-null   int64  
 3   Tax revenue                  882 non-null    float64
 4   GDP                          2181 non-null   float64
 5   GDP Growth                   1962 non-null   float64
 6   Gender parity index          1200 non-null   float64
 7   Primary completion           1061 non-null   float64
 8   School enrollment primary    1609 non-null   float64
 9   School enrollment secondary  1370 non-null   float64
 10  Mortality rate               2286 non-null   float64
 11  Life expectancy              2631 non-null   float64
 12  Population annual growth     2610 non-null   float64
 13  Population, total 

In [15]:
cc_geo_data.head(3)

<IPython.core.display.Javascript object>

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average)
0,Afghanistan,"""AF""","""AFG""","""4""","""33""","""65"""
1,Albania,"""AL""","""ALB""","""8""","""41""","""20"""
2,Algeria,"""DZ""","""DZA""","""12""","""28""","""3"""


In [16]:
# clean up cc_geo_data
C_LATITUDE = "Latitude"
C_LONGITUDE = "Longitude"
cc_geo_data = cc_geo_data.rename(columns={
    "Alpha-3 code": C_COUNRTY_CODE, 
    "Latitude (average)": C_LATITUDE,
    "Longitude (average)": C_LONGITUDE,
    })
cc_geo_data.drop(columns=["Country", "Alpha-2 code", "Numeric code"], inplace=True)
# https://stackoverflow.com/questions/21491291/remove-all-quotes-within-values-in-pandas
for i, col in enumerate(cc_geo_data.columns):
  cc_geo_data.iloc[:, i] = cc_geo_data.iloc[:, i].str.replace('"', '')
for col in [C_LATITUDE, C_LONGITUDE]:
  cc_geo_data[col] = cc_geo_data[col].str.strip()
  cc_geo_data[col] = cc_geo_data[col].astype(float)
# https://stackoverflow.com/questions/49551336/pandas-trim-leading-trailing-white-space-in-a-dataframe
cc_geo_data = cc_geo_data.applymap(lambda x: x.strip() if isinstance(x, str) else x)
cc_geo_data.head(3)

<IPython.core.display.Javascript object>

Unnamed: 0,Country Code,Latitude,Longitude
0,AFG,33.0,65.0
1,ALB,41.0,20.0
2,DZA,28.0,3.0


In [17]:
# Merge main data with geo location data
data_pivot = cc_geo_data.merge(data_pivot, on=C_COUNRTY_CODE)

<IPython.core.display.Javascript object>

In [18]:
# convert GDP to millions (else causes large numbers and ugly display)
data_pivot[I_GDP] = data_pivot[I_GDP] // 1000000

<IPython.core.display.Javascript object>

In [19]:
data_pivot.describe()

<IPython.core.display.Javascript object>

Unnamed: 0,Latitude,Longitude,Year,Tax revenue,GDP,GDP Growth,Gender parity index,Primary completion,School enrollment primary,School enrollment secondary,Mortality rate,Life expectancy,Population annual growth,"Population, total"
count,2899.0,2899.0,2899.0,924.0,2282.0,2051.0,1238.0,1103.0,1682.0,1425.0,2417.0,2720.0,2670.0,2893.0
mean,19.100602,19.784043,1990.0,16.779338,178982.5,3.113175,0.943945,82.101974,97.999319,67.784291,75.567935,64.294639,1.697659,25350640.0
std,23.605615,70.666331,18.711514,7.552963,969641.2,6.491109,0.145178,24.90076,20.753369,34.129188,78.245596,11.46155,1.703642,102125500.0
min,-41.0,-175.0,1960.0,7.9e-05,8.0,-54.2359,0.05275,4.23546,14.11697,0.64277,1.8,11.995,-27.722225,4582.0
25%,4.5,-14.0,1975.0,11.715713,1378.0,0.689847,0.931915,68.092228,94.272591,38.683842,16.9,56.7885,0.671876,554021.0
50%,17.05,20.0,1990.0,16.295227,7804.5,3.506835,0.9932,92.956108,101.527172,75.831841,44.1,67.1775,1.68254,4490967.0
75%,39.0,54.0,2005.0,21.078989,51524.5,6.134617,1.01845,99.368,108.628828,94.908592,109.4,72.775762,2.615683,15210440.0
max,72.0,178.0,2020.0,110.182771,21060470.0,58.647331,1.44929,134.542511,211.321594,163.934723,429.7,85.387805,16.295475,1411100000.0


In [20]:
# from here on use 'df' as the data frame variable instead of 'data_pivot' for brevity
df = data_pivot
df.info()

<IPython.core.display.Javascript object>

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2899 entries, 0 to 2898
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country Code                 2899 non-null   object 
 1   Latitude                     2899 non-null   float64
 2   Longitude                    2899 non-null   float64
 3   Country Name                 2899 non-null   object 
 4   Year                         2899 non-null   int64  
 5   Tax revenue                  924 non-null    float64
 6   GDP                          2282 non-null   float64
 7   GDP Growth                   2051 non-null   float64
 8   Gender parity index          1238 non-null   float64
 9   Primary completion           1103 non-null   float64
 10  School enrollment primary    1682 non-null   float64
 11  School enrollment secondary  1425 non-null   float64
 12  Mortality rate               2417 non-null   float64
 13  Life expectancy   

In [21]:
# common tool tip for all charts
country_tooltip = [
    C_COUNTRY_NAME,
    I_BIRTH_LIFE_EXPECTANCY,
    I_GDP,
    I_MORT_RATE_UND5,
    I_SEC_ENROLL,
    I_GPI,
    I_TAX_REV,
    I_POP_TOTAL,
]


<IPython.core.display.Javascript object>

## A simple chart to visually confirm the data is rendering

In [22]:
POP_TOTAL_Y = alt.Y(
    f"{I_POP_TOTAL}:Q",
    scale=alt.Scale(
        # domain=(0, 12000000),
        type="log"  # https://altair-viz.github.io/gallery/line_with_log_scale.html
    )
    # ,impute=alt.ImputeParams(value=0)
)

POP_GROWTH_Y = alt.Y(
    f"{I_POP_GROWTH}:Q",
    axis=alt.Axis(title="Population growth"),
    # , scale=alt.Scale(domain=(0, 12000000))
    # ,impute=alt.ImputeParams(value=0)
)

YEAR_X_AXIS = alt.X(f"{C_YEAR}:Q", scale=alt.Scale(domain=(1960, 2020)))


# https://altair-viz.github.io/user_guide/generated/api/altair.binding_select.html
dropdown = alt.binding_select(
    options=COUNTRY_CODES,
    labels=COUNTRY_NAMES,
    name="Select a country: ",
)

cc_selection = alt.selection(
    type="single", fields=[C_COUNRTY_CODE], bind=dropdown, init={C_COUNRTY_CODE: USA_CC}
)

# slider = alt.binding_range(min=1960, max=2020, step=5)
# select_year = alt.selection_single(name=C_YEAR, fields=[C_YEAR],
#                                    bind=slider, init={C_YEAR: 1990})

demo_chart = alt.Chart(df).mark_line().encode(
    x=YEAR_X_AXIS,
    y=POP_TOTAL_Y,
    # color='year:O'
    color=alt.Color(C_COUNRTY_CODE, scale=alt.Scale(scheme="spectral")),
    # size=I_TAX_REV,
    tooltip=country_tooltip,
    opacity=alt.condition(cc_selection, alt.value(1), alt.value(0.2)),
).properties(
    width=800, height=500, title="Demo chart for basic sanity checks."
).add_selection(
    cc_selection
    # ).transform_calculate(
    #   y='datum.t / 1000'
    # )
    # .add_selection(
    #     select_year
    # ).transform_filter( # works - shows single country
    #     cc_selection
    # )
).interactive()


<IPython.core.display.Javascript object>

## Chart for basic check to ensure data is visualizable by Altair - not for analysis
Try selecting *United Arab Emirates* or *China* to see some variety.

In [23]:
demo_chart

<IPython.core.display.Javascript object>

In [24]:
# world map background data
countries = alt.topo_feature(vega_data.world_110m.url, "countries")

# create world map background
world_map = (
    alt.Chart(countries)
    .mark_geoshape(fill="lightgray", stroke="white")
    .project("equirectangular")
)

<IPython.core.display.Javascript object>

In [25]:
slider = alt.binding_range(min=1960, max=2020, step=5, name="Select a year ")
select_year = alt.selection_single(
    name=C_YEAR, fields=[C_YEAR], bind=slider, init={C_YEAR: 1990}
)

world_data_map = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        longitude=C_LONGITUDE,
        latitude=C_LATITUDE,
        size="Life Expectancy:Q",
        tooltip=country_tooltip,
    )
    .transform_bin(
        "Life Expectancy", I_BIRTH_LIFE_EXPECTANCY, bin=alt.Bin(extent=[10, 100])
    )
    .project("equirectangular")
    .add_selection(select_year)
    .transform_filter(select_year)
    .properties(width=900, height=500,
                title="World country indicators")
)


<IPython.core.display.Javascript object>

## Life expectancy around the world

Compare various neighbours like 
 - Chile and Bolivia
 - Pakistan, India, China
 - Saudi Arabia, Yemen, Oman
 - Australia, Papua New Guinea, New Zealand
 - South Sudan, Ethiopia, Kenya

In [26]:
world_chart = world_map + world_data_map
world_chart

<IPython.core.display.Javascript object>

## Make graphs for every indicator
Connect all to a country selector.   
Show data for all indicators in line graphs across all years.


In [27]:

width = 200
height = 300

pop_total_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=POP_TOTAL_Y,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Population")
)

LE_Y_AXIS = alt.Y(
    f"{I_BIRTH_LIFE_EXPECTANCY}:Q",
    scale=alt.Scale(domain=(0, 100)),
    # axis=alt.Axis(title="birth life expectancy"),
)

le_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=LE_Y_AXIS,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Birth life expectancy")
)

mort5_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=I_MORT_RATE_UND5,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Moratlity rate (under 5)")
)



<IPython.core.display.Javascript object>

In [28]:
SEC_ENROLL_Y_AXIS = alt.Y(f"{I_SEC_ENROLL}:Q", scale=alt.Scale(domain=(0, 200)))

gdp_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=I_GDP,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="GDP (millions)")
)

GDP_GROWTH_Y_AXIS = alt.Y(f"{I_GDP_GROWTH}:Q", scale=alt.Scale(domain=(-60, +60)))

gdp_growth_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=GDP_GROWTH_Y_AXIS,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="GDP growth (annual %)")
)

tax_rev_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=I_TAX_REV,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Tax revenue")
)



<IPython.core.display.Javascript object>

In [29]:
sec_school_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=SEC_ENROLL_Y_AXIS,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Secondary school enrollment")
)

gender_parity_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=I_GPI,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="School gender parity")
)


prim_comp_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=I_PRIM_COMPLETION_RATE,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Primary completion rate")
)

prim_enroll_chart = (
    alt.Chart(df)
    .mark_line()
    .encode(
        x=YEAR_X_AXIS,
        y=I_PRIM_ENROLL,
        tooltip=country_tooltip,
    )
    .properties(width=width, height=height, title="Primary enrollment")
)


<IPython.core.display.Javascript object>

In [30]:
base_charts = [
    pop_total_chart,
    le_chart,
    mort5_chart,
    gender_parity_chart,
    prim_enroll_chart,
    prim_comp_chart,
    sec_school_chart,
    gdp_growth_chart,
    gdp_chart,
    tax_rev_chart,
]
one_country_charts = [
    chart.copy().add_selection(cc_selection).transform_filter(cc_selection)
    for chart in base_charts
]

country_indicators_chart = (
    (one_country_charts[0] | one_country_charts[1] | one_country_charts[2])
    & (one_country_charts[3] | one_country_charts[4] | one_country_charts[5] | one_country_charts[6])
    & (one_country_charts[7] | one_country_charts[8] | one_country_charts[9] )
).properties(title="All indicators for the selected country")
country_indicators_chart

<IPython.core.display.Javascript object>

## Notes for the chart above
Try Afghanistan, Iraq, Iran, India, Singapore, Sri Lanka etc to see some variations.  
For example, Vietname, Venezuela show higher under 5 moratliy rates.  
Every country seems to show improvement in most areas over time.

In [40]:
# make two sets of charts to compare countries
dropdown1 = alt.binding_select(options=COUNTRY_CODES, labels=COUNTRY_NAMES, name="Top row country ....: ")
cc_selection1 = alt.selection(type="single", fields=[C_COUNRTY_CODE], bind=dropdown1, init={C_COUNRTY_CODE: "IND"})

dropdown2 = alt.binding_select(options=COUNTRY_CODES, labels=COUNTRY_NAMES, name="Bottom row country: ")
cc_selection2 = alt.selection(type="single", fields=[C_COUNRTY_CODE], bind=dropdown2, init={C_COUNRTY_CODE: USA_CC})


country1_charts = [
    chart.copy().add_selection(
      cc_selection1
    ).transform_filter(
      cc_selection1
    )
    for chart in base_charts
]

country2_charts = [
    chart.copy().add_selection(
      cc_selection2
    ).transform_filter(
      cc_selection2
    )
    for chart in base_charts
]


<IPython.core.display.Javascript object>

## Charts to compare any two countries
### Notes
Population scales are country independent - all other scales same for both countries.  

The dropdowns for the countries are in the reverse order, sorry.

In [41]:
two_country_chart = (country1_charts[0] | country1_charts[1]| country1_charts[6] | country1_charts[7]) & (country2_charts[0] | country2_charts[1] | country2_charts[6] | country2_charts[7])
two_country_chart = two_country_chart.properties(title="Compare two countries")
two_country_chart


<IPython.core.display.Javascript object>

# SPLOM
See relationships between all the indicators.

The year selectors is used to choose data for a specific year.  
The country selector highlights data for a specific country.

In [33]:
# SPLOM
fields = [
    I_BIRTH_LIFE_EXPECTANCY,
    I_MORT_RATE_UND5,
    I_SEC_ENROLL,
    I_POP_GROWTH,
    I_GDP_GROWTH,
]

splom_chart = alt.Chart(df).mark_point().encode(
    alt.X(alt.repeat("column"), type="quantitative"),
    alt.Y(alt.repeat("row"), type="quantitative"),
    tooltip=country_tooltip,
    opacity=alt.condition(cc_selection, alt.value(1), alt.value(0.2))
).properties(
    width=200, height=200,
).repeat(
    row=fields, column=fields[::-1]
).add_selection(
    select_year
).transform_filter(
    select_year
).add_selection(
    cc_selection
).interactive()
splom_chart = splom_chart.properties(title="Compare all indicators for the selected year")

splom_chart


<IPython.core.display.Javascript object>

In [34]:
# https://altair-viz.github.io/gallery/top_k_items.html
top_gdp_countries = (
    alt.Chart(
        df,
    )
    .mark_bar()
    .encode(
        x=alt.X(f"{C_COUNTRY_NAME}:N", sort="-y"),
        y=alt.Y(f"{I_GDP}:Q"),
        color=alt.Color(f"{I_GDP}:Q"),
        tooltip=country_tooltip
    )
    .properties(width=200, height=200, title="Top GDP countries for the selected year")
    .add_selection(select_year)
    .transform_filter(select_year)
    .transform_window(
        rank=f"rank({I_GDP})", sort=[alt.SortField(I_GDP, order="descending")]
    )
    .transform_filter((alt.datum.rank < 10))
)


<IPython.core.display.Javascript object>

## Changes to the list of top GDP Countries over time

In [35]:
top_gdp_countries 

<IPython.core.display.Javascript object>

## End of notebook