In [None]:
!pip install world_bank_data --upgrade plotly itables tqdm descartes

In [None]:
# Third-party libraries used by the notebook.
import pandas as pd
import numpy as np
import plotly
import plotly.offline as offline
import world_bank_data as wb
from matplotlib import pyplot as plt
import seaborn as sns
from itables import init_notebook_mode
from tqdm import tqdm
import math 

# itables notebook setup.
# NOTE(review): all_interactive=True presumably makes every DataFrame render as an
# interactive table -- confirm against the itables documentation.
init_notebook_mode(all_interactive=True)

import statsmodels.api as sm
import datetime as dt


from sklearn.cluster import KMeans
#import sklearn.cluster.hierarchical as hclust
from sklearn import preprocessing


# Set pandas' 'display.max_rows' option to 50.
pd.set_option('display.max_rows', 50)
# Initialise plotly's offline notebook mode (a different function from the
# itables `init_notebook_mode` called above, despite the shared name).
offline.init_notebook_mode()

In [None]:
# World Bank indicator codes to download. The order of this list is the order in
# which the indicators are requested and in which their columns will appear.
indices_to_crawl = [
    'VC.IDP.TOCV',        # Internally displaced persons, total displaced by conflict and violence (number of people) --> should be normalized by total population
    'IQ.CPA.TRAN.XQ',     # CPIA transparency, accountability, and corruption in the public sector rating (1=low to 6=high)
    'SP.DYN.AMRT.FE',     # Mortality rate, adult, female (per 1,000 female adults)
    'SP.DYN.AMRT.MA',     # Mortality rate, adult, male (per 1,000 male adults)
    'SN.ITK.SVFI.ZS',     # Prevalence of severe food insecurity in the population (%)
    'NY.GDP.PCAP.CD',     # GDP per capita (current US$)
    'NY.GDP.DEFL.KD.ZG',  # Inflation, GDP deflator (annual %)
    'SL.UEM.TOTL.FE.ZS',  # Unemployment, female (% of female labor force) (modeled ILO estimate)
    'SL.UEM.TOTL.MA.ZS',  # Unemployment, male (% of male labor force) (modeled ILO estimate)
    'EG.ELC.ACCS.ZS',     # Access to electricity (% of population)
    'ER.H2O.FWST.ZS',     # Level of water stress: freshwater withdrawal as a proportion of available freshwater resources
    'SP.POP.TOTL',        # Total Population
    'SP.POP.GROW',        # Population growth (annual %)
    'EN.POP.DNST',        # Population density (people per sq. km of land area)
    'SP.RUR.TOTL',        # Total Rural Population
    'SP.DYN.CBRT.IN',     # Birth rate, crude (per 1,000 people)
    'SP.DYN.LE00.IN',     # Life expectancy at birth, total (years)
    'SL.UEM.TOTL.ZS',     # Unemployment, total (% of total labor force) (modeled ILO estimate)
    'SH.STA.WASH.P5',     # Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (per 100,000 population)

    'NE.EXP.GNFS.ZS',     # Exports of goods and services (% of GDP)
    'NE.IMP.GNFS.ZS',     # Imports of goods and services (% of GDP)

    'AG.YLD.CREL.KG',     # Cereal yield (kg per hectare)
    'SL.AGR.EMPL.ZS',     # Employment in agriculture (% of total employment) (modeled ILO estimate)
    # Candidate indicators that are currently disabled:
    #'VC.IDP.NWDS',       # Internally displaced persons, new displacement associated with disasters (number of cases)
    #'VC.IDP.NWCV',       # Internally displaced persons, new displacement associated with conflict and violence (number of cases)
    #'AG.LND.PRCP.MM',    # Average precipitation in depth (mm per year)
    #'EN.CLC.MDAT.ZS',    # Droughts, floods, extreme temperatures (% of population, average 1990-2009)
    #'SI.POV.GINI',       # Gini index (World Bank estimate)
    'SE.PRM.CMPT.ZS',     # Primary completion rate, total (% of relevant age group)
    #'HD.HCI.OVRL',       # Human Capital Index (HCI) (scale 0-1)
    'AG.LND.ARBL.ZS',     # Arable land (% of land area)

    'IQ.CPA.PADM.XQ',     # CPIA quality of public administration rating (1=low to 6=high)
    'IQ.CPA.GNDR.XQ',     # CPIA gender equality rating (1=low to 6=high)
]

# Indicator codes to skip without having to edit the list above.
exclusions_list = []

# Three-letter codes of the countries kept for the analysis.
sahel_countries = ['BFA', 'CMR', 'TCD', 'GMB', 'MLI', 'MRT', 'NER', 'NGA', 'SEN', 'CAF']

# Remove the exclusions while keeping the declaration order. A plain set
# difference would give an arbitrary order that can change from one kernel
# session to the next, making the downloaded frame's column order
# non-reproducible. dict.fromkeys drops duplicates (as the set did) but keeps
# first-seen order.
_excluded = set(exclusions_list)
filtered_indices = [code for code in dict.fromkeys(indices_to_crawl) if code not in _excluded]
filtered_indices

In [None]:
# Human-readable description for each indicator code, used as plot titles/labels.
# Insertion order matters: it is the order in which the indicators get plotted.
indices_to_crawl_text = {
    # --- displacement, governance, health, economy, access to services ---
    "VC.IDP.TOCV": "Internally displaced persons, total displaced by conflict and violence (number of people)",
    "IQ.CPA.TRAN.XQ": "CPIA transparency, accountability, and corruption in the public sector rating (1=low to 6=high)",
    "SP.DYN.AMRT.FE": "Mortality rate, adult, female (per 1,000 female adults)",
    "SP.DYN.AMRT.MA": "Mortality rate, adult, male (per 1,000 male adults)",
    "SN.ITK.SVFI.ZS": "Prevalence of severe food insecurity in the population (%)",
    "NY.GDP.PCAP.CD": "GDP per capita (current US$)",
    "NY.GDP.DEFL.KD.ZG": "Inflation, GDP deflator (annual %)",
    "SL.UEM.TOTL.FE.ZS": "Unemployment, female (% of female labor force) (modeled ILO estimate)",
    "SL.UEM.TOTL.MA.ZS": "Unemployment, male (% of male labor force) (modeled ILO estimate)",
    "EG.ELC.ACCS.ZS": "Access to electricity (% of population)",
    "ER.H2O.FWST.ZS": "Level of water stress: freshwater withdrawal as a proportion of available freshwater resources",
    # --- demography ---
    "SP.POP.TOTL": "Total Population",
    "SP.POP.GROW": "Population growth (annual %)",
    "EN.POP.DNST": "Population density (people per sq. km of land area)",
    "SP.RUR.TOTL": "Total Rural Population",
    "SP.DYN.CBRT.IN": "Birth rate, crude (per 1,000 people)",
    "SP.DYN.LE00.IN": "Life expectancy at birth, total (years)",
    "SL.UEM.TOTL.ZS": "Unemployment, total (% of total labor force) (modeled ILO estimate)",
    "SH.STA.WASH.P5": "Mortality rate attributed to unsafe water, unsafe sanitation and lack of hygiene (per 100,000 population)",
    # --- trade ---
    "NE.EXP.GNFS.ZS": "Exports of goods and services (% of GDP)",
    "NE.IMP.GNFS.ZS": "Imports of goods and services (% of GDP)",
    # --- agriculture ---
    "AG.YLD.CREL.KG": "Cereal yield (kg per hectare)",
    "SL.AGR.EMPL.ZS": "Employment in agriculture (% of total employment) (modeled ILO estimate)",
    # Disabled candidates (no entry in the mapping):
    #   VC.IDP.NWDS    - Internally displaced persons, new displacement associated with disasters (number of cases)
    #   VC.IDP.NWCV    - Internally displaced persons, new displacement associated with conflict and violence (number of cases)
    #   AG.LND.PRCP.MM - Average precipitation in depth (mm per year)
    #   EN.CLC.MDAT.ZS - Droughts, floods, extreme temperatures (% of population, average 1990-2009)
    #   SI.POV.GINI    - Gini index (World Bank estimate)
    #   HD.HCI.OVRL    - Human Capital Index (HCI) (scale 0-1)
    "SE.PRM.CMPT.ZS": "Primary completion rate, total (% of relevant age group)",
    "AG.LND.ARBL.ZS": "Arable land (% of land area)",
    # --- CPIA ratings ---
    "IQ.CPA.PADM.XQ": "CPIA quality of public administration rating (1=low to 6=high)",
    "IQ.CPA.GNDR.XQ": "CPIA gender equality rating (1=low to 6=high)",
}

In [None]:
# Download each selected indicator as its own series (keyed by indicator code),
# then assemble the series column-wise into one frame.
# NOTE(review): the keyword below is spelled `gapfil`; the World Bank API documents
# a `gapfill` parameter. Presumably a typo that the API ignores -- confirm before
# changing it, since correcting the spelling may alter the returned data.
downloaded_series = {}
for serie_id in tqdm(filtered_indices):
    downloaded_series[serie_id] = wb.get_series(
        serie_id,
        id_or_value='id',
        simplify_index=True,
        date='2000:2022',
        gapfil='Y',
    )

all_values = pd.DataFrame(downloaded_series)
all_values

In [None]:

# Turn the index levels into ordinary columns so rows can be filtered on 'Country'.
# Guarded so that re-running this cell does not reset an already-flat frame a
# second time, which would add a spurious 'index' column.
if 'Country' not in all_values.columns:
    all_values = all_values.reset_index()

# Keep only the study countries and index them by (Country, Year) for display.
sahel_values = (
    all_values[all_values['Country'].isin(sahel_countries)]
    .set_index(['Country', 'Year'])
)
sahel_values

In [None]:
# Flatten the (Country, Year) index and express conflict displacement per 1,000
# inhabitants.
#
# The (Country, Year) MultiIndex doubles as the "not processed yet" marker: once
# this cell has run the frame has a flat index, so running the cell again is a
# no-op instead of dividing by the population a second time or adding an
# 'index' column.
if isinstance(sahel_values.index, pd.MultiIndex):
    sahel_values = sahel_values.reset_index()
    # Divide by the population column of the *same* frame so rows stay aligned.
    # The previous code read the population from `dfcluster`, which does not
    # exist at this point of the notebook on a fresh kernel.
    sahel_values['VC.IDP.TOCV'] = (
        1000 * sahel_values['VC.IDP.TOCV'] / sahel_values['SP.POP.TOTL']
    )


In [None]:
from IPython.display import HTML, display

# Inject a CSS rule that forces elements of class `container` to full width.
# NOTE(review): presumably targets the classic Notebook page layout -- confirm it
# has any effect in the front-end actually used.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [None]:
def _plot_indicator(data, code, description):
    """Draw one figure for a single indicator, split by country.

    The left panel shows the indicator over time with one line per country; the
    right panel shows its distribution as one violin per country, with a rug of
    the individual observations along the value axis.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with 'Country' and 'date' columns plus a column named ``code``.
    code : str
        Indicator code, i.e. the column of ``data`` to plot.
    description : str
        Human-readable indicator name, used in the figure title.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was created (not yet shown or closed).
    """
    fig, (ts_ax, dist_ax) = plt.subplots(1, 2, figsize=(25, 5))
    fig.suptitle("%s : %s" % (code, description))

    sns.lineplot(x='date', y=code, hue='Country', data=data, ax=ts_ax)
    sns.violinplot(x=code, y='Country', data=data, ax=dist_ax)
    sns.rugplot(x=code, data=data, height=.03, color='darkblue', ax=dist_ax)

    ts_ax.set_title('Time Series data')
    dist_ax.set_title('Violin plot')

    fig.tight_layout()
    return fig


# Build a proper timestamp (1 January of each year) for the time axis.
# Vectorised replacement for a row-wise apply; assigning the column is idempotent.
sahel_values['date'] = pd.to_datetime(
    sahel_values['Year'].astype(int).astype(str), format='%Y'
)

sns.set_style("darkgrid")

# One figure per indicator. The previous version allocated a single
# (n_indicators x 2) grid but always drew on its first row, because the row
# counter was reset for every indicator and only advanced after the country
# loop; every plot was stacked on the same two axes.
for index, var in indices_to_crawl_text.items():
    # Skip indicators that were not downloaded or that have no observation at
    # all for the selected countries -- there is nothing to draw.
    if index not in sahel_values.columns or not sahel_values[index].notna().any():
        continue

    fig = _plot_indicator(sahel_values, index, var)
    plt.show()
    # Release the figure so that looping over many indicators does not keep
    # every figure alive in memory.
    plt.close(fig)

In [None]:
# Pairwise scatter-plot matrix of the columns of `dfcluster`, coloured by 'Country'.
# NOTE(review): `dfcluster` is not defined anywhere above this cell in the visible
# notebook, so on a fresh kernel (Restart & Run All) this raises NameError unless
# it is created elsewhere -- confirm where it comes from and define it before use.
sns.pairplot(dfcluster,hue='Country')