## Colombia World Development Indicators Analytics

In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go

# Load the indicator records and discard the 'Unnamed: 0' column.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in `drop`.
datos = pd.read_csv("suramerica.csv").drop('Unnamed: 0', axis=1)
paises = ['COL']
pais_nombre = 'Colombia'  # used in the titles of the result heatmaps

# Run parameters. NOTE(review): apart from `search` and `pais_nombre` (plot titles),
# these are not read by the cells visible here -- presumably they mirror the
# settings of the experiment that produced the results file; confirm.
preprocessing = 'imput'
search = 'original'
year_init = 2015
year_range = 15
look_back = 3
# The `year_range` years ending at `year_init`, newest first (2015, 2014, ..., 2001).
years = range(year_init-year_range+1,year_init+1)[::-1]

def _leer_indicadores(ruta):
    """Return the comma-separated entries of the text file at ``ruta`` as a list.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the content has been read. The entries are returned exactly as stored.
    NOTE(review): no whitespace is stripped, so a trailing newline in a file
    would stay attached to its last entry -- confirm the files have none.
    """
    with open(ruta) as archivo:
        return archivo.read().split(',')


# One list of entries per indicator set; they are matched against the
# 'IndicatorCode' column when the indicators are categorised.
agricultura     = _leer_indicadores("Indicadores/iagricultura.txt")
ambiente        = _leer_indicadores("Indicadores/iambiente.txt")
ayuda           = _leer_indicadores("Indicadores/iayuda.txt")
ciencia         = _leer_indicadores("Indicadores/iciencia.txt")
clima           = _leer_indicadores("Indicadores/iclima.txt")
comercio        = _leer_indicadores("Indicadores/icomercio.txt")
deuda           = _leer_indicadores("Indicadores/ideuda.txt")
economia        = _leer_indicadores("Indicadores/ieconomia.txt")
educacion       = _leer_indicadores("Indicadores/ieducacion.txt")
energia         = _leer_indicadores("Indicadores/ienergia.txt")
finanzas        = _leer_indicadores("Indicadores/ifinanzas.txt")
genero          = _leer_indicadores("Indicadores/igenero.txt")
infraestructura = _leer_indicadores("Indicadores/iinfraestructura.txt")
pobreza         = _leer_indicadores("Indicadores/ipobreza.txt")
privado         = _leer_indicadores("Indicadores/iprivado.txt")
publico         = _leer_indicadores("Indicadores/ipublico.txt")
salud           = _leer_indicadores("Indicadores/isalud.txt")
social          = _leer_indicadores("Indicadores/isocial.txt")
trabajo         = _leer_indicadores("Indicadores/itrabajo.txt")
urbano          = _leer_indicadores("Indicadores/iurbano.txt")

# Each display name is paired with its list of indicator entries so the two
# parallel lists below cannot drift out of step.
_conjuntos_por_nombre = [
    ('Agricultura', agricultura),
    ('Ambiente', ambiente),
    ('Ayuda', ayuda),
    ('Ciencia', ciencia),
    ('Clima', clima),
    ('Comercio', comercio),
    ('Deuda', deuda),
    ('Economia', economia),
    ('Educacion', educacion),
    ('Energia', energia),
    ('Finanzas', finanzas),
    ('Genero', genero),
    ('Infraestructura', infraestructura),
    ('Pobreza', pobreza),
    ('Privado', privado),
    ('Publico', publico),
    ('Salud', salud),
    ('Social', social),
    ('Trabajo', trabajo),
    ('Urbano', urbano),
]

# `conjunto_nombre[i]` is the label of the indicator list `conjunto[i]`.
_nombres, _listas = zip(*_conjuntos_por_nombre)
conjunto_nombre = list(_nombres)
conjunto = list(_listas)

# Preview the first five rows of the loaded records.
datos.head(5)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92


## Categorizamos los Indicadores

Categorizamos los Indicadores en sus respectivos conjuntos

In [2]:
def categorizar_indicadores(df):
    """Label every row of ``df`` with the indicator set its code belongs to.

    For each list in the module-level ``conjunto``, the rows whose
    'IndicatorCode' appears in that list get the matching name from
    ``conjunto_nombre`` written to the 'Set' column. Sets are applied in list
    order, so a code present in several lists keeps the last matching name.

    The frame is modified in place and the same object is returned.
    """
    for posicion, codigos in enumerate(conjunto):
        pertenece = df['IndicatorCode'].isin(codigos)
        df.loc[pertenece, 'Set'] = conjunto_nombre[posicion]
    return df

In [3]:
# `categorizar_indicadores` writes the 'Set' column into the frame it receives
# and returns that same frame, so `df` and `datos` refer to one object.
df = categorizar_indicadores(df=datos)
df.head(5)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,Set
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144,Social
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117,Salud
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958,Salud
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159,Salud
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92,Salud


## Graficamos Registros Paises

Graficamos la cantidad de registros de indicadores por año para cada uno de los países de Suramérica

In [4]:
def registros_country_year(df):
    """Count the non-null entries of ``df`` per (CountryName, Year) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CountryName', 'Year', 'CountryCode', 'IndicatorName',
        'IndicatorCode' and 'Set'.

    Returns
    -------
    pandas.DataFrame
        Indexed by (CountryName, Year). Columns are two-level,
        ``(original column, 'count')``, for every column other than the
        grouping keys and the four dropped below (e.g. ``('Value', 'count')``).
    """
    conteos = df.groupby(['CountryName','Year']).agg(['count'])
    # `axis` must be a keyword: pandas 2.0 removed the positional form of `drop`.
    return conteos.drop(['CountryCode','IndicatorName','IndicatorCode','Set'], axis=1)

In [5]:
def graficar_registros_paises(df):
    """Plot a heatmap of record counts by year (x) and country (y).

    ``df`` is expected to be indexed by 'CountryName' and 'Year' and to hold a
    'Value' column group, as produced by ``registros_country_year``.
    Returns whatever ``py.iplot`` returns for the figure.
    """
    # Move the index levels back into columns once and reuse the flat frame.
    plano = df.reset_index(col_level=0)

    mapa = go.Heatmap(
        z=plano['Value'].values,
        x=plano['Year'],
        y=plano['CountryName'],
        colorscale='Viridis',
    )

    fuente_ejes = dict(size=18)
    diseno = go.Layout(
        title='Records of Indicators for each Country since 1960 to 2015',
        xaxis=dict(title='Year', titlefont=fuente_ejes),
        yaxis=dict(title='Country', titlefont=fuente_ejes),
    )

    figura = go.Figure(data=[mapa], layout=diseno)
    return py.iplot(figura, filename='Records_Country_Year')

In [6]:
# Count the records per country and year, then draw them as a heatmap.
registros_por_pais = registros_country_year(df)
graficar_registros_paises(registros_por_pais)

## Graficamos Registros Conjuntos

Graficamos la cantidad de registros por año para cada conjunto de indicadores de Colombia

In [7]:
def registros_set_year(df,pais):
    """Count the non-null entries per (Set, Year) pair for a single country.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'CountryCode', 'Set', 'Year', 'CountryName',
        'IndicatorName' and 'IndicatorCode'.
    pais : str
        Value of 'CountryCode' whose rows are kept (e.g. ``'COL'``).

    Returns
    -------
    pandas.DataFrame
        Indexed by (Set, Year). Columns are two-level,
        ``(original column, 'count')``, for every column other than the
        grouping keys and the four dropped below (e.g. ``('Value', 'count')``).
    """
    del_pais = df[df['CountryCode'] == pais]
    conteos = del_pais.groupby(['Set','Year']).agg(['count'])
    # `axis` must be a keyword: pandas 2.0 removed the positional form of `drop`.
    return conteos.drop(['CountryName','CountryCode','IndicatorName','IndicatorCode'], axis=1)

In [8]:
def graficar_registros_conjuntos(df,pais):
    """Plot a heatmap of record counts by year (x) and indicator set (y).

    ``df`` is expected to be indexed by 'Set' and 'Year' and to hold a 'Value'
    column group, as produced by ``registros_set_year``. ``pais`` is the
    country name inserted into the title, which is also used as the plot's
    file name. Returns whatever ``py.iplot`` returns for the figure.
    """
    # Move the index levels back into columns once and reuse the flat frame.
    plano = df.reset_index(col_level=0)
    titulo = 'Records of Indicators for each Set for '+pais+' since 1960 to 2015'

    mapa = go.Heatmap(
        z=plano['Value'].values,
        x=plano['Year'],
        y=plano['Set'],
        colorscale='Viridis',
    )

    fuente_ejes = dict(size=18)
    diseno = go.Layout(
        title=titulo,
        xaxis=dict(title='Year', titlefont=fuente_ejes),
        yaxis=dict(title='Set', titlefont=fuente_ejes),
    )

    figura = go.Figure(data=[mapa], layout=diseno)
    return py.iplot(figura, filename=titulo)

In [9]:
# Count Colombia's records per indicator set and year, then draw the heatmap.
registros_colombia = registros_set_year(df, pais='COL')
graficar_registros_conjuntos(registros_colombia, pais='Colombia')

## Cargamos Resultados para Colombia y parámetros Originales

Si eliminamos los indicadores con menos de 5 registros, ampliamos el espectro de años a 15 e iniciamos desde 2015 hacia atrás, obtenemos los siguientes resultados:

In [10]:
# Load the stored results and discard the 'Unnamed: 0' column.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally in `drop`.
resultados = pd.read_csv("colombia_original_log.csv").drop('Unnamed: 0', axis=1)
resultados

Unnamed: 0,Base,Target,Look Back,Reg,%,Time,Results
0,Agricultura,Agricultura,1,48,0.455277,10.584898,[[ 0. ]\n [ 0. ]\n [ 0.93549912]...
1,Agricultura,Agricultura,2,48,0.414852,13.457072,[[ 0. ]\n [ 0. ]\n [ 0.81843835]...
2,Agricultura,Agricultura,3,48,0.394925,13.106958,[[ 0.32661843]\n [ 0.60803299]\n [ 0.50717803]...
3,Agricultura,Ambiente,1,96,0.307334,16.575595,[[ 0. ]\n [ 0.74249732]\n [ 0. ]...
4,Agricultura,Ambiente,2,96,0.287166,18.022063,[[ 0. ]\n [ 0.85310254]\n [ 0. ]...
5,Agricultura,Ambiente,3,96,0.286223,18.956299,[[ 0.16877031]\n [ 0. ]\n [ 0. ]...
6,Agricultura,Ayuda,1,71,0.382280,17.504265,[[ 1. ]\n [ 0.29062546]\n [ 1. ]...
7,Agricultura,Ayuda,2,71,0.384664,18.287500,[[ 1. ]\n [ 0.88768056]\n [ 1. ]...
8,Agricultura,Ayuda,3,71,0.420092,19.652280,[[ 1. ]\n [ 0.83451842]\n [ 1. ]...
9,Agricultura,Ciencia,1,13,0.429908,5.675936,[[ 0.40123109]\n [ 0. ]\n [ 0.75798151]...


## Mejores Combinaciones

In [11]:
# Ten rows with the highest '%', renumbered from 0 and without the bulky
# 'Time' and 'Results' columns. `reset_index(drop=True)` discards the old
# index directly, and `axis` is a keyword because pandas 2.0 rejects the
# positional form of `drop`.
resultados.sort_values('%', ascending=False).reset_index(drop=True).drop(['Time','Results'], axis=1)[:10]

Unnamed: 0,Base,Target,Look Back,Reg,%
0,Social,Urbano,2,24,0.721845
1,Social,Urbano,3,24,0.693165
2,Urbano,Urbano,1,24,0.669267
3,Energia,Urbano,2,20,0.661917
4,Urbano,Salud,1,149,0.645482
5,Clima,Urbano,2,16,0.643453
6,Social,Salud,2,145,0.641142
7,Salud,Urbano,2,21,0.6394
8,Salud,Salud,1,152,0.637943
9,Economia,Urbano,2,24,0.637235


## Peores Combinaciones

In [12]:
# Ten rows with the lowest '%', renumbered from 0 and without the bulky
# 'Time' and 'Results' columns. `reset_index(drop=True)` discards the old
# index directly, and `axis` is a keyword because pandas 2.0 rejects the
# positional form of `drop`.
resultados.sort_values('%', ascending=True).reset_index(drop=True).drop(['Time','Results'], axis=1)[:10]

Unnamed: 0,Base,Target,Look Back,Reg,%
0,Agricultura,Publico,3,97,0.107455
1,Finanzas,Publico,2,95,0.111846
2,Infraestructura,Publico,2,97,0.112536
3,Infraestructura,Publico,3,97,0.117392
4,Trabajo,Publico,2,97,0.117819
5,Ayuda,Publico,3,97,0.121123
6,Deuda,Publico,1,97,0.123299
7,Energia,Publico,2,97,0.127654
8,Publico,Publico,3,97,0.13019
9,Genero,Publico,1,87,0.130502


## Graficamos Resultados Globales

El siguiente Heatmap nos muestra el promedio del Score R2 obtenido para cada conjunto de indicadores

In [13]:
def graficar_resultados(df,look_back):
    """Plot the '%' score of every Base/Target pair for one look-back value.

    Only the rows of ``df`` whose 'Look Back' equals ``look_back`` are drawn:
    'Base' on the x axis, 'Target' on the y axis, and '%' as the colour, on a
    fixed 0.0-1.0 scale. The title, built from the module-level ``pais_nombre``
    and ``search``, is also used as the plot's file name.
    Returns whatever ``py.iplot`` returns for the figure.
    """
    # Filter once and reuse the selection for all three heatmap axes.
    seleccion = df[df['Look Back'] == look_back]
    titulo = pais_nombre+' + '+search+': Total Heatmap with Look Back = '+str(look_back)

    mapa = go.Heatmap(
        z=seleccion['%'],
        x=seleccion['Base'],
        y=seleccion['Target'],
        zmax=1.0,
        zmin=0.0,
        colorscale='Viridis',
    )

    fuente_ejes = dict(size=18)
    diseno = go.Layout(
        title=titulo,
        xaxis=dict(title='Base', titlefont=fuente_ejes),
        yaxis=dict(title='Target', titlefont=fuente_ejes),
    )

    figura = go.Figure(data=[mapa], layout=diseno)
    return py.iplot(figura, filename=titulo)

In [14]:
# Heatmap of the results obtained with a look-back of 1.
graficar_resultados(resultados, look_back=1)

In [15]:
# Heatmap of the results obtained with a look-back of 2.
graficar_resultados(resultados, look_back=2)

In [16]:
# Heatmap of the results obtained with a look-back of 3.
graficar_resultados(resultados, look_back=3)

## Conclusiones

- Con el objetivo de encontrar los conjuntos con mayor grado de predictibilidad, determinamos el promedio de su rendimiento para dos casos:

 **Predictor:** Conjunto que predice a otro conjunto
 
 **Predecible:** Conjunto que es predicho por otro conjunto

In [17]:
def conjuntos_performance(df,look_back,perform,predict,rango):
    """Rank the groups of ``predict`` by their mean '%' for one look-back value.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table containing 'Look Back', '%' and the ``predict`` column.
    look_back : int
        Only rows whose 'Look Back' equals this value are considered.
    perform : bool
        Sort direction of the mean '%': ``True`` puts the lowest means first,
        ``False`` the highest.
    predict : str
        Column to group by (``'Base'`` or ``'Target'`` in this notebook).
    rango : int
        Number of leading group labels to return.

    Returns
    -------
    numpy.ndarray
        Up to ``rango`` group labels in sorted order.
    """
    filtrado = df[df['Look Back']==look_back]
    # Average only '%': the frame also holds text columns, and pandas 2.0
    # raises on a whole-frame groupby mean instead of silently skipping them.
    promedios = filtrado.groupby(predict)['%'].mean()
    return promedios.sort_values(ascending=perform).index.values[:rango]

In [18]:
def predecibles_predictores(df,look_back,rango):
    """Tabulate the top and bottom ``rango`` sets as predictors and as targets.

    Calls ``conjuntos_performance`` four times for the given ``look_back``:
    grouped by 'Base' (descending, then ascending) and by 'Target'
    (descending, then ascending). The four label arrays become the columns of
    the returned DataFrame, under the two-level headers
    Predictor/Best, Predictor/Worse, Predictable/Easy and Predictable/Hard.
    """
    mejores_predictores = conjuntos_performance(df,look_back,False,'Base',rango)
    peores_predictores = conjuntos_performance(df,look_back,True,'Base',rango)
    mas_predecibles = conjuntos_performance(df,look_back,False,'Target',rango)
    menos_predecibles = conjuntos_performance(df,look_back,True,'Target',rango)

    # One column per ranking, in the same order as the headers below.
    tabla = np.column_stack([
        mejores_predictores,
        peores_predictores,
        mas_predecibles,
        menos_predecibles,
    ])
    encabezados = [
        ['Predictor','Predictor','Predictable','Predictable'],
        ['Best','Worse','Easy','Hard'],
    ]
    return pd.DataFrame(tabla, columns=encabezados)

In [19]:
# Five best/worst predictor sets and five easiest/hardest targets for a look-back of 1.
predecibles_predictores(resultados, look_back=1, rango=5)

Unnamed: 0_level_0,Predictor,Predictor,Predictable,Predictable
Unnamed: 0_level_1,Best,Worse,Easy,Hard
0,Salud,Pobreza,Salud,Publico
1,Urbano,Deuda,Urbano,Pobreza
2,Economia,Publico,Economia,Educacion
3,Clima,Educacion,Agricultura,Genero
4,Ciencia,Trabajo,Privado,Energia
