## World Development Indicators - Analytics

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import Imputer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer

# Load the indicator records and discard the 'Unnamed: 0' column.
# `axis` is passed by keyword: the positional form `drop(label, 1)` is
# deprecated in pandas and rejected by recent releases.
datos = pd.read_csv("suramerica.csv").drop('Unnamed: 0', axis=1)
datos.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92


In [5]:
def r2(y_true, y_predict):
    """Return the R2 score of `y_predict` against `y_true`.

    Thin wrapper around `r2_score`, kept as a named function so it can be
    handed to `make_scorer`.
    """
    score = r2_score(y_true, y_predict)
    return score

In [6]:
def imputador(tab):
    """Return `tab` as a DataFrame with NaN cells replaced by their column mean.

    The result is built from the transformed array, so it carries default
    integer row and column labels rather than those of `tab`.
    """
    imputer_media = Imputer(missing_values="NaN", strategy='mean', axis=0)
    return pd.DataFrame(imputer_media.fit(tab).transform(tab))

In [7]:
def zeros(tab):
    """Fill NaN with 0 and drop every column that is 0 in all rows.

    A column left entirely at 0 has no usable record for any of the
    selected rows, so it is discarded.
    """
    relleno = pd.DataFrame(tab).fillna(value=0)
    # Keep only the columns holding at least one non-zero cell.
    con_datos = (relleno != 0).any(axis=0)
    return relleno.loc[:, con_datos]

In [8]:
def Grid_Search_CV(X_train, y_train):
    """Return the `max_depth` of the best decision tree found by grid search.

    The grid covers depths 1..10 combined with both splitter strategies and
    is scored with R2 over 10 shuffled splits that hold out 20% of the rows
    (fixed `random_state=0`).

    Parameters
    ----------
    X_train : 2-D array of features.
    y_train : 1-D array of target values.

    Returns
    -------
    The `max_depth` value of the best parameter combination.
    """
    cv = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size=0.2, random_state=0)
    estimator = DecisionTreeRegressor()
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'splitter': ['best', 'random']}
    scoring_func = make_scorer(r2)

    grid = GridSearchCV(estimator, params, cv=cv, scoring=scoring_func)
    grid.fit(X_train, y_train)

    # Look the depth up by key. Iterating over best_params_ and keeping the
    # last value depends on dict ordering and can return the 'splitter'
    # string instead of the depth.
    return grid.best_params_['max_depth']

In [9]:
def DTR_v2(X_train, X_test, y_train, y_test, depth):
    """Fit a depth-limited decision tree on the train split and score it on the test split.

    Returns the R2 of the tree's predictions for `X_test` against `y_test`.
    """
    arbol = DecisionTreeRegressor(max_depth=depth)
    arbol.fit(X_train, y_train)
    return r2(y_test, arbol.predict(X_test))

In [10]:
def correlation_matrix(df):
    """Show the correlation matrix of `df` as a colour-coded image with a colour bar."""
    from matplotlib import pyplot as plt
    from matplotlib import cm as cm

    figura = plt.figure()
    eje = figura.add_subplot(111)
    # 'jet' colour map quantised to 30 levels.
    paleta = cm.get_cmap('jet', 30)
    imagen = eje.imshow(df.corr(), interpolation="nearest", cmap=paleta)
    plt.title('Matriz de correlacion')
    figura.colorbar(imagen)
    plt.show()

In [11]:
def correlacion(tab,y_indicator):
    """Return `tab` without the columns strongly correlated with column `y_indicator`.

    A column is removed when the absolute value of its correlation
    coefficient with column `y_indicator` is greater than 0.7. `tab` is
    indexed as a 2-D array.
    """
    coeficientes = np.array(pd.DataFrame(tab).corr())
    # Positions of the columns whose |coefficient| against the reference exceeds 0.7.
    a_eliminar = [
        columna
        for columna in range(np.shape(tab)[1])
        if abs(coeficientes[columna, y_indicator]) > 0.7
    ]

    if not a_eliminar:
        return tab[:,:]

    # One call removes all flagged columns, so no index shifting is needed.
    return np.delete(tab[:,:], a_eliminar, 1)

In [12]:
def tabla_base(paises,years,indicadores,datos):
    """Pivot `datos` into one row per (CountryName, Year) and one column per IndicatorCode.

    Cells hold the pivoted 'Value'. The result is restricted to the given
    countries, years and indicator codes.
    """
    pivote = datos.pivot_table(
        values='Value',
        index=['CountryName', 'Year'],
        columns=['IndicatorCode'],
    )
    filas = (paises, years)
    return pivote.loc[filas, indicadores]

## Definimos Países, Años y Conjuntos de Indicadores

In [74]:
paises = ['Antigua and Barbuda','Argentina','Aruba','Bahamas, The','Barbados','Belize','Bolivia','Brazil','Cayman Islands','Chile','Colombia','Costa Rica','Cuba','Curacao','Dominica','Dominican Republic','Ecuador','El Salvador','Grenada','Guatemala','Guyana','Haiti','Honduras','Jamaica','Mexico','Nicaragua','Panama','Paraguay','Peru','Puerto Rico','Sint Maarten (Dutch part)','St. Kitts and Nevis','St. Lucia','St. Martin (French part)','St. Vincent and the Grenadines','Suriname','Trinidad and Tobago','Turks and Caicos Islands','Uruguay','Venezuela, RB','Virgin Islands (U.S.)']
years = range(2008,2010+1)

# Indicator *names*: only used to display the GDP pivot keyed by IndicatorName.
gdp = ['GDP (constant LCU)','GDP (current LCU)','GDP growth (annual %)']

# Indicator *codes*: tabla_base() selects columns by IndicatorCode, so every
# set placed in `conjunto` must be a list of codes.
gdp_codigos = ['NY.GDP.MKTP.KN','NY.GDP.MKTP.CN','NY.GDP.MKTP.CD','NY.GDP.MKTP.KD','NY.GDP.DEFL.ZS','NY.GDP.MKTP.KD.ZG','NY.GDP.PCAP.KD','NY.GDP.PCAP.KN','NY.GDP.PCAP.CN','NY.GDP.PCAP.CD','NY.GDP.PCAP.KD.ZG','NY.GDP.PCAP.PP.KD','NY.GDP.PCAP.PP.CD','NY.GDP.MKTP.PP.KD','NY.GDP.MKTP.PP.CD']
gross = ['NE.GDI.TOTL.ZS','NE.GDI.TOTL.KD.ZG','NE.GDI.TOTL.KD','NE.GDI.TOTL.KN','NE.GDI.TOTL.CN','NE.GDI.TOTL.CD','NY.GDY.TOTL.KN','NY.GDS.TOTL.ZS','NY.GDS.TOTL.CN','NY.GDS.TOTL.CD','NE.GDI.FTOT.ZS','NE.GDI.FTOT.KD.ZG','NE.GDI.FTOT.KD','NE.GDI.FTOT.KN','NE.GDI.FTOT.CN','NE.GDI.FTOT.CD','NE.GDI.FPRV.ZS','NE.GDI.FPRV.CN','NE.DAB.TOTL.ZS','NE.DAB.TOTL.KD','NE.DAB.TOTL.KN','NE.DAB.TOTL.CN','NE.DAB.TOTL.CD','NE.DAB.DEFL.ZS','NY.GNS.ICTR.ZS','NY.GNS.ICTR.GN.ZS','NY.GNS.ICTR.CN','NY.GNS.ICTR.CD','NY.GDP.FCST.KD','NY.GDP.FCST.KN','NY.GDP.FCST.CN','NY.GDP.FCST.CD']
gni = ['NY.GNP.MKTP.KD','NY.GNP.MKTP.KN','NY.GNP.MKTP.CN','NY.GNP.MKTP.CD','NY.GNP.MKTP.KD.ZG','NY.GNP.PCAP.KD','NY.GNP.PCAP.KN','NY.GNP.PCAP.CN','NY.GNP.PCAP.KD.ZG','NY.GNP.PCAP.CD','NY.GNP.PCAP.PP.KD','NY.GNP.PCAP.PP.CD','NY.GNP.ATLS.CD','NY.GNP.MKTP.PP.KD','NY.GNP.MKTP.PP.CD']

# `conjunto_nombre[k]` is the display label of `conjunto[k]`.
conjunto_nombre = ['GDP','Gross','GNI']
conjunto = [gdp_codigos,gross,gni]

In [79]:
# GDP indicators by name: one row per country, one column per (Year, IndicatorName).
(
    datos
    .pivot_table(values='Value', index=['CountryName'], columns=['Year', 'IndicatorName'])
    .loc[paises, (years, gdp)]
)

Year,2008,2008,2008,2009,2009,2009,2010,2010,2010
IndicatorName,GDP (constant LCU),GDP (current LCU),GDP growth (annual %),GDP (constant LCU),GDP (current LCU),GDP growth (annual %),GDP (constant LCU),GDP (current LCU),GDP growth (annual %)
CountryName,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Antigua and Barbuda,3358406000.0,3637845000.0,0.071111,2954188000.0,3257308000.0,-12.036015,2743171000.0,3065955000.0,-7.142991
Argentina,705864700000.0,1283906000000.0,3.074946,706217800000.0,1411526000000.0,0.050024,772966600000.0,1810830000000.0,9.451578
Aruba,3293180000.0,4997610000.0,-6.881302,3107000000.0,4473090000.0,-5.653502,,4417190000.0,
"Bahamas, The",7893019000.0,8246650000.0,-2.323937,7563465000.0,7820420000.0,-4.175259,7679850000.0,7909580000.0,1.538779
Barbados,1141000000.0,9190000000.0,0.440141,1095000000.0,9204000000.0,-4.031551,1098000000.0,8891000000.0,0.273973
Belize,2353310000.0,2737250000.0,3.229101,2370095000.0,2673914000.0,0.713251,2448874000.0,2794227000.0,3.323871
Bolivia,30277830000.0,120693800000.0,6.148497,31294250000.0,121726700000.0,3.357001,32585680000.0,137875600000.0,4.126723
Brazil,1603198000000.0,3107530000000.0,5.019316,1599415000000.0,3328174000000.0,-0.235978,1720523000000.0,3886835000000.0,7.572067
Cayman Islands,,2873637000.0,,,2703056000.0,,,2646332000.0,
Chile,93847930000000.0,93847930000000.0,3.292455,92875260000000.0,96443760000000.0,-1.036432,98219030000000.0,110998700000000.0,5.753709


## Calculamos Registros

Calculamos la cantidad de registros no nulos (Non-NaN) de cada conjunto de indicadores para los países y años deseados.

In [41]:
def registros(conjunto):
    """Return, for each indicator set in `conjunto`, its number of non-null cells.

    Each set is looked up through `tabla_base` using the notebook-level
    `paises`, `years` and `datos`.
    """
    return [
        np.sum(tabla_base(paises, years, indicadores, datos).count(axis=1))
        for indicadores in conjunto
    ]

In [47]:
def graficar_registros(reg):
    """Plot `reg` as a bar chart with one bar per entry of `conjunto_nombre`.

    Returns whatever `py.iplot` returns for the 'Registros' figure.
    """
    barras = go.Bar(x=conjunto_nombre, y=reg)
    return py.iplot([barras], filename='Registros')

In [51]:
# Bar chart of the non-null record count of each indicator set.
graficar_registros(registros(conjunto))

## Splitter v3

Predecir desde un grupo de indicadores a otro grupo de indicadores

In [27]:
def splitter_v3(tabla_1,tabla_2,y_indicator,test_size=0.2,random_state=None):
    """Build a train/test split that predicts one column of `tabla_2` from `tabla_1`.

    Parameters
    ----------
    tabla_1 : 2-D array of candidate features.
    tabla_2 : 2-D array holding the target columns.
    y_indicator : position of the target column inside `tabla_2`.
    test_size : share of rows held out for testing (default 0.2, the value
        that used to be hard-coded).
    random_state : seed forwarded to `train_test_split`. The default `None`
        keeps the previous unseeded behaviour; pass an int for a
        reproducible split.

    Returns
    -------
    (X_train, X_test, y_train, y_test)
    """
    # Append the target as the last column of the feature table.
    tab_fusion = np.column_stack((tabla_1[:,:],tabla_2[:,y_indicator]))

    # correlacion() drops every column whose |correlation| with the last
    # column exceeds 0.7.
    X = correlacion(tab_fusion,-1)
    y = tab_fusion[:,-1]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

## Iteramos el Splitter v3

In [28]:
def iter_R2_Splitter_v3(tab1,tab2):
    """Score how well `tab1` predicts each column of `tab2`, under two NaN treatments.

    For every column position of `tab2`, a decision tree (depth chosen by
    `Grid_Search_CV`) is trained and scored twice: once on the zero-filled
    tables ('Original') and once on the mean-imputed tables ('Imputado').

    Returns
    -------
    DataFrame with columns ['Original', 'Imputado'] and one row of R2 scores
    per column position of `tab2`.
    """
    columnas = ['Original','Imputado']
    n_objetivos = np.shape(tab2)[1]
    if n_objetivos == 0:
        # Nothing to predict: skip the preprocessing as well.
        return pd.DataFrame([], columns=columnas)

    # The cleaned tables do not depend on the target index, so build them
    # once instead of on every iteration.
    # NOTE(review): the loop indexes the cleaned targets by the column count
    # of the raw `tab2`; zeros() drops all-zero columns, so confirm the
    # cleaned tables keep the same columns.
    variantes = (
        (np.array(zeros(tab1)), np.array(zeros(tab2))),          # 'Original'
        (np.array(imputador(tab1)), np.array(imputador(tab2))),  # 'Imputado'
    )

    R2_global = []
    for i in range(n_objetivos):
        fila = []
        for base, objetivo in variantes:
            X_train, X_test, y_train, y_test = splitter_v3(base, objetivo, i)
            best_max_depth = Grid_Search_CV(X_train, y_train)
            fila.append(DTR_v2(X_train, X_test, y_train, y_test, best_max_depth))
        R2_global.append(fila)

    return pd.DataFrame(R2_global,columns=columnas)

## Porcentaje

Calculamos la fracción de indicadores cuyo R² de predicción es mayor a 0.9 en al menos una de las dos variantes (Original o Imputado).

In [29]:
def porcentaje(valor):
    """Return the fraction of rows of `valor` with a score above 0.9 in either of the first two columns."""
    aciertos = sum(
        (1.0 for fila in np.array(valor) if fila[0] > 0.9 or fila[1] > 0.9),
        0.0,
    )
    return aciertos/len(valor)

In [30]:
def iterador_global(datos,paises,years,indicadores_1,indicadores_2):
    """Return the R2 table for predicting the `indicadores_2` columns from the `indicadores_1` columns.

    Both tables are built with `tabla_base` for the same countries and
    years, then passed to `iter_R2_Splitter_v3`.
    """
    tabla_origen = tabla_base(paises, years, indicadores_1, datos)
    tabla_destino = tabla_base(paises, years, indicadores_2, datos)
    return iter_R2_Splitter_v3(tabla_origen, tabla_destino)

## Predecir desde un conjunto de indicadores (GDP)

In [31]:
start_time = time.time()

# The first set is the fixed base; every set (itself included) takes a turn as target.
resultado = []
for i in range(len(conjunto)):
    fraccion = porcentaje(iterador_global(datos, paises, years, conjunto[0], conjunto[i]))
    resultado.append([conjunto_nombre[0], conjunto_nombre[i], fraccion])

print("--- %s seconds ---" % (time.time() - start_time))
pd.DataFrame(resultado, columns=["Base","Target","% > 90%"])

--- 115.069999933 seconds ---


Unnamed: 0,Base,Target,% > 90%
0,GDP,GDP,0.666667
1,GDP,Gross,0.59375
2,GDP,GNI,0.8


In [32]:
import plotly.plotly as py
import plotly.graph_objs as go

# One bar per target set. The values are read straight from the rows of
# `resultado`: np.array(resultado) mixes str and float, which would turn
# every fraction into a string.
data = [go.Bar(
            x=[fila[1] for fila in resultado],
            y=[fila[2] for fila in resultado]
    )]

py.iplot(data, filename='GDP')

## Iteramos todas las combinaciones

In [33]:
start_time = time.time()

# Every (base, target) pair of sets, base-major: row i*len(conjunto)+j is base i, target j.
resultado = []
for i in range(len(conjunto)):
    for j in range(len(conjunto)):
        fraccion = porcentaje(iterador_global(datos, paises, years, conjunto[i], conjunto[j]))
        resultado.append([conjunto_nombre[i], conjunto_nombre[j], fraccion])

print("--- %s seconds ---" % (time.time() - start_time))
pd.DataFrame(resultado, columns=["Base","Target","% > 90%"])

--- 297.601999998 seconds ---


Unnamed: 0,Base,Target,% > 90%
0,GDP,GDP,0.6
1,GDP,Gross,0.6875
2,GDP,GNI,0.6
3,Gross,GDP,0.466667
4,Gross,Gross,0.59375
5,Gross,GNI,0.533333
6,GNI,GDP,0.666667
7,GNI,Gross,0.6875
8,GNI,GNI,0.8


## Graficamos resultados

In [34]:
import plotly.plotly as py
import plotly.graph_objs as go

# Numeric fractions, in the row order of `resultado` (base-major: rows 0-2
# have base GDP, 3-5 base Gross, 6-8 base GNI). They are read directly from
# the rows because np.array(resultado) mixes str and float and would turn
# the fractions into strings.
valores = [fila[2] for fila in resultado]

# Each trace is one target set; its three bars are the three base sets.
trace0 = go.Bar(
    x=conjunto_nombre,
    y=[valores[0], valores[3], valores[6]],
    name='GDP')

trace1 = go.Bar(
    x=conjunto_nombre,
    y=[valores[1], valores[4], valores[7]],
    name='Gross')

trace2 = go.Bar(
    x=conjunto_nombre,
    y=[valores[2], valores[5], valores[8]],
    name='GNI')

data = [trace0, trace1, trace2]
layout = go.Layout(barmode='group')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Resultado 2')

## Mapa de calor

In [35]:
# Heatmap of the fraction per (base, target) pair. Each column of `resultado`
# is read directly from its rows so that `z` stays numeric
# (np.array(resultado) would coerce the fractions to strings).
data = [
        go.Heatmap(
            z=[fila[2] for fila in resultado],
            x=[fila[1] for fila in resultado],
            y=[fila[0] for fila in resultado],
            colorscale='Viridis'
        )
    ]
py.iplot(data, filename='Resultado Global')