## World Development Indicators - Analytics

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import Imputer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer

# Load the indicator data. The CSV carries an extra 'Unnamed: 0' column
# (presumably a row index written out when the file was saved), so drop it.
# `axis` is passed by keyword: newer pandas releases no longer accept it
# positionally in DataFrame.drop.
datos = pd.read_csv("suramerica.csv").drop('Unnamed: 0', axis=1)
datos.head()



Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92


In [2]:
def r2(y_true, y_predict):
    """Return the R^2 score of ``y_predict`` against ``y_true``.

    Thin wrapper around ``r2_score`` so the rest of the notebook has a single
    scoring entry point.
    """
    score = r2_score(y_true, y_predict)
    return score

In [3]:
def imputador(tab):
    """Fill missing values in ``tab`` with the mean of their column.

    Parameters
    ----------
    tab : 2-D table (DataFrame or array) that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame built from the imputed values. It is constructed from the
        transformer's output, so the row/column labels of ``tab`` are not
        carried over.
    """
    imputer = Imputer(missing_values="NaN", strategy='mean', axis=0)
    # fit + transform on the same table in one step.
    return pd.DataFrame(imputer.fit_transform(tab))

In [4]:
def zeros(tab):
    """Replace NaN with 0 and drop every column that is then entirely zero.

    Columns left with only zeros are indicators with no usable records for the
    selected countries and years, so they are discarded.

    Returns
    -------
    pandas.DataFrame
        The zero-filled table restricted to columns holding at least one
        non-zero value.
    """
    rellenada = pd.DataFrame(tab).fillna(value=0)
    # True for each column that has at least one non-zero entry.
    con_datos = rellenada.ne(0).any(axis=0)
    return rellenada.loc[:, con_datos]

In [5]:
def Grid_Search_CV(X_train, y_train):
    """Grid-search a ``DecisionTreeRegressor`` and return the best ``max_depth``.

    The search covers ``max_depth`` 1..10 and both ``splitter`` strategies,
    scored with R^2 over 10 shuffled 80/20 splits of the training data.

    Parameters
    ----------
    X_train : array-like with a ``shape`` attribute; training features.
    y_train : array-like; training target.

    Returns
    -------
    The ``max_depth`` value of the best parameter combination. The winning
    ``splitter`` is not returned.
    """
    # NOTE: this call uses the ShuffleSplit signature that takes the number of
    # samples and ``n_iter`` (the variant imported at the top of the notebook).
    cv = ShuffleSplit(X_train.shape[0], n_iter = 10, test_size=0.2, random_state=0)
    estimator = DecisionTreeRegressor()
    params = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'splitter': ['best', 'random']}

    grid = GridSearchCV(estimator, params, cv=cv, scoring="r2")

    grid.fit(X_train, y_train)

    # Read the depth by key. Looping over best_params_ and keeping the last
    # value depends on dict ordering and can return the 'splitter' string.
    return grid.best_params_['max_depth']

In [6]:
def DTR_v2(X_train, X_test, y_train, y_test, depth):
    """Fit a depth-limited decision tree and return its R^2 on the test split.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target used for scoring.
    depth : value passed as ``max_depth`` to ``DecisionTreeRegressor``.
    """
    modelo = DecisionTreeRegressor(max_depth=depth).fit(X_train, y_train)
    return r2(y_test, modelo.predict(X_test))

In [7]:
def correlation_matrix(df):
    """Show the pairwise correlation matrix of ``df`` as a colour-mapped image.

    Parameters
    ----------
    df : pandas.DataFrame whose ``corr()`` is plotted.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    from matplotlib import pyplot as plt

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # 'jet' resampled to 30 discrete colours. pyplot.get_cmap is used because
    # matplotlib.cm.get_cmap is deprecated and absent from recent releases.
    cmap = plt.get_cmap('jet', 30)
    cax = ax1.imshow(df.corr(), interpolation="nearest", cmap=cmap)
    plt.title('Matriz de correlacion')
    fig.colorbar(cax)
    plt.show()

In [8]:
def correlacion(tab,y_indicator):
    """Return ``tab`` without the target and its strongly correlated columns.

    A column is removed when the absolute value of its Pearson correlation
    with column ``y_indicator`` exceeds 0.7. The target column itself is
    always removed, even when its self-correlation is NaN (a constant
    column), so it can never leak into the feature matrix.

    Parameters
    ----------
    tab : 2-D numpy array, one column per indicator.
    y_indicator : int, column index of the target (negative indices allowed).

    Returns
    -------
    numpy.ndarray
        A new array with the remaining columns in their original order.
    """
    tabla_correlacion = np.array(pd.DataFrame(tab).corr())

    # NaN correlations compare as False, so those columns are kept; silence
    # the floating-point warning some numpy versions emit for that comparison.
    with np.errstate(invalid='ignore'):
        eliminar = np.abs(tabla_correlacion[:, y_indicator]) > 0.7

    # The target must never remain among the features.
    eliminar[y_indicator] = True

    return tab[:, ~eliminar]

In [9]:
def splitter_v2(tab,y_indicator):
    """Build an 80/20 train/test split for predicting one column of ``tab``.

    The target is column ``y_indicator``. The features are what
    ``correlacion`` returns for that target, i.e. ``tab`` without the columns
    that correlate strongly (|r| > 0.7) with it.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    objetivo = tab[:,y_indicator]
    caracteristicas = correlacion(tab,y_indicator)

    # No random_state is given, so the partition differs between calls.
    return tuple(train_test_split(caracteristicas, objetivo, test_size=0.2))

In [10]:
def splitter_v3(tabla_1,tabla_2,y_indicator):
    """Build an 80/20 split predicting a column of ``tabla_2`` from ``tabla_1``.

    Column ``y_indicator`` of ``tabla_2`` is appended to ``tabla_1`` as the
    last column and used as the target. The features are what ``correlacion``
    returns for that last column, i.e. the merged table without the columns
    that correlate strongly (|r| > 0.7) with the target.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # Merge the base table with the single target column; target ends up last.
    fusion = np.column_stack((tabla_1[:,:],tabla_2[:,y_indicator]))

    objetivo = fusion[:,-1]
    caracteristicas = correlacion(fusion,-1)

    # No random_state is given, so the partition differs between calls.
    return tuple(train_test_split(caracteristicas, objetivo, test_size=0.2))

In [11]:
def iter_R2_Splitter_v2(tab):
    """Score every column of ``tab`` as a target predicted from the others.

    For each column index two models are evaluated: one on the zero-filled
    table (``zeros``) and one on the mean-imputed table (``imputador``). Each
    evaluation splits the data, tunes ``max_depth`` by grid search and scores
    a decision tree on the held-out part.

    Parameters
    ----------
    tab : 2-D table (DataFrame) with one column per indicator.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``tab`` with the R^2 scores in columns
        'Original' (zero-filled) and 'Imputado' (mean-imputed).

    Raises
    ------
    ValueError
        If either preprocessing step changes the number of columns, because
        the column indices would no longer refer to the same indicators.
    """
    n_indicadores = np.shape(tab)[1]

    # Both preprocessed tables are independent of the target index, so build
    # them once instead of once per iteration.
    variantes = [np.array(zeros(tab)), np.array(imputador(tab))]

    # Fail before the expensive searches rather than with an IndexError
    # part-way through the loop.
    for nombre, variante in zip(('zeros', 'imputador'), variantes):
        if np.shape(variante)[1] != n_indicadores:
            raise ValueError(
                "%s() changed the number of columns (%d -> %d); "
                "target indices would be misaligned"
                % (nombre, n_indicadores, np.shape(variante)[1]))

    R2_global = list()

    for i in range(n_indicadores):
        fila = []
        # Order matters for the output columns: zero-filled first, imputed second.
        for variante in variantes:
            X_train, X_test, y_train, y_test = splitter_v2(variante, i)
            best_max_depth = Grid_Search_CV(X_train, y_train)
            fila.append(DTR_v2(X_train, X_test, y_train, y_test, best_max_depth))

        # Collect the pair of R^2 scores for this target.
        R2_global.append(fila)

    return pd.DataFrame(R2_global,columns=['Original','Imputado'])

In [12]:
def iter_R2_Splitter_v3(tab1,tab2):
    """Score every column of ``tab2`` as a target predicted from ``tab1``.

    For each column index of ``tab2`` two models are evaluated: one on the
    zero-filled tables (``zeros``) and one on the mean-imputed tables
    (``imputador``). Each evaluation splits the data, tunes ``max_depth`` by
    grid search and scores a decision tree on the held-out part.

    Parameters
    ----------
    tab1 : 2-D table (DataFrame) providing the features.
    tab2 : 2-D table (DataFrame) providing the targets, one per column.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``tab2`` with the R^2 scores in columns
        'Original' (zero-filled) and 'Imputado' (mean-imputed).

    Raises
    ------
    ValueError
        If preprocessing changes the number of columns of ``tab2``, because
        the target indices would no longer refer to the same indicators.
    """
    n_objetivos = np.shape(tab2)[1]

    # The preprocessed tables are independent of the target index, so build
    # them once instead of once per iteration.
    variantes = [
        (np.array(zeros(tab1)), np.array(zeros(tab2))),
        (np.array(imputador(tab1)), np.array(imputador(tab2))),
    ]

    # Fail before the expensive searches rather than with an IndexError
    # part-way through the loop.
    for nombre, (_, objetivos) in zip(('zeros', 'imputador'), variantes):
        if np.shape(objetivos)[1] != n_objetivos:
            raise ValueError(
                "%s() changed the number of target columns (%d -> %d); "
                "target indices would be misaligned"
                % (nombre, n_objetivos, np.shape(objetivos)[1]))

    R2_global = list()
    for i in range(n_objetivos):
        fila = []
        # Order matters for the output columns: zero-filled first, imputed second.
        for base, objetivos in variantes:
            X_train, X_test, y_train, y_test = splitter_v3(base, objetivos, i)
            best_max_depth = Grid_Search_CV(X_train, y_train)
            fila.append(DTR_v2(X_train, X_test, y_train, y_test, best_max_depth))

        # Collect the pair of R^2 scores for this target.
        R2_global.append(fila)

    return pd.DataFrame(R2_global,columns=['Original','Imputado'])

In [13]:
def porcentaje(valor):
    """Return the fraction of rows whose first two values both exceed 0.9.

    Parameters
    ----------
    valor : 2-D table (DataFrame, array or list of rows) whose first two
        columns hold the scores to check.

    Returns
    -------
    float
        Number of rows with both scores > 0.9 divided by the number of rows,
        or 0.0 when ``valor`` has no rows.
    """
    puntuaciones = np.asarray(valor, dtype=float)
    total = len(puntuaciones)
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0

    ambas_altas = (puntuaciones[:, 0] > 0.9) & (puntuaciones[:, 1] > 0.9)
    return float(np.count_nonzero(ambas_altas)) / total

In [19]:
def tabla_base(paises,years,indicadores,datos):
    """Pivot the long-format data into a (country, year) x indicator table.

    Parameters
    ----------
    paises : list of 'CountryName' values to select.
    years : sequence of 'Year' values to select (it is passed reversed to the
        row selection).
    indicadores : list of 'IndicatorCode' values to keep as columns.
    datos : long-format DataFrame with 'CountryName', 'Year', 'IndicatorCode'
        and 'Value' columns.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by (CountryName, Year), one column per requested
        indicator, cells holding 'Value'.
    """
    pivote = datos.pivot_table(
        values='Value',
        index=['CountryName', 'Year'],
        columns=['IndicatorCode'],
    )
    filas = (paises, years[::-1])
    return pivote.loc[filas, indicadores]

In [20]:
def iterador_global(datos,paises,years,indicadores_1,indicadores_2):
    """Score how well one indicator set predicts each indicator of another.

    Builds one pivot table per indicator list with ``tabla_base`` and passes
    both to ``iter_R2_Splitter_v3``: ``indicadores_1`` supplies the features,
    ``indicadores_2`` the targets.

    Returns
    -------
    The DataFrame of R^2 scores returned by ``iter_R2_Splitter_v3``.
    """
    tabla_caracteristicas, tabla_objetivos = (
        tabla_base(paises, years, indicadores, datos)
        for indicadores in (indicadores_1, indicadores_2)
    )
    return iter_R2_Splitter_v3(tabla_caracteristicas, tabla_objetivos)

In [21]:
# Countries to analyse, spelled as they appear in the 'CountryName' column.
paises = [
    'Antigua and Barbuda', 'Argentina', 'Aruba', 'Bahamas, The', 'Barbados',
    'Belize', 'Bolivia', 'Brazil', 'Cayman Islands', 'Chile', 'Colombia',
    'Costa Rica', 'Cuba', 'Curacao', 'Dominica', 'Dominican Republic',
    'Ecuador', 'El Salvador', 'Grenada', 'Guatemala', 'Guyana', 'Haiti',
    'Honduras', 'Jamaica', 'Mexico', 'Nicaragua', 'Panama', 'Paraguay',
    'Peru', 'Puerto Rico', 'Sint Maarten (Dutch part)', 'St. Kitts and Nevis',
    'St. Lucia', 'St. Martin (French part)', 'St. Vincent and the Grenadines',
    'Suriname', 'Trinidad and Tobago', 'Turks and Caicos Islands', 'Uruguay',
    'Venezuela, RB', 'Virgin Islands (U.S.)',
]

# Years 2008 through 2012, both inclusive.
years = range(2008, 2012 + 1)

# Indicator codes of the 'GDP' group.
gdp = [
    'NY.GDP.MKTP.KN', 'NY.GDP.MKTP.CN', 'NY.GDP.MKTP.CD', 'NY.GDP.MKTP.KD',
    'NY.GDP.DEFL.ZS', 'NY.GDP.MKTP.KD.ZG', 'NY.GDP.PCAP.KD', 'NY.GDP.PCAP.KN',
    'NY.GDP.PCAP.CN', 'NY.GDP.PCAP.CD', 'NY.GDP.PCAP.KD.ZG',
    'NY.GDP.PCAP.PP.KD', 'NY.GDP.PCAP.PP.CD', 'NY.GDP.MKTP.PP.KD',
    'NY.GDP.MKTP.PP.CD',
]

# Indicator codes of the 'Gross' group.
gross = [
    'NE.GDI.TOTL.ZS', 'NE.GDI.TOTL.KD.ZG', 'NE.GDI.TOTL.KD', 'NE.GDI.TOTL.KN',
    'NE.GDI.TOTL.CN', 'NE.GDI.TOTL.CD', 'NY.GDY.TOTL.KN', 'NY.GDS.TOTL.ZS',
    'NY.GDS.TOTL.CN', 'NY.GDS.TOTL.CD', 'NE.GDI.FTOT.ZS', 'NE.GDI.FTOT.KD.ZG',
    'NE.GDI.FTOT.KD', 'NE.GDI.FTOT.KN', 'NE.GDI.FTOT.CN', 'NE.GDI.FTOT.CD',
    'NE.GDI.FPRV.ZS', 'NE.GDI.FPRV.CN', 'NE.DAB.TOTL.ZS', 'NE.DAB.TOTL.KD',
    'NE.DAB.TOTL.KN', 'NE.DAB.TOTL.CN', 'NE.DAB.TOTL.CD', 'NE.DAB.DEFL.ZS',
    'NY.GNS.ICTR.ZS', 'NY.GNS.ICTR.GN.ZS', 'NY.GNS.ICTR.CN', 'NY.GNS.ICTR.CD',
    'NY.GDP.FCST.KD', 'NY.GDP.FCST.KN', 'NY.GDP.FCST.CN', 'NY.GDP.FCST.CD',
]

# Indicator codes of the 'GNI' group.
gni = [
    'NY.GNP.MKTP.KD', 'NY.GNP.MKTP.KN', 'NY.GNP.MKTP.CN', 'NY.GNP.MKTP.CD',
    'NY.GNP.MKTP.KD.ZG', 'NY.GNP.PCAP.KD', 'NY.GNP.PCAP.KN', 'NY.GNP.PCAP.CN',
    'NY.GNP.PCAP.KD.ZG', 'NY.GNP.PCAP.CD', 'NY.GNP.PCAP.PP.KD',
    'NY.GNP.PCAP.PP.CD', 'NY.GNP.ATLS.CD', 'NY.GNP.MKTP.PP.KD',
    'NY.GNP.MKTP.PP.CD',
]

# Display names and indicator lists of the groups, in matching order.
conjunto_nombre = ['GDP', 'Gross', 'GNI']
conjunto = [gdp, gross, gni]

In [22]:
# Display the pivot table for the GDP indicator group: rows are
# (CountryName, Year) pairs, columns are the indicator codes in `gdp`.
tabla_base(paises,years,gdp,datos)

Unnamed: 0_level_0,IndicatorCode,NY.GDP.MKTP.KN,NY.GDP.MKTP.CN,NY.GDP.MKTP.CD,NY.GDP.MKTP.KD,NY.GDP.DEFL.ZS,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.KD,NY.GDP.PCAP.KN,NY.GDP.PCAP.CN,NY.GDP.PCAP.CD,NY.GDP.PCAP.KD.ZG,NY.GDP.PCAP.PP.KD,NY.GDP.PCAP.PP.CD,NY.GDP.MKTP.PP.KD,NY.GDP.MKTP.PP.CD
CountryName,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Antigua and Barbuda,2008,3.358406e+09,3.637845e+09,1.347350e+09,1.239080e+09,108.320559,0.071111,14517.635341,39348.639719,42622.666667,15786.172840,-1.046262,25735.714754,24723.111892,2.196543e+09,2.110118e+09
Antigua and Barbuda,2009,2.954188e+09,3.257308e+09,1.206410e+09,1.089944e+09,110.260684,-12.036015,12629.713786,34231.611819,37744.009270,13979.262693,-13.004332,22388.956865,21671.375749,1.932167e+09,1.870240e+09
Antigua and Barbuda,2010,2.743171e+09,3.065955e+09,1.135539e+09,1.012090e+09,111.766847,-7.142991,11602.142227,31446.478970,35146.738046,13017.310387,-8.136143,20567.359344,20151.309866,1.794152e+09,1.757859e+09
Antigua and Barbuda,2011,2.693974e+09,3.050780e+09,1.129918e+09,9.939385e+08,113.244595,-1.793436,11275.280156,30560.551094,34608.172248,12817.841573,-2.817256,19987.924139,19987.924139,1.761975e+09,1.761975e+09
Antigua and Barbuda,2012,2.802259e+09,3.252725e+09,1.204713e+09,1.033890e+09,116.075112,4.019542,11607.745344,31461.665675,36519.163794,13525.616220,2.948620,20577.292107,20946.700835,1.832799e+09,1.865702e+09
Argentina,2008,7.058647e+11,1.283906e+12,4.037820e+11,2.663823e+11,181.891166,3.074946,6596.583217,17479.748134,31794.117656,9999.093517,2.023423,,,,
Argentina,2009,7.062178e+11,1.411526e+12,3.766279e+11,2.665156e+11,199.871182,0.050024,6532.461614,17309.837526,34597.376834,9231.382900,-0.972043,,,,
Argentina,2010,7.729666e+11,1.810830e+12,4.616402e+11,2.917055e+11,234.270156,9.451578,7076.301272,18750.913904,43927.795332,11198.642567,8.325187,,,,
Argentina,2011,8.377910e+11,2.312009e+12,5.578902e+11,3.161692e+11,275.964823,8.386451,7590.074325,20112.319237,55502.926232,13392.916904,7.260475,,,,
Argentina,2012,8.445081e+11,2.765575e+12,6.043785e+11,3.187041e+11,327.477653,0.801760,7571.028495,20061.851245,65698.079692,14357.411589,-0.250931,,,,


In [17]:
start_time = time.time()

# Use the first indicator group as the feature set and score it against every
# group as target; each row is [base name, target name, fraction of targets
# whose two R^2 scores both exceed 0.9].
resultado = []
for i, indicadores_objetivo in enumerate(conjunto):
    resultado.append([
        conjunto_nombre[0],
        conjunto_nombre[i],
        porcentaje(iterador_global(datos, paises, years, conjunto[0], indicadores_objetivo)),
    ])

print("--- %s seconds ---" % (time.time() - start_time))
pd.DataFrame(resultado, columns=["Base","Target","% > 90%"])

--- 99.3480000496 seconds ---


Unnamed: 0,Base,Target,% > 90%
0,GDP,GDP,0.466667
1,GDP,Gross,0.5
2,GDP,GNI,0.266667


In [18]:
import plotly.plotly as py
import plotly.graph_objs as go

# Bar chart of the score per target group. Labels and values are read from the
# `resultado` rows ([base, target, score]) directly: converting the mixed
# str/float rows with np.array() would turn the scores into strings.
data = [go.Bar(
            x=[fila[1] for fila in resultado],
            y=[fila[2] for fila in resultado]
    )]

py.iplot(data, filename='GDP')

In [19]:
start_time = time.time()

# Score every indicator group (as features) against every group (as targets).
# Rows are appended base-major: all targets for the first base, then the next.
resultado = []
for i, indicadores_base in enumerate(conjunto):
    for j, indicadores_objetivo in enumerate(conjunto):
        resultado.append([
            conjunto_nombre[i],
            conjunto_nombre[j],
            porcentaje(iterador_global(datos, paises, years, indicadores_base, indicadores_objetivo)),
        ])

print("--- %s seconds ---" % (time.time() - start_time))
pd.DataFrame(resultado, columns=["Base","Target","% > 90%"])

--- 301.758999825 seconds ---


Unnamed: 0,Base,Target,% > 90%
0,GDP,GDP,0.066667
1,GDP,Gross,0.1875
2,GDP,GNI,0.266667
3,Gross,GDP,0.2
4,Gross,Gross,0.21875
5,Gross,GNI,0.266667
6,GNI,GDP,0.4
7,GNI,Gross,0.375
8,GNI,GNI,0.333333


In [20]:
import plotly.plotly as py
import plotly.graph_objs as go

# Grouped bar chart: one trace per target group, one bar per base group.
# `resultado` is expected to hold one row per (base, target) pair in
# base-major order, so the row for a pair is base * n_conjuntos + target.
# Scores are read from the rows directly: converting the mixed str/float rows
# with np.array() would turn them into strings.
n_conjuntos = len(conjunto_nombre)

data = [
    go.Bar(
        x=conjunto_nombre,
        y=[resultado[base * n_conjuntos + objetivo][2] for base in range(n_conjuntos)],
        name=conjunto_nombre[objetivo])
    for objetivo in range(n_conjuntos)
]
layout = go.Layout(barmode='group')

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Resultado 2')

In [21]:
# Heatmap of the score for every (base, target) pair. Values are read from the
# `resultado` rows ([base, target, score]) directly: converting the mixed
# str/float rows with np.array() would turn the scores into strings.
data = [
        go.Heatmap(
            z=[fila[2] for fila in resultado],
            x=[fila[1] for fila in resultado],
            y=[fila[0] for fila in resultado],
            colorscale='Viridis'
        )
    ]
py.iplot(data, filename='Resultado Global')