## World Development Indicators - Analytics

In [2]:
import pandas as pd
import numpy as np
import time

datos = pd.read_csv("suramerica.csv").drop('Unnamed: 0', 1)
paises = datos['CountryCode'].drop_duplicates().values.tolist()

preprocessing = 'zeros'
search = 'original'
year_init = 2012
year_range = 5
look_back = 3
years = range(year_init-year_range+1,year_init+1)[::-1]

agricultura     = open("Indicadores/iagricultura.txt").read().split(',')
ambiente        = open("Indicadores/iambiente.txt").read().split(',')
ayuda           = open("Indicadores/iayuda.txt").read().split(',')
ciencia         = open("Indicadores/iciencia.txt").read().split(',')
clima           = open("Indicadores/iclima.txt").read().split(',')
comercio        = open("Indicadores/icomercio.txt").read().split(',')
deuda           = open("Indicadores/ideuda.txt").read().split(',')
economia        = open("Indicadores/ieconomia.txt").read().split(',')
educacion       = open("Indicadores/ieducacion.txt").read().split(',')
energia         = open("Indicadores/ienergia.txt").read().split(',')
finanzas        = open("Indicadores/ifinanzas.txt").read().split(',')
genero          = open("Indicadores/igenero.txt").read().split(',')
infraestructura = open("Indicadores/iinfraestructura.txt").read().split(',')
pobreza         = open("Indicadores/ipobreza.txt").read().split(',')
privado         = open("Indicadores/iprivado.txt").read().split(',')
publico         = open("Indicadores/ipublico.txt").read().split(',')
salud           = open("Indicadores/isalud.txt").read().split(',')
social          = open("Indicadores/isocial.txt").read().split(',')
trabajo         = open("Indicadores/itrabajo.txt").read().split(',')
urbano          = open("Indicadores/iurbano.txt").read().split(',')

conjunto_nombre = ['Agricultura','Economia','Educacion','Salud','Deuda']
conjunto = [agricultura,economia,educacion,salud,deuda]
datos.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92


In [8]:
def r2(y_true, y_predict):
    from sklearn.metrics import r2_score
    return r2_score(y_true, y_predict)

In [9]:
def tabla_base(indicadores):
    tab = pd.DataFrame.pivot_table(datos, values='Value', index=['CountryCode', 'Year'], columns=['IndicatorCode']).loc[(paises,years),indicadores].sortlevel(["CountryCode","Year"], ascending=[True,False])
    return tab

In [10]:
def tabla_2_base(indicadores,look_back):
    temp_table = []
    for i in range(look_back):      
        temp_years = range(year_init-year_range-i+1,year_init-i+1)[::-1]
        temp_table.append(pd.DataFrame.pivot_table(datos, values='Value', index=['CountryCode', 'Year'], columns=['IndicatorCode']).loc[(paises,temp_years),indicadores].sortlevel(["CountryCode","Year"], ascending=[True,False]))
    return pd.DataFrame(np.column_stack(temp_table))

In [11]:
def estimator_Universal(estimador, X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.svm import SVR
    
    if(search=='original'):
        if(estimador=='DTR'):
            estimator = DecisionTreeRegressor()
        if(estimador=='RFR'):
            estimator = RandomForestRegressor(n_jobs=-1)
        if(estimador=='SVR'):
            estimator = SVR()
    else:
        best_params = SearchCV_Universal(estimador, search, X_train, y_train)       

        if(estimador=='DTR'):
            estimator = DecisionTreeRegressor().set_params(**best_params)
        if(estimador=='RFR'):
            estimator = RandomForestRegressor(n_jobs=-1).set_params(**best_params)
        if(estimador=='SVR'):
            estimator = SVR().set_params(**best_params)
        
    estimator.fit(X_train,y_train)
    y_predict = estimator.predict(X_test)
        
    return r2(y_test,y_predict)

In [12]:
def Preprocess(tab1,tab2,y_indicator):
    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
       
    #Fusionamos la tabla_1 y el indicador y de la tabla_2
    tab_fusion = np.column_stack((np.array(tab1)[:,:],np.array(tab2)[:,y_indicator]))    
    
    #Eliminamos las columnas de NaN descartando Indicadores que no tienen regristros para ningún pais y año deseados
    df = pd.DataFrame(tab_fusion)
    df = df.dropna(how='all',axis=1)
    
    #Eliminamos las filas Si el valor a predecir es NaN
    df = df.dropna(subset=[df.iloc[:,-1].name])
    
    if(preprocessing=='zeros'):
        #Imputamos los NaN por Zero
        df = df.fillna(0)      
        
    if(preprocessing=='imput'):
        #Imputamos los NaN por la media de cada Indicador respectivamente      
        impute=Imputer(missing_values="NaN",strategy='mean',axis=0)
        impute.fit(df)
        df = pd.DataFrame(impute.transform(df))
        
    # Asignamos X e y, eliminando los indicadores que se correlacionen más (coeficiente > 0.7)
    df_ = df.corr()
    X = df.drop(df_[df_.iloc[:,-1] > 0.7].index, axis=1)
    y = df.iloc[:,-1]

    # Normalizamos los datos
    sc = StandardScaler()
    df_norm = sc.fit_transform(np.column_stack([X,y]))
    X = df_norm[:,:-1]
    y = df_norm[:,-1]
    
    # Separamos Train y Test respectivamente para X e y
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    return X_train, X_test, y_train, y_test

In [13]:
def iter_Splitter_Optimus(tab1,tab2): 
    R2_global = list()
    for i in range(0,np.shape(tab2.dropna(how='all',axis=1))[1]):

        X_train, X_test, y_train, y_test = Preprocess(tab1, tab2, i)

        result = estimator_Universal('DTR', X_train, X_test, y_train, y_test)

        if(result < 0.9): 
            temp = estimator_Universal('SVR', X_train, X_test, y_train, y_test)
            if(temp < 0.9): 
                temp2 = estimator_Universal('RFR', X_train, X_test, y_train, y_test)
                if (temp2 > temp): 
                    result = temp2
            if(temp > result): 
                result = temp

        R2_global.append(result)
    return R2_global

In [14]:
def SearchCV_Universal(estimador, search, X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import ShuffleSplit
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
        
    if(estimador=='DTR'):
        estimator  = DecisionTreeRegressor()
        param_grid = {  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                        'splitter': ['best', 'random']
                     }
        
    if(estimador=='RFR'):
        estimator  = RandomForestRegressor()       
        param_grid = { 
                        "n_estimators"      : [10,20,30,40],
                        "max_features"      : ["auto", "sqrt", "log2"],
                        "min_samples_split" : [2,4,8],
                        "bootstrap": [True, False],
                     }
    if(estimador=='SVR'):
        estimator  = SVR()
        param_grid ={
                        'gamma'  : ['auto', 1e-3, 1e-4],
                        'C'      : [1, 10, 100, 1000],            
                    }

    if (search=='random'):
        grid = RandomizedSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
    if (search=='grid'):
        grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
        
    grid.fit(X_train, y_train)
    
    return grid.best_params_

In [15]:
def filtro(indicadores_1,indicadores_2):
    if(indicadores_1!=indicadores_2):
        df = pd.DataFrame(indicadores_2)
        indicadores_2 = np.array(df.loc[~df.ix[:,0].isin(indicadores_1)]).flatten()
    return indicadores_2

In [16]:
def iterador_global(indicadores_1,indicadores_2,look_back): 
    
    tab1 = tabla_2_base(indicadores_1,look_back)
    tab2 = tabla_base(filtro(indicadores_1,indicadores_2))
    
    return iter_Splitter_Optimus(tab1,tab2)

In [17]:
tabla_base(filtro(agricultura,ayuda)).fillna(0)

Unnamed: 0_level_0,IndicatorCode,DT.DOD.MDRI.CD,BX.GRT.EXTA.CD.WD,DT.DIS.IDAG.CD,SH.STA.ACSN,SH.TBS.INCD,SI.DST.FRST.20,SH.STA.MMRT,SH.DYN.MORT,DT.ODA.ODAT.GN.ZS,DT.ODA.ODAT.XP.ZS,...,DT.NFL.WFPG.CD,DT.NFL.WHOL.CD,SI.POV.DDAY,SH.STA.ANVC.ZS,SH.DYN.AIDS.ZS,SH.STA.STNT.ZS,SE.PRM.CMPT.ZS,SE.ENR.PRSC.FM.ZS,BX.GRT.TECH.CD.WD,SL.EMP.VULN.ZS
CountryCode,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ABW,2012,0.0,0.0,0.0,97.7,12.0,0.00,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,1.02118,0.0,0.000000
ABW,2011,0.0,0.0,0.0,97.7,13.0,0.00,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.000000,0.000000,1.00870,0.0,0.000000
ABW,2010,0.0,0.0,0.0,97.7,13.0,0.00,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.000000,94.722778,1.01434,0.0,4.100000
ABW,2009,0.0,0.0,0.0,97.8,13.0,0.00,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.000000,96.306068,1.00407,0.0,0.000000
ABW,2008,0.0,0.0,0.0,97.8,13.0,0.00,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.00,0.0,0.0,0.000000,95.133034,1.00573,0.0,0.000000
ARG,2012,0.0,41900000.0,0.0,95.5,24.0,4.73,55.0,13.8,0.030237,0.000000,...,0.0,0.0,1.63,98.1,0.5,0.000000,110.209846,1.03380,56350000.0,19.000000
ARG,2011,0.0,61600000.0,0.0,95.2,24.0,4.58,56.0,14.2,0.015941,0.000000,...,0.0,0.0,1.53,0.0,0.5,0.000000,109.337738,1.03902,55570000.0,18.600000
ARG,2010,0.0,30260000.0,0.0,94.9,25.0,4.36,58.0,14.6,0.026893,0.000000,...,0.0,0.0,2.05,0.0,0.4,0.000000,107.305138,1.03808,75320000.0,19.000000
ARG,2009,0.0,76170000.0,0.0,94.5,25.0,3.93,62.0,15.0,0.034461,0.000000,...,0.0,0.0,3.05,0.0,0.4,0.000000,106.029030,1.04382,71060000.0,19.600000
ARG,2008,0.0,76230000.0,0.0,94.2,26.0,3.93,57.0,15.4,0.032949,0.000000,...,0.0,0.0,2.97,0.0,0.4,0.000000,105.821701,1.05030,63830000.0,19.100000


In [11]:
def porcent_result_reg(df,indicadores_1,indicadores_2):
    df_temp = df
    x = 0.0
    for i in range(len(df_temp)):
        if(df_temp[i] > 0.9):
            x=x+1
    porcent = x/len(df)
    df = pd.DataFrame(df)
    df[df < 0] = 0.0
    result = np.array(df.values)   
    reg = len(filtro(indicadores_1,indicadores_2))
    return porcent,result,reg

In [12]:
def resultados(df,indicador_1,indicador_2,look_back):
    #Buscamos Resultados en base al conjunto de indicadores Base, Target y Look Back
    temp = df[(df['Base']==indicador_1)&(df['Target']==indicador_2)&(df['Look Back'] == look_back)]['Results'].values[0]
        
    # Códigos de los indicadores con registros   
    for i in range(len(conjunto_nombre)):
        
        if indicador_1 == conjunto_nombre[i]:
            indicador1 = conjunto[i]
        if indicador_2 == conjunto_nombre[i]:
            indicador2 = conjunto[i]

            df = pd.DataFrame(indicador2)
            # Filtro indicadores repetidos
            if(indicador1!=indicador2):
                indicador2 = np.array(df.loc[~df.ix[:,0].isin(indicador1)]).flatten()
            df_zeros = tabla_base(indicador2).dropna(how='all',axis=1)
                
    icodes = df_zeros.columns.values

    # Buscamos Nombres de los Códigos de los Indicadores
    nombres = pd.DataFrame(datos).loc[:,['IndicatorName','IndicatorCode']].set_index('IndicatorCode').loc[icodes].drop_duplicates()

    # Concatenamos Resultados y Nombres, Reindexando en base a los nombres
    df_results = pd.DataFrame(np.column_stack([nombres,temp]),columns=['Indicadores','Resultados'])
    
    return df_results

In [13]:
def buscar_mejores(df):
    return df[df.Resultados > 0.9].reset_index(drop=True)

## Iteramos el Primer Conjunto de Indicadores

In [80]:
start_time = time.time()
resultado_base = []

for i in range(len(conjunto)):
    for j in range(1,look_back+1):
        start_time2 = time.time()
        porcent , results, reg = porcent_result_reg(iterador_global(conjunto[0],conjunto[i],j),conjunto[0],conjunto[i])
        resultado_base.append([conjunto_nombre[0],conjunto_nombre[i],j,reg,porcent,"%s" % (time.time() - start_time2),results])
        pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","Reg","%","Time","Results"]).to_csv('log_base.csv')

df = pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","Reg","%","Time","Results"])
print("--- %s seconds ---" % (time.time() - start_time))
df

--- 601.968616009 seconds ---


Unnamed: 0,Base,Target,Look Back,Reg,%,Time,Results
0,Agricultura,Agricultura,1,48,0.522727,10.4963088036,"[[0.418630915831], [0.726742204529], [0.882763..."
1,Agricultura,Agricultura,2,48,0.409091,12.8444778919,"[[0.610281631514], [0.810631913409], [0.738249..."
2,Agricultura,Agricultura,3,48,0.295455,15.1500520706,"[[0.0], [0.555375447438], [0.759861194799], [0..."
3,Agricultura,Economia,1,258,0.392,52.7511818409,"[[0.704464021134], [0.44736097896], [0.0995583..."
4,Agricultura,Economia,2,258,0.36,59.086963892,"[[0.965882751386], [0.909133070199], [0.338026..."
5,Agricultura,Economia,3,258,0.336,63.7134780884,"[[0.942939155397], [0.911842797573], [0.0], [0..."
6,Agricultura,Educacion,1,168,0.310345,29.6018359661,"[[0.771568800354], [0.597323996936], [0.683710..."
7,Agricultura,Educacion,2,168,0.258621,32.9971959591,"[[0.582445768712], [0.455007874234], [0.471413..."
8,Agricultura,Educacion,3,168,0.137931,39.021214962,"[[0.607801873365], [0.579282835964], [0.812200..."
9,Agricultura,Salud,1,150,0.328,30.8679280281,"[[0.117627195004], [0.944941558671], [0.829862..."


In [81]:
pd.DataFrame.pivot_table(df, index=["Base","Target","Reg"], columns=["Look Back"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Reg,Reg,Reg,%,%,%
Unnamed: 0_level_1,Look Back,1,2,3,1,2,3
Base,Target,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Agricultura,Agricultura,48,48,48,0.522727,0.409091,0.295455
Agricultura,Deuda,229,229,229,0.25,0.223214,0.272321
Agricultura,Economia,258,258,258,0.392,0.36,0.336
Agricultura,Educacion,168,168,168,0.310345,0.258621,0.137931
Agricultura,Salud,150,150,150,0.328,0.32,0.288


In [82]:
buscar_mejores(resultados(df,'Agricultura','Agricultura',1))

Unnamed: 0,Indicadores,Resultados
0,Agricultural land (% of land area),0.932408
1,Agricultural land (sq. km),0.98259
2,Agricultural raw materials exports (% of merch...,0.900358
3,Agriculture value added per worker (constant 2...,0.931916
4,"Agriculture, value added (% of GDP)",0.915322
5,"Agriculture, value added (current US$)",0.933682
6,Arable land (% of land area),0.979884
7,Arable land (hectares per person),0.988951
8,Arable land (hectares),0.995271
9,Cereal yield (kg per hectare),0.975292


## Iteramos todos los Conjuntos de Indicadores

In [None]:
start_time = time.time()
resultado_global = []

for i in range(len(conjunto)):
    for j in range(len(conjunto)):
        for k in range(1,look_back+1):
            start_time2 = time.time()
            porcent , results, reg = porcent_result_reg(iterador_global(conjunto[i],conjunto[j],k),conjunto[i],conjunto[j])
            resultado_global.append([conjunto_nombre[i],conjunto_nombre[j],k,reg,porcent,"%s" % (time.time() - start_time2),results])
            pd.DataFrame(resultado_global, columns=["Base","Target","Look Back","Reg","%","Time","Results"]).to_csv('log_global.csv')


df_global = pd.DataFrame(resultado_global, columns=["Base","Target","Look Back","Reg","%","Time","Results"])
print("--- %s seconds ---" % (time.time() - start_time))
df_global

## Resultados

In [None]:
pd.DataFrame.pivot_table(df_global, index=["Base","Target","Reg"], columns=["Look Back"])