## World Development Indicators - Analytics

In [1]:
import pandas as pd
import numpy as np
import time

datos = pd.read_csv("suramerica.csv").drop('Unnamed: 0', 1)
paises = datos['CountryCode'].drop_duplicates().values.tolist()

preprocessing = 'imput'
search = 'original'
year_init = 2012
year_range = 5
look_back = 3
years = range(year_init-year_range+1,year_init+1)[::-1]

agricultura = open("Indicadores/iagricultura.txt").read().split(',')
economia    = open("Indicadores/ieconomia.txt").read().split(',')
educacion   = open("Indicadores/ieducacion.txt").read().split(',')
salud       = open("Indicadores/isalud.txt").read().split(',')
deuda       = open("Indicadores/ideuda.txt").read().split(',')

conjunto_nombre = ['Agricultura','Economia','Educacion','Salud','Deuda']
conjunto = [agricultura,economia,educacion,salud,deuda]
datos

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,1.261440e+02
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,8.823712e+01
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958e+00
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,8.045716e+01
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,3.292000e+01
5,Antigua and Barbuda,ATG,CO2 emissions (kt),EN.ATM.CO2E.KT,1960,3.667000e+01
6,Antigua and Barbuda,ATG,CO2 emissions (metric tons per capita),EN.ATM.CO2E.PC,1960,6.706169e-01
7,Antigua and Barbuda,ATG,CO2 emissions from gaseous fuel consumption (%...,EN.ATM.CO2E.GF.ZS,1960,0.000000e+00
8,Antigua and Barbuda,ATG,CO2 emissions from gaseous fuel consumption (kt),EN.ATM.CO2E.GF.KT,1960,0.000000e+00
9,Antigua and Barbuda,ATG,CO2 emissions from liquid fuel consumption (% ...,EN.ATM.CO2E.LF.ZS,1960,1.000000e+02


In [2]:
def r2(y_true, y_predict):
    from sklearn.metrics import r2_score
    return r2_score(y_true, y_predict)

In [3]:
def tabla_base(indicadores):
    tab = pd.DataFrame.pivot_table(datos, values='Value', index=['CountryCode', 'Year'], columns=['IndicatorCode']).loc[(paises,years),indicadores].sortlevel(["CountryCode","Year"], ascending=[True,False])
    return tab

In [4]:
def tabla_2_base(indicadores,look_back):
    temp_table = []
    for i in range(look_back):      
        temp_years = range(year_init-year_range-i+1,year_init-i+1)[::-1]
        temp_table.append(pd.DataFrame.pivot_table(datos, values='Value', index=['CountryCode', 'Year'], columns=['IndicatorCode']).loc[(paises,temp_years),indicadores].sortlevel(["CountryCode","Year"], ascending=[True,False]))
    return pd.DataFrame(np.column_stack(temp_table))

In [5]:
def estimator_Universal(estimador, X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.svm import SVR
    
    if(search=='original'):
        if(estimador=='DTR'):
            estimator = DecisionTreeRegressor()
        if(estimador=='RFR'):
            estimator = RandomForestRegressor(n_jobs=-1)
        if(estimador=='SVR'):
            estimator = SVR()
    else:
        best_params = SearchCV_Universal(estimador, search, X_train, y_train)       

        if(estimador=='DTR'):
            estimator = DecisionTreeRegressor().set_params(**best_params)
        if(estimador=='RFR'):
            estimator = RandomForestRegressor(n_jobs=-1).set_params(**best_params)
        if(estimador=='SVR'):
            estimator = SVR().set_params(**best_params)
        
    estimator.fit(X_train,y_train)
    y_predict = estimator.predict(X_test)
        
    return r2(y_test,y_predict)

In [6]:
def Preprocess(tab1,tab2,y_indicator):
    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
       
    #Fusionamos la tabla_1 y el indicador y de la tabla_2
    tab_fusion = np.column_stack((np.array(tab1)[:,:],np.array(tab2)[:,y_indicator]))    
    
    #Eliminamos las columnas de NaN descartando Indicadores que no tienen regristros para ningún pais y año deseados
    df = pd.DataFrame(tab_fusion)
    df = df.dropna(how='all',axis=1)
    
    #Eliminamos las filas Si el valor a predecir es NaN
    df = df.dropna(subset=[df.iloc[:,-1].name])
    
    if(preprocessing=='zeros'):
        #Imputamos los NaN por Zero
        df = df.fillna(0)      
        
    if(preprocessing=='imput'):
        #Imputamos los NaN por la media de cada Indicador respectivamente      
        impute=Imputer(missing_values="NaN",strategy='mean',axis=0)
        impute.fit(df)
        df = pd.DataFrame(impute.transform(df))
        
    # Asignamos X e y, eliminando los indicadores que se correlacionen más (coeficiente > 0.7)
    df_ = df.corr()
    X = df.drop(df_[df_.iloc[:,-1] > 0.7].index, axis=1)
    y = df.iloc[:,-1]

    # Normalizamos los datos
    sc = StandardScaler()
    df_norm = sc.fit_transform(np.column_stack([X,y]))
    X = df_norm[:,:-1]
    y = df_norm[:,-1]
    
    # Separamos Train y Test respectivamente para X e y
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    return X_train, X_test, y_train, y_test

In [7]:
def iter_Splitter_Optimus(tab1,tab2): 
    R2_global = list()
    for i in range(0,np.shape(tab2.dropna(how='all',axis=1))[1]):

        X_train, X_test, y_train, y_test = Preprocess(tab1, tab2, i)

        result = estimator_Universal('DTR', X_train, X_test, y_train, y_test)

        if(result < 0.9): 
            temp = estimator_Universal('SVR', X_train, X_test, y_train, y_test)
            if(temp < 0.9): 
                temp2 = estimator_Universal('RFR', X_train, X_test, y_train, y_test)
                if (temp2 > temp): 
                    result = temp2
            if(temp > result): 
                result = temp

        R2_global.append(result)
    return R2_global

In [8]:
def SearchCV_Universal(estimador, search, X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import ShuffleSplit
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
        
    if(estimador=='DTR'):
        estimator  = DecisionTreeRegressor()
        param_grid = {  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                        'splitter': ['best', 'random']
                     }
        
    if(estimador=='RFR'):
        estimator  = RandomForestRegressor()       
        param_grid = { 
                        "n_estimators"      : [10,20,30,40],
                        "max_features"      : ["auto", "sqrt", "log2"],
                        "min_samples_split" : [2,4,8],
                        "bootstrap": [True, False],
                     }
    if(estimador=='SVR'):
        estimator  = SVR()
        param_grid ={
                        'gamma'  : ['auto', 1e-3, 1e-4],
                        'C'      : [1, 10, 100, 1000],            
                    }

    if (search=='random'):
        grid = RandomizedSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
    if (search=='grid'):
        grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
        
    grid.fit(X_train, y_train)
    
    return grid.best_params_

In [9]:
def iterador_global(indicadores_1,indicadores_2,look_back): 
    
    if(indicadores_1!=indicadores_2):
        df = pd.DataFrame(indicadores_2)
        indicadores_2 = np.array(df.loc[~df.ix[:,0].isin(indicadores_1)]).flatten()
    
    tab1 = tabla_2_base(indicadores_1,look_back)
    tab2 = tabla_base(indicadores_2)
    
    return iter_Splitter_Optimus(tab1,tab2)

In [10]:
def porcent_result(df):
    df_temp = df
    x = 0.0
    for i in range(len(df_temp)):
        if(df_temp[i] > 0.9):
            x=x+1
    porcentaje = x/len(df)
    df = pd.DataFrame(df)
    df[df < 0] = 0.0
    return porcentaje , np.array(df.values)

In [37]:
def resultados(df,indicador_1,indicador_2,look_back):
    #Buscamos Resultados en base al conjunto de indicadores Base, Target y Look Back
    temp = df[(df['Base']==indicador_1)&(df['Target']==indicador_2)&(df['Look Back'] == look_back)]['Results'].values[0]
        
    # Códigos de los indicadores con registros   
    for i in range(len(conjunto_nombre)):
        
        if indicador_1 == conjunto_nombre[i]:
            indicador1 = conjunto[i]
        if indicador_2 == conjunto_nombre[i]:
            indicador2 = conjunto[i]

            df = pd.DataFrame(indicador2)
            # Filtro indicadores repetidos
            if(indicador1!=indicador2):
                indicador2 = np.array(df.loc[~df.ix[:,0].isin(indicador1)]).flatten()
            df_zeros = tabla_base(indicador2).dropna(how='all',axis=1)
                
    icodes = df_zeros.columns.values

    # Buscamos Nombres de los Códigos de los Indicadores
    nombres = pd.DataFrame(datos).loc[:,['IndicatorName','IndicatorCode']].set_index('IndicatorCode').loc[icodes].drop_duplicates()

    # Concatenamos Resultados y Nombres, Reindexando en base a los nombres
    df_results = pd.DataFrame(np.column_stack([nombres,temp]),columns=['Indicadores','Resultados'])
    
    return df_results

In [12]:
def buscar_mejores(df):
    return df[df.Resultados > 0.9].reset_index(drop=True)

In [13]:
start_time = time.time()
resultado_base = []

for i in range(len(conjunto)):
    for j in range(1,look_back+1):
        start_time2 = time.time()
        porcent , results = porcent_result(iterador_global(conjunto[0],conjunto[i],j))
        resultado_base.append([conjunto_nombre[0],conjunto_nombre[i],j,porcent,"%s" % (time.time() - start_time2),results])
        pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","%","Time","Results"]).to_csv('log_base.csv')

df = pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","%","Time","Results"])
print("--- %s seconds ---" % (time.time() - start_time))
df

--- 595.413469076 seconds ---


Unnamed: 0,Base,Target,Look Back,%,Time,Results
0,Agricultura,Agricultura,1,0.5,12.7370700836,"[[0.778257898353], [0.788531807735], [0.830138..."
1,Agricultura,Agricultura,2,0.363636,12.958398819,"[[0.763216882171], [0.917272702807], [0.544577..."
2,Agricultura,Agricultura,3,0.363636,14.8255419731,"[[0.510805987847], [0.702524919596], [0.278586..."
3,Agricultura,Economia,1,0.452,49.0823287964,"[[0.932156510988], [0.312913377604], [0.213793..."
4,Agricultura,Economia,2,0.336,62.5354230404,"[[0.429723682669], [0.853496204628], [0.0], [0..."
5,Agricultura,Economia,3,0.316,62.4134879112,"[[0.374221497518], [0.766551384221], [0.0], [0..."
6,Agricultura,Educacion,1,0.353448,27.7205719948,"[[0.289217094789], [0.410625995757], [0.666316..."
7,Agricultura,Educacion,2,0.241379,33.6851248741,"[[0.272179227231], [0.291876473292], [0.580444..."
8,Agricultura,Educacion,3,0.224138,35.0912330151,"[[0.415860488719], [0.505179768898], [0.624702..."
9,Agricultura,Salud,1,0.336,29.749671936,"[[0.0], [0.934783035009], [0.999606070373], [0..."


In [14]:
pd.DataFrame.pivot_table(df, index=["Base","Target"], columns=["Look Back"])

Unnamed: 0_level_0,Unnamed: 1_level_0,%,%,%
Unnamed: 0_level_1,Look Back,1,2,3
Base,Target,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Agricultura,Agricultura,0.5,0.363636,0.363636
Agricultura,Deuda,0.232143,0.241071,0.209821
Agricultura,Economia,0.452,0.336,0.316
Agricultura,Educacion,0.353448,0.241379,0.224138
Agricultura,Salud,0.336,0.304,0.328


In [36]:
buscar_mejores(resultados(df,'Agricultura','Agricultura',1))

Unnamed: 0,Indicadores,Resultados
0,Agricultural land (% of land area),0.985735
1,Agricultural land (sq. km),0.956217
2,Agricultural raw materials exports (% of merch...,0.959808
3,Agriculture value added per worker (constant 2...,0.988348
4,"Agriculture, value added (% of GDP)",0.984947
5,Arable land (% of land area),0.988122
6,Arable land (hectares per person),0.973539
7,Arable land (hectares),0.999012
8,Cereal production (metric tons),0.945905
9,"Employment in agriculture, female (% of female...",0.941031


In [39]:
buscar_mejores(resultados(df,'Agricultura','Economia',1))

Unnamed: 0,Indicadores,Resultados
0,Adjusted net national income (annual % growth),0.932157
1,Adjusted net national income per capita (annua...,0.965199
2,Adjusted net national income per capita (const...,0.933097
3,"Adjusted net savings, excluding particulate em...",0.974459
4,"Adjusted net savings, excluding particulate em...",0.941369
5,Adjusted savings: consumption of fixed capital...,0.996294
6,Adjusted savings: education expenditure (% of ...,0.951438
7,Adjusted savings: education expenditure (curre...,0.936671
8,Adjusted savings: energy depletion (current US$),0.913275
9,Adjusted savings: gross savings (% of GNI),0.937959


In [None]:
start_time = time.time()
resultado_global = []

for i in range(len(conjunto)):
    for j in range(len(conjunto)):
        for k in range(1,look_back+1):
            start_time2 = time.time()
            porcent , results = porcent_result(iterador_global(conjunto[i],conjunto[j],k))
            resultado_global.append([conjunto_nombre[i],conjunto_nombre[j],k,porcent,"%s" % (time.time() - start_time2),results])
            pd.DataFrame(resultado_global, columns=["Base","Target","Look Back","%","Time","Results"]).to_csv('log_global.csv')

df_global = pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","%","Time","Results"])
print("--- %s seconds ---" % (time.time() - start_time))
df_global