## World Development Indicators - Analytics

In [6]:
import pandas as pd
import numpy as np
import time

!wget  https://www.dropbox.com/s/b0takaot9m3gdg8/suramerica_indicators.csv?dl=1 -O suramerica_indicators.csv

# Load the long-format indicator table and discard the 'Unnamed: 0' column.
# `axis` is passed by keyword: the positional form of DataFrame.drop was
# removed in pandas 2.0, while the keyword form works on old and new versions.
datos = pd.read_csv("suramerica_indicators.csv").drop('Unnamed: 0', axis=1)
# Unique country codes, in order of first appearance in the table.
paises = datos['CountryCode'].drop_duplicates().values.tolist()

# Run configuration, read as globals by the functions defined below.
preprocessing = 'imput'  # NaN handling in Preprocess: 'zeros' or 'imput' (column mean)
search = 'original'      # 'original' = default hyper-parameters; otherwise passed to SearchCV_Universal ('grid' / 'random')
year_init = 2012         # most recent year of the base window
year_range = 5           # number of consecutive years in a window
look_back = 3            # largest number of lagged windows tried by the driver loop
# Years of the base window in descending order (year_init first). Materialised
# as a list so it behaves the same as a row selector on Python 2 and Python 3.
years = list(range(year_init, year_init - year_range, -1))

def _leer_indicadores(nombre):
    """Read the comma-separated indicator codes stored in Indicadores/i<nombre>.txt.

    The file is opened in a `with` block so its handle is closed immediately
    instead of being left to the garbage collector. The content is split on
    ',' exactly as read (no stripping of whitespace or newlines).
    """
    with open("Indicadores/i%s.txt" % nombre) as archivo:
        return archivo.read().split(',')


# One list of indicator codes per thematic group.
agricultura     = _leer_indicadores("agricultura")
ambiente        = _leer_indicadores("ambiente")
ayuda           = _leer_indicadores("ayuda")
ciencia         = _leer_indicadores("ciencia")
clima           = _leer_indicadores("clima")
comercio        = _leer_indicadores("comercio")
deuda           = _leer_indicadores("deuda")
economia        = _leer_indicadores("economia")
educacion       = _leer_indicadores("educacion")
energia         = _leer_indicadores("energia")
finanzas        = _leer_indicadores("finanzas")
genero          = _leer_indicadores("genero")
infraestructura = _leer_indicadores("infraestructura")
pobreza         = _leer_indicadores("pobreza")
privado         = _leer_indicadores("privado")
publico         = _leer_indicadores("publico")
salud           = _leer_indicadores("salud")
social          = _leer_indicadores("social")
trabajo         = _leer_indicadores("trabajo")
urbano          = _leer_indicadores("urbano")


# Display names of the groups; position k here corresponds to position k in `conjunto`.
conjunto_nombre = ['Agricultura','Ambiente','Ayuda','Ciencia','Clima','Comercio','Deuda','Economia','Educacion',
                   'Energia','Finanzas','Genero','Infraestructura','Pobreza','Privado','Publico','Salud','Social',
                   'Trabajo','Urbano']

# The indicator-code lists themselves, in the same order as `conjunto_nombre`.
conjunto = [agricultura,ambiente,ayuda,ciencia,clima,comercio,deuda,economia,educacion,energia,finanzas,genero,
            infraestructura,pobreza,privado,publico,salud,social,trabajo,urbano]

# Preview of the loaded table (rendered as the cell output).
datos.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92


In [7]:
def r2(y_true, y_predict):
    """Return the R^2 score of `y_predict` against `y_true`.

    Thin wrapper that forwards both arguments, in this order, to
    `sklearn.metrics.r2_score` and returns its result unchanged.
    """
    from sklearn.metrics import r2_score
    return r2_score(y_true, y_predict)

In [8]:
def tabla_base(indicadores):
    """Build the (CountryCode, Year) x IndicatorCode table for the base window.

    Pivots the global `datos` on 'Value' (using pivot_table's default
    aggregation), keeps only the rows for the global `paises` and `years` and
    the columns listed in `indicadores`, and sorts the rows by country
    ascending and year descending.

    Parameters
    ----------
    indicadores : list-like of indicator codes used as the column selection.

    Returns
    -------
    pandas.DataFrame indexed by (CountryCode, Year).
    """
    pivote = datos.pivot_table(values='Value', index=['CountryCode', 'Year'], columns=['IndicatorCode'])
    tab = pivote.loc[(paises, list(years)), indicadores]
    # DataFrame.sortlevel was removed from pandas; sort_index(level=...) is the
    # equivalent call and is available on old and new versions alike.
    return tab.sort_index(level=["CountryCode", "Year"], ascending=[True, False])

In [9]:
def tabla_2_base(indicadores,look_back):
    """Build the predictor table made of `look_back` lagged year windows.

    For each lag i in 0..look_back-1 a window of `year_range` years ending at
    `year_init - i` is selected from the pivot of the global `datos`
    (rows: global `paises` x window years, columns: `indicadores`), sorted by
    country ascending and year descending. The windows are then stacked side
    by side with `np.column_stack`, so every window must select the same
    number of rows.

    Parameters
    ----------
    indicadores : list-like of indicator codes used as the column selection.
    look_back : int, number of lagged windows to stack.

    Returns
    -------
    pandas.DataFrame with default integer row and column labels (the
    country/year index and indicator names are lost by the stacking).
    """
    # The pivot does not depend on the lag, so it is built once instead of
    # once per loop iteration.
    pivote = datos.pivot_table(values='Value', index=['CountryCode', 'Year'], columns=['IndicatorCode'])

    bloques = []
    for desfase in range(look_back):
        ultimo = year_init - desfase
        # Window years in descending order: ultimo, ultimo-1, ..., ultimo-year_range+1.
        anios = list(range(ultimo, ultimo - year_range, -1))
        bloque = pivote.loc[(paises, anios), indicadores]
        # DataFrame.sortlevel was removed from pandas; sort_index(level=...) is
        # the equivalent call.
        bloques.append(bloque.sort_index(level=["CountryCode", "Year"], ascending=[True, False]))
    return pd.DataFrame(np.column_stack(bloques))

In [17]:
def estimator_Universal(estimador, X, y):
    """Cross-validate one regressor on (X, y) and return its mean R^2.

    Parameters
    ----------
    estimador : str
        'DTR' (DecisionTreeRegressor), 'RFR' (RandomForestRegressor with
        n_jobs=-1) or 'SVR' (SVR).
    X, y : feature matrix and target passed to `cross_val_score`.

    Returns
    -------
    Mean of the 'r2' scores over a `ShuffleSplit(test_size=0.2)`
    cross-validation.

    Raises
    ------
    ValueError
        If `estimador` is not one of the supported codes. Previously this
        case surfaced later as an UnboundLocalError.

    Notes
    -----
    When the global `search` is 'original' the estimator keeps its default
    hyper-parameters; otherwise the parameters returned by
    `SearchCV_Universal(estimador, search, X, y)` are applied first.
    """
    # Validate before importing scikit-learn so a bad code fails fast and clearly.
    if estimador not in ('DTR', 'RFR', 'SVR'):
        raise ValueError("estimador must be 'DTR', 'RFR' or 'SVR', got %r" % (estimador,))

    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.svm import SVR
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import ShuffleSplit

    # One factory per code; shared by the default and the tuned configuration.
    constructores = {
        'DTR': lambda: DecisionTreeRegressor(),
        'RFR': lambda: RandomForestRegressor(n_jobs=-1),
        'SVR': lambda: SVR(),
    }
    estimator = constructores[estimador]()

    if search != 'original':
        best_params = SearchCV_Universal(estimador, search, X, y)
        estimator.set_params(**best_params)

    estimador_score = cross_val_score(estimator, X, y, scoring='r2', cv=ShuffleSplit(test_size=0.2))

    return estimador_score.mean()

In [18]:
def Preprocess(tab1,tab2,y_indicator):
    """Turn the predictor and target tables into a standardised (X, y) pair.

    Parameters
    ----------
    tab1 : pandas.DataFrame of predictors.
    tab2 : pandas.DataFrame of candidate targets; must have as many rows as
        `tab1` because the two are stacked column-wise.
    y_indicator : int
        Position of the target column in `tab2` *after* its all-NaN columns
        have been dropped.

    Returns
    -------
    (X, y) : numpy arrays, the standardised predictors (2-D) and target (1-D).

    Notes
    -----
    The global `preprocessing` selects how NaNs in `tab1` are filled:
    'zeros' fills with 0, 'imput' fills with each column's mean, any other
    value leaves them untouched.
    """
    from sklearn.preprocessing import StandardScaler

    # Drop all-NaN columns: indicators with no record for any selected
    # country and year.
    tab1 = tab1.dropna(how='all',axis=1)
    tab2 = tab2.dropna(how='all',axis=1)

    if(preprocessing=='zeros'):
        # Replace NaN with zero.
        tab1 = tab1.fillna(0)

    if(preprocessing=='imput'):
        # Replace NaN with the mean of its own column. This replaces
        # sklearn.preprocessing.Imputer(strategy='mean', axis=0), which has been
        # removed from scikit-learn; the result is the same because all-NaN
        # columns were already dropped above.
        tab1 = tab1.fillna(tab1.mean())

    # Append the chosen target column of tab2 as the last column of tab1.
    tab_fusion = pd.DataFrame(np.column_stack((np.array(tab1), np.array(tab2)[:,y_indicator])))

    # Drop the rows whose target value is NaN.
    columna_objetivo = tab_fusion.columns[-1]
    tab_fusion = tab_fusion.dropna(subset=[columna_objetivo])

    # Build X by dropping every column whose correlation with the target is
    # above 0.7. The target itself (correlation 1) is removed by the same rule.
    # Only positive correlations are tested; strongly negative ones are kept.
    tab_fusion_corr = tab_fusion.corr()
    X = tab_fusion.drop(tab_fusion_corr[tab_fusion_corr.iloc[:,-1] > 0.7].index, axis=1)
    y = tab_fusion.iloc[:,-1]

    # Standardise predictors and target together (each column independently).
    sc = StandardScaler()
    tab_fusion_norm = sc.fit_transform(np.column_stack([X,y]))
    X = tab_fusion_norm[:,:-1]
    y = tab_fusion_norm[:,-1]

    return X, y

In [19]:
def iter_Splitter_Optimus(tab1,tab2):
    """Find, for every target column of `tab2`, the best-scoring regressor.

    For each column of `tab2` that is not entirely NaN, the data is prepared
    with `Preprocess` and the three estimators 'DTR', 'SVR' and 'RFR' are
    scored with `estimator_Universal`. The best result is recorded as
    [score, estimator, elapsed]:

    * best score > 0  -> recorded as is;
    * best score == 0 -> recorded as [0.0, "NONE", elapsed];
    * otherwise       -> score clamped to 0.0, estimator name kept.

    `elapsed` is a string with the seconds since the first estimator of that
    target started, so it is cumulative across the three estimators.

    Returns
    -------
    (df, DTR, N, RFR, SVR)
        `df` has one row per target with columns 0 (score), 1 (estimator) and
        2 (elapsed). The four floats are the fraction of targets recorded
        under 'DTR', 'NONE', 'RFR' and 'SVR' respectively.

    Notes
    -----
    Shares are looked up by label. The previous positional lookup on the
    groupby result mislabelled shares, or raised IndexError, whenever an
    estimator never won. With no target columns an empty `df` and four zeros
    are returned.
    """
    R2_global = list()
    estimadores = ['DTR','SVR','RFR']
    # Number of usable targets; computed once and reused for the shares.
    n_objetivos = tab2.dropna(how='all',axis=1).shape[1]

    for i in range(n_objetivos):

        X, y = Preprocess(tab1, tab2, i)
        result = list()

        start_time = time.time()

        for j in estimadores:
            result.append([ estimator_Universal(j, X, y) , j, "%s" % (time.time() - start_time) ])

        # Lists compare element-wise: highest score first, ties broken by name.
        mejor = max(result)
        if mejor[0] > 0:
            R2_global.append(mejor)
        elif mejor[0] == 0:
            R2_global.append([0.0, "NONE", mejor[2]])
        else:
            R2_global.append([0.0, mejor[1], mejor[2]])

    # Explicit column labels keep df[1] valid even when there are no rows.
    df = pd.DataFrame(R2_global, columns=[0, 1, 2])

    if n_objetivos:
        cuotas = df[1].value_counts() / float(n_objetivos)
    else:
        cuotas = {}

    DTR, N, RFR, SVR = [float(cuotas.get(clave, 0.0)) for clave in ('DTR', 'NONE', 'RFR', 'SVR')]

    return df, DTR, N, RFR, SVR

In [20]:
def SearchCV_Universal(estimador, search, X_train, y_train):
    """Tune one regressor by cross-validated search and return its best parameters.

    Parameters
    ----------
    estimador : str
        'DTR' (DecisionTreeRegressor), 'RFR' (RandomForestRegressor) or
        'SVR' (SVR).
    search : str
        'grid' for GridSearchCV or 'random' for RandomizedSearchCV.
    X_train, y_train : data passed to the search's `fit`.

    Returns
    -------
    dict : the search's `best_params_`.

    Raises
    ------
    ValueError
        If `estimador` or `search` is not a supported value. Previously these
        cases surfaced later as an UnboundLocalError.
    """
    # Validate before importing scikit-learn so bad arguments fail fast and clearly.
    if estimador not in ('DTR', 'RFR', 'SVR'):
        raise ValueError("estimador must be 'DTR', 'RFR' or 'SVR', got %r" % (estimador,))
    if search not in ('random', 'grid'):
        raise ValueError("search must be 'random' or 'grid', got %r" % (search,))

    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import ShuffleSplit
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR

    if estimador == 'DTR':
        estimator  = DecisionTreeRegressor()
        param_grid = {
                        'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'splitter': ['best', 'random']
                     }
    elif estimador == 'RFR':
        estimator  = RandomForestRegressor()
        param_grid = {
                        "n_estimators"      : [10,20,30,40],
                        # None means "all features", which is what "auto" meant
                        # for forest regressors; "auto" itself is rejected by
                        # recent scikit-learn releases.
                        "max_features"      : [None, "sqrt", "log2"],
                        "min_samples_split" : [2,4,8],
                        "bootstrap": [True, False],
                     }
    else:
        estimator  = SVR()
        param_grid = {
                        'gamma'  : ['auto', 1e-3, 1e-4],
                        'C'      : [1, 10, 100, 1000],
                     }

    cv = ShuffleSplit(test_size=0.2)
    if search == 'random':
        grid = RandomizedSearchCV(estimator, param_grid, n_jobs=-1, cv=cv)
    else:
        grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=cv)

    grid.fit(X_train, y_train)

    return grid.best_params_

In [21]:
def filtro(indicadores_1,indicadores_2):
    """Remove from `indicadores_2` every code that also appears in `indicadores_1`.

    Parameters
    ----------
    indicadores_1 : list of codes to exclude.
    indicadores_2 : list of candidate codes.

    Returns
    -------
    `indicadores_2` itself, unchanged, when both arguments compare equal
    (a set is allowed to predict itself); otherwise a 1-D numpy object array
    with the remaining codes in their original order.
    """
    if(indicadores_1!=indicadores_2):
        # dtype=object keeps plain Python strings regardless of the pandas
        # version's default string dtype. A Series (instead of the removed
        # DataFrame.ix accessor) also handles an empty candidate list.
        candidatos = pd.Series(list(indicadores_2), dtype=object)
        restantes = candidatos[~candidatos.isin(list(indicadores_1))]
        indicadores_2 = np.asarray(restantes, dtype=object)
    return indicadores_2

In [22]:
def iterador_global(indicadores_1,indicadores_2,look_back):
    """Score how well the `indicadores_1` group predicts the `indicadores_2` group.

    Predictors are the `look_back` lagged windows of `indicadores_1` built by
    `tabla_2_base`. Targets are the base-window table of `indicadores_2`
    after `filtro` has removed the codes shared with `indicadores_1`.
    Returns whatever `iter_Splitter_Optimus` returns for that pair of tables.
    """
    predictores = tabla_2_base(indicadores_1, look_back)
    codigos_objetivo = filtro(indicadores_1, indicadores_2)
    objetivos = tabla_base(codigos_objetivo)
    return iter_Splitter_Optimus(predictores, objetivos)

In [23]:
def porcent_result_reg(df,indicadores_1,indicadores_2):
    """Summarise the per-target results of one (base group, target group) run.

    Parameters
    ----------
    df : pandas.DataFrame whose first column holds the numeric scores
        (the frame returned first by `iter_Splitter_Optimus`).
    indicadores_1, indicadores_2 : the base and target code lists of the run.

    Returns
    -------
    (porcent, result, reg)
        porcent : mean of the first column of `df`.
        result  : the rows of `df` as a numpy array.
        reg     : number of target codes left after `filtro`.
    """
    # Take the mean of the score column only. `df.mean()` over the whole frame
    # raises on recent pandas because the other columns hold strings.
    porcent = df.iloc[:, 0].mean()
    result = np.array(df.values)
    reg = len(filtro(indicadores_1,indicadores_2))
    return porcent,result,reg

## Iteramos todos los Conjuntos de Indicadores

In [24]:
resultado_global = []
predicciones = []

# Column layout of the CSV written after every run.
columnas_resultado = ["Base","Target","Look Back","Reg","%","DTR","RFR","SVR","None","Time","Results"]

# Try every (base group, target group) pair with 1..look_back lagged windows.
for i in range(len(conjunto)):
    for j in range(len(conjunto)):
        for k in range(1,look_back+1):
            start_time = time.time()
            # iterador_global returns (df, DTR, N, RFR, SVR). Unpacking in that
            # exact order keeps each share under its own name; the previous
            # order (DTR, RFR, SVR, N) stored the NONE share in the "RFR"
            # column, the RFR share in "SVR" and the SVR share in "None".
            R2_global, DTR, N, RFR, SVR = iterador_global(conjunto[i],conjunto[j],k)
            porcent, results, reg = porcent_result_reg(R2_global, conjunto[i], conjunto[j])
            resultado_global.append([conjunto_nombre[i],conjunto_nombre[j],k,reg,porcent, DTR, RFR, SVR, N,"%s" % (time.time() - start_time),results])
            # The full CSV is rewritten after every run, presumably so partial
            # results survive an interrupted execution.
            pd.DataFrame(resultado_global, columns=columnas_resultado).to_csv('suramerica_absolute.csv')

KeyboardInterrupt: 