## World Development Indicators - Analytics

In [114]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

datos = pd.read_csv("suramerica.csv").drop('Unnamed: 0', 1)
paises = datos['CountryCode'].drop_duplicates().values.tolist()

log_detail = []
preprocessing = 'imput'
search = 'original'
year_init = 2012
year_range = 5
look_back = 3
years = range(year_init-year_range+1,year_init+1)[::-1]

agricultura = pd.read_csv('Indicadores/iagricultura.txt', sep=",", header =None).as_matrix()[0]
economia    = pd.read_csv('Indicadores/ieconomia.txt',    sep=",", header =None).as_matrix()[0]
educacion   = pd.read_csv('Indicadores/ieducacion.txt',   sep=",", header =None).as_matrix()[0]
salud       = pd.read_csv('Indicadores/isalud.txt',       sep=",", header =None).as_matrix()[0]
deuda       = pd.read_csv('Indicadores/ideuda.txt',       sep=",", header =None).as_matrix()[0]

conjunto_nombre = ['Agricultura','Economia','Educacion','Salud','Deuda']
conjunto = [agricultura,economia,educacion,salud,deuda]

In [56]:
def r2(y_true, y_predict):
    from sklearn.metrics import r2_score
    return r2_score(y_true, y_predict)

In [57]:
def tabla_base(indicadores):
    tab = pd.DataFrame.pivot_table(datos, values='Value', index=['CountryName', 'Year'], columns=['IndicatorCode']).loc[(paises,years),indicadores].sortlevel(["CountryName","Year"], ascending=[True,False])
    return tab

In [58]:
def tabla_2_base(indicadores,look_back):
    temp_table = []
    for i in range(look_back):      
        temp_years = range(year_init-year_range-i+1,year_init-i+1)[::-1]
        temp_table.append(pd.DataFrame.pivot_table(datos, values='Value', index=['CountryName', 'Year'], columns=['IndicatorCode']).loc[(paises,temp_years),indicadores].sortlevel(["CountryName","Year"], ascending=[True,False]))
    return pd.DataFrame(np.column_stack(temp_table))

In [59]:
def estimator_Universal(estimador, preprocess, search, X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.svm import SVR
    
    start_time = time.time()
    
    if(search=='original'):
        if(estimador=='DTR'):
            estimator = DecisionTreeRegressor()
        if(estimador=='RFR'):
            estimator = RandomForestRegressor(n_jobs=-1)
        if(estimador=='SVR'):
            estimator = SVR()
    else:
        best_params = SearchCV_Universal(estimador, search, X_train, y_train)       

        if(estimador=='DTR'):
            estimator = DecisionTreeRegressor().set_params(**best_params)
        if(estimador=='RFR'):
            estimator = RandomForestRegressor(n_jobs=-1).set_params(**best_params)
        if(estimador=='SVR'):
            estimator = SVR().set_params(**best_params)
        
        
    estimator.fit(X_train,y_train)
    y_predict = estimator.predict(X_test)
    
    log_detail.append([estimador, search, "%s" % (time.time() - start_time), preprocess])
        
    return r2(y_test,y_predict)

In [60]:
def Preprocess(tab1,tab2,y_indicator):
    from sklearn.preprocessing import Imputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import train_test_split
       
    #Fusionamos la tabla_1 y el indicador y de la tabla_2
    tab_fusion = np.column_stack((np.array(tab1)[:,:],np.array(tab2)[:,y_indicator]))    
    
    #Eliminamos las columnas de NaN descartando Indicadores que no tienen regristros para ningún pais y año deseados
    df = pd.DataFrame(tab_fusion)
    df = df.dropna(how='all',axis=1)
    
    #Eliminamos las filas Si el valor a predecir es NaN
    df = df.dropna(subset=[df.iloc[:,-1].name])
    
    if(preprocessing=='zeros'):
        #Imputamos los NaN por Zero
        df = df.fillna(0)      
        
    if(preprocessing=='imput'):
        #Imputamos los NaN por la media de cada Indicador respectivamente      
        impute=Imputer(missing_values="NaN",strategy='mean',axis=0)
        impute.fit(df)
        df = pd.DataFrame(impute.transform(df))
        
    # Asignamos X e y, eliminando los indicadores que se correlacionen más (coeficiente > 0.7)
    df_ = df.corr()
    X = df.drop(df_[df_.iloc[:,-1] > 0.7].index, axis=1)
    y = df.iloc[:,-1]

    # Normalizamos los datos
    sc = StandardScaler()
    df_norm = sc.fit_transform(np.column_stack([X,y]))
    X = df_norm[:,:-1]
    y = df_norm[:,-1]
    
    # Separamos Train y Test respectivamente para X e y
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    return X_train, X_test, y_train, y_test

In [61]:
def iter_Splitter_Optimus(tab1,tab2,preprocessing,search): 
    R2_global = list()
    
    for i in range(0,np.shape(tab2.dropna(how='all',axis=1))[1]):

        X_train, X_test, y_train, y_test = Preprocess(tab1, tab2, i)

        result = estimator_Universal('DTR', X_train, X_test, y_train, y_test)

        if(result < 0.9): 
            temp = estimator_Universal('SVR', X_train, X_test, y_train, y_test)
            if(temp < 0.9): 
                temp2 = estimator_Universal('RFR', X_train, X_test, y_train, y_test)
                if (temp2 > temp): 
                    result = temp2
            if(temp > result): 
                result = temp

        R2_global.append(result)
        pd.DataFrame(log_detail, columns=["Estimator","Search","Time","Preprocessing"]).to_csv('log_base_detail.csv')    

    return R2_global

In [62]:
def SearchCV_Universal(estimador, search, X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import ShuffleSplit
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
        
    if(estimador=='DTR'):
        estimator  = DecisionTreeRegressor()
        param_grid = {  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                        'splitter': ['best', 'random']
                     }
        
    if(estimador=='RFR'):
        estimator  = RandomForestRegressor()       
        param_grid = { 
                        "n_estimators"      : [10,20,30,40],
                        "max_features"      : ["auto", "sqrt", "log2"],
                        "min_samples_split" : [2,4,8],
                        "bootstrap": [True, False],
                     }
    if(estimador=='SVR'):
        estimator  = SVR()
        param_grid ={
                        'gamma'  : ['auto', 1e-3, 1e-4],
                        'C'      : [1, 10, 100, 1000],            
                    }

    if (search=='random'):
        grid = RandomizedSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
    if (search=='grid'):
        grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
        
    grid.fit(X_train, y_train)
    
    return grid.best_params_

In [63]:
def iterador_global(datos,paises,years,indicadores_1,indicadores_2,look_back): 
    
    tab1 = tabla_2_base(datos,indicadores_1,look_back)
    tab2 = tabla_base(paises,years,indicadores_2,datos)
    
    return iter_Splitter_Optimus(tab1,tab2,preprocessing,search)

In [64]:
def porcent_result(df):
    df_temp = df
    x = 0.0
    for i in range(len(df_temp)):
        if(df_temp[i] > 0.9):
            x=x+1
    porcentaje = x/len(df)
    df = pd.DataFrame(df)
    df[df < 0] = 0.0
    return porcentaje , np.array(df.values)

In [65]:
def buscar(df,indicador_1,indicador_2,look_back):
    #Buscamos Resultados en base al conjunto de indicadores Base, Target y Look Back
    temp = df[(df['Base']==indicador_1)&(df['Target']==indicador_2)&(df['Look Back'] == look_back)]['Results'].values[0]
    
    # Códigos de los indicadores con registros    
    for i in range(len(conjunto_nombre)):
        if indicador_2 == conjunto_nombre[i]:
            df_zeros = tabla_base(paises,years,conjunto[i],datos).dropna(how='all',axis=1)
    icodes = df_zeros.columns.values

    # Buscamos Nombres de los Códigos de los Indicadores
    nombres = pd.DataFrame(datos).loc[:,['IndicatorName','IndicatorCode']].set_index('IndicatorCode').loc[icodes].drop_duplicates()

    # Concatenamos Resultados y Nombres, Reindexando en base a los nombres
    df_results = pd.DataFrame(np.column_stack([nombres,temp]),columns=['Indicadores','Resultados'])
    
    return df_results

In [13]:
start_time = time.time()
resultado_base = []
log_detail = []

for i in range(len(conjunto)):
    for j in range(1,look_back+1):
        start_time2 = time.time()
        porcentaje , resultados = porcent_result(iterador_global(datos,paises,years,conjunto[0],conjunto[i],j))
        resultado_base.append([conjunto_nombre[0],conjunto_nombre[i],j,porcentaje,"%s" % (time.time() - start_time2),resultados])
        pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","%","Time","Results"]).to_csv('log_base.csv')

df = pd.DataFrame(resultado_base, columns=["Base","Target","Look Back","%","Time","Results"])
print("--- %s seconds ---" % (time.time() - start_time))
df

--- 645.736938953 seconds ---


Unnamed: 0,Base,Target,Look Back,%,Time,Results
0,Agricultura,Agricultura,1,0.431818,12.5098381042,"[[0.763794714357], [0.734246476612], [0.556046..."
1,Agricultura,Agricultura,2,0.454545,12.8189790249,"[[0.174212547814], [0.739560897677], [0.203574..."
2,Agricultura,Agricultura,3,0.340909,16.1851139069,"[[0.364488438568], [0.74121317959], [0.5926993..."
3,Agricultura,Economia,1,0.404762,56.6033189297,"[[0.869336913111], [0.762601712244], [0.0], [0..."
4,Agricultura,Economia,2,0.361111,64.9724378586,"[[0.647483163951], [0.956012458524], [0.221244..."
5,Agricultura,Economia,3,0.31746,69.242795229,"[[0.482120851977], [0.988199927935], [0.235794..."
6,Agricultura,Educacion,1,0.258621,31.1422669888,"[[0.672901681411], [0.696753220885], [0.290736..."
7,Agricultura,Educacion,2,0.267241,33.4380178452,"[[0.59212558116], [0.679266032611], [0.5567955..."
8,Agricultura,Educacion,3,0.112069,40.8244318962,"[[0.471146268879], [0.762719234186], [0.460579..."
9,Agricultura,Salud,1,0.370079,32.4308478832,"[[0.0], [0.910237355613], [0.979608704089], [0..."


In [22]:
df_ = buscar(df,'Agricultura','Salud',1) 
df_[df_.Resultados > 0.9]

Unnamed: 0,Indicadores,Resultados
1,"Adolescent fertility rate (births per 1,000 wo...",0.910237
2,Age dependency ratio (% of working-age populat...,0.979609
3,"Age dependency ratio, old (% of working-age po...",0.951785
4,"Age dependency ratio, young (% of working-age ...",0.939775
5,Antiretroviral therapy coverage (% of people l...,0.978818
6,"Birth rate, crude (per 1,000 people)",0.915041
8,"Cause of death, by communicable diseases and m...",0.914731
9,"Cause of death, by injury (% of total)",0.96626
14,Completeness of birth registration (%),0.977675
15,"Completeness of birth registration, rural (%)",0.970931


In [28]:
tabla_base(paises,years,salud,datos).dropna(how='all',axis=1).ix[:,17]

CountryName               Year
Antigua and Barbuda       2012     79.0
                          2011      NaN
                          2010      NaN
                          2009      NaN
                          2008     80.2
Argentina                 2012      NaN
                          2011    100.0
                          2010      NaN
                          2009     99.2
                          2008      NaN
Aruba                     2012      NaN
                          2011      NaN
                          2010      NaN
                          2009      NaN
                          2008      NaN
Bahamas, The              2012      NaN
                          2011      NaN
                          2010      NaN
                          2009     93.0
                          2008      NaN
Barbados                  2012      NaN
                          2011      NaN
                          2010    100.0
                          2009      NaN
         

In [119]:
datos['CountryCode'].drop_duplicates().values.tolist()

['ATG',
 'ARG',
 'ABW',
 'BHS',
 'BRB',
 'BLZ',
 'BOL',
 'BRA',
 'CYM',
 'CHL',
 'COL',
 'CRI',
 'CUB',
 'CUW',
 'DMA',
 'DOM',
 'ECU',
 'SLV',
 'GRD',
 'GTM',
 'GUY',
 'HTI',
 'HND',
 'JAM',
 'MEX',
 'NIC',
 'PAN',
 'PRY',
 'PER',
 'PRI',
 'SXM',
 'KNA',
 'LCA',
 'MAF',
 'VCT',
 'SUR',
 'TTO',
 'TCA',
 'URY',
 'VEN',
 'VIR']