## World Development Indicators - Analytics

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

# Load the indicators table and discard the saved index column ('Unnamed: 0').
# `axis` is passed by keyword: pandas 2.0 no longer accepts it as a positional argument of drop().
tabla = np.array(pd.read_csv("economia7.csv").drop('Unnamed: 0', axis=1))
# Preview the first rows of the resulting array.
pd.DataFrame(tabla).head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Antigua and Barbuda,ATG,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,126.144
1,Antigua and Barbuda,ATG,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,88.237117
2,Antigua and Barbuda,ATG,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,7.779958
3,Antigua and Barbuda,ATG,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,80.457159
4,Antigua and Barbuda,ATG,"Birth rate, crude (per 1,000 people)",SP.DYN.CBRT.IN,1960,32.92


In [38]:
def r2(y_true, y_predict):
    """Return the R^2 score of ``y_predict`` against ``y_true``.

    Thin wrapper that delegates to scikit-learn's ``r2_score``.
    """
    from sklearn import metrics

    score = metrics.r2_score(y_true, y_predict)
    return score

In [39]:
def imputador(tab):
    """Replace every missing value with the mean of its column.

    This is a pandas/numpy implementation of column-wise mean imputation. It
    replaces the former ``sklearn.preprocessing.Imputer`` call, a class that
    was removed from scikit-learn in version 0.22.

    Parameters
    ----------
    tab : array-like, 2-D
        Numeric table that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        Float table without NaN. Columns that are NaN in every row are dropped
        (there is no mean to fill them with) and the remaining columns are
        relabelled 0..k-1. The input is not modified.
    """
    valores = pd.DataFrame(np.asarray(tab, dtype=float))
    # A column with no observed value has no mean; drop it.
    valores = valores.dropna(how='all', axis=1)
    # Fill each remaining NaN with the mean of the observed values of its column.
    valores = valores.fillna(valores.mean())
    # Positional labels, as a DataFrame built from a plain array would have.
    valores.columns = range(valores.shape[1])
    return valores

In [40]:
def zeros(tab):
    """Return ``tab`` as a DataFrame with its missing values replaced by 0.

    Columns that are NaN in every row are removed first; the surviving columns
    keep their original labels.
    """
    # Drop the all-NaN columns, then set every remaining NaN to 0.
    return pd.DataFrame(tab).dropna(axis=1, how='all').fillna(0)

In [42]:
def Splitter(tab, y_indicator, test_size=0.2, random_state=None):
    """Split a table into train/test features and target.

    Parameters
    ----------
    tab : numpy.ndarray, 2-D
        Full data table (rows are samples, columns are indicators).
    y_indicator : int
        Index of the column used as the target ``y``.
    test_size : float, optional
        Fraction of rows held out for testing (default 0.2, the value that was
        previously hard-coded).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behaviour (a different split on every call); pass an integer
        for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    from sklearn.model_selection import train_test_split

    # X is the table as filtered by correlacion() for this target column.
    X = correlacion(tab, y_indicator)
    y = tab[:, y_indicator]
    # Split X and y consistently into train and test parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

In [43]:
def correlacion(tab, y_indicator, threshold=0.7):
    """Remove the target column and every column strongly correlated with it.

    Parameters
    ----------
    tab : numpy.ndarray, 2-D
        Numeric data table.
    y_indicator : int
        Index of the target column (negative indices count from the end).
    threshold : float, optional
        Columns whose absolute Pearson correlation with the target is strictly
        greater than this value are removed (default 0.7).

    Returns
    -------
    numpy.ndarray
        Copy of ``tab`` without the removed columns.

    Notes
    -----
    The target column is always removed. Relying only on its self-correlation
    of 1 is not enough: when the correlation is NaN (e.g. a constant target
    column) the comparison is False and the target would stay among the
    features.
    """
    n_cols = np.shape(tab)[1]
    objetivo = y_indicator + n_cols if y_indicator < 0 else y_indicator

    # Correlation of every column with the target column.
    corr_objetivo = np.array(pd.DataFrame(tab).corr())[:, objetivo]

    # NaN correlations compare as False, so those columns are kept...
    eliminar = np.abs(corr_objetivo) > threshold
    # ...except the target itself, which must never remain in the features.
    eliminar[objetivo] = True

    # Delete all selected columns in a single call.
    return np.delete(tab, np.where(eliminar)[0], axis=1)

In [44]:
def GridSearchCV_Universal(estimador, X_train, y_train):
    """Grid-search the hyper-parameters of the requested regressor.

    Parameters
    ----------
    estimador : str
        ``'DTR'`` for ``DecisionTreeRegressor`` or ``'RFR'`` for
        ``RandomForestRegressor``.
    X_train, y_train
        Training features and target, passed to ``GridSearchCV.fit``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If ``estimador`` is not one of the supported codes. Previously this
        case failed later with an unbound-variable error.
    """
    # Validate first so an unsupported code fails with a clear message.
    if estimador not in ('DTR', 'RFR'):
        raise ValueError(
            "Unsupported estimator %r: expected 'DTR' or 'RFR'" % (estimador,))

    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import ShuffleSplit

    if estimador == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        estimator = DecisionTreeRegressor()
        param_grid = {
            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            'splitter': ['best', 'random'],
        }
    else:
        from sklearn.ensemble import RandomForestRegressor
        estimator = RandomForestRegressor()
        param_grid = {
            "n_estimators": [10, 20, 30, 40],
            # None means "all features", which is what "auto" meant for this
            # regressor; "auto" itself was removed in scikit-learn 1.3.
            "max_features": [None, "sqrt", "log2"],
            "min_samples_split": [2, 4, 8],
            "bootstrap": [True, False],
        }

    # Cross-validate on random shuffles that each hold out 20% of the rows.
    grid = GridSearchCV(estimator, param_grid, n_jobs=-1,
                        cv=ShuffleSplit(test_size=0.2))
    grid.fit(X_train, y_train)

    return grid.best_params_

In [45]:
def estimator_Universal(estimador, X_train, X_test, y_train, y_test, best_params):
    """Fit the requested regressor with ``best_params`` and score it on the test set.

    Parameters
    ----------
    estimador : str
        ``'DTR'`` for ``DecisionTreeRegressor`` or ``'RFR'`` for
        ``RandomForestRegressor`` (the latter is built with ``n_jobs=-1``).
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Data the model is evaluated on.
    best_params : dict
        Parameters applied to the model through ``set_params``.

    Returns
    -------
    The value returned by ``r2(y_test, y_predict)``.

    Raises
    ------
    ValueError
        If ``estimador`` is not one of the supported codes. Previously this
        case failed later with an unbound-variable error.
    """
    # Validate first so an unsupported code fails with a clear message.
    if estimador not in ('DTR', 'RFR'):
        raise ValueError(
            "Unsupported estimator %r: expected 'DTR' or 'RFR'" % (estimador,))

    if estimador == 'DTR':
        from sklearn.tree import DecisionTreeRegressor
        estimator = DecisionTreeRegressor()
    else:
        from sklearn.ensemble import RandomForestRegressor
        estimator = RandomForestRegressor(n_jobs=-1)
    estimator.set_params(**best_params)

    estimator.fit(X_train, y_train)
    y_predict = estimator.predict(X_test)
    return r2(y_test, y_predict)

In [46]:
def iter_Splitter(estimador, tab):
    """Compare zero-filling against mean imputation, one target column at a time.

    For every column index ``i`` the table is prepared twice, with ``zeros``
    and with ``imputador``. Each version is split with ``Splitter`` using
    column ``i`` as target, tuned with ``GridSearchCV_Universal`` and scored
    with ``estimator_Universal``. One progress line is printed per fit and
    the total elapsed time at the end.

    Parameters
    ----------
    estimador : str
        Estimator code forwarded to ``GridSearchCV_Universal`` and
        ``estimator_Universal``.
    tab : array-like, 2-D
        Data table, possibly containing NaN.

    Returns
    -------
    pandas.DataFrame
        One row per target column, with the scores in columns ``'Original'``
        (zero-filled data) and ``'Imputado'`` (mean-imputed data).
    """
    R2_global = []
    start_time = time.time()

    # Both prepared tables depend only on `tab`, so build them once instead of
    # on every iteration. (The column count used to be read from an undefined
    # global `tab2`; it now comes from the `tab` argument.)
    variantes = [
        ("Zero", np.array(zeros(tab))),
        ("Imputer", np.array(imputador(tab))),
    ]
    n_indicadores = np.shape(variantes[1][1])[1]

    for i in range(n_indicadores):
        fila = []
        for etiqueta, datos in variantes:
            inicio = time.time()
            X_train, X_test, y_train, y_test = Splitter(datos, i)
            best_params = GridSearchCV_Universal(estimador, X_train, y_train)
            # A single pre-formatted string prints identically on Python 2 and 3.
            print("%s %s %s - %s s" % (i, etiqueta, best_params, time.time() - inicio))
            fila.append(estimator_Universal(estimador, X_train, X_test,
                                            y_train, y_test, best_params))

        # Join results: [zero-filled score, imputed score].
        R2_global.append(fila)

    print("--- %s seconds ---" % (time.time() - start_time))
    return pd.DataFrame(R2_global, columns=['Original', 'Imputado'])

In [47]:
# Compare zero-filled vs. mean-imputed data for every target column using the 'DTR' (decision tree) estimator.
iter_Splitter('DTR',tabla)

In [None]:
# Same comparison using the 'RFR' (random forest) estimator.
iter_Splitter('RFR',tabla)

In [26]:
# NOTE(review): 'SVR' is not handled by GridSearchCV_Universal or estimator_Universal as defined
# in this notebook (only 'DTR' and 'RFR' are), so this call is expected to fail unless SVR support
# is added. The output shown below lists decision-tree parameters ('splitter', 'max_depth'),
# presumably left over from an earlier run -- verify before relying on it.
iter_Splitter('SVR',tabla)

Best Params {'splitter': 'random', 'max_depth': 7} - 0.810426950455 s
Best Params {'splitter': 'best', 'max_depth': 9} - 0.626332998276 s
Best Params {'splitter': 'random', 'max_depth': 5} - 0.715851783752 s
Best Params {'splitter': 'random', 'max_depth': 9} - 0.600108861923 s
Best Params {'splitter': 'best', 'max_depth': 9} - 0.709694862366 s
Best Params {'splitter': 'random', 'max_depth': 8} - 0.729998111725 s
Best Params {'splitter': 'best', 'max_depth': 10} - 0.722052097321 s
Best Params {'splitter': 'best', 'max_depth': 7} - 0.73463511467 s
Best Params {'splitter': 'best', 'max_depth': 9} - 0.750251054764 s
Best Params {'splitter': 'best', 'max_depth': 8} - 0.756103992462 s
Best Params {'splitter': 'best', 'max_depth': 2} - 0.718957901001 s
Best Params {'splitter': 'best', 'max_depth': 1} - 0.706968069077 s
Best Params {'splitter': 'best', 'max_depth': 5} - 0.71296787262 s
Best Params {'splitter': 'best', 'max_depth': 4} - 0.707448005676 s
Best Params {'splitter': 'best', 'max_dep

Unnamed: 0,Original,Imputado
0,0.999718,0.998368
1,0.966501,0.990089
2,0.071373,-0.270602
3,0.997568,0.997385
4,0.859054,0.970196
5,0.276584,0.065544
6,0.60997,0.767523
7,-0.077037,0.462637
8,0.953881,0.910438
9,0.963481,0.988926
