## World Development Indicators - Analytics

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

datos = pd.read_csv("economia7.csv").drop('Unnamed: 0', 1)
datos.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,3358406000.0,3637845000.0,1347350000.0,1239080000.0,108.320559,0.071111,14517.635341,39348.639719,42622.666667,15786.17284,-1.046262,25735.714754,24723.111892,2196543000.0,2110118000.0
1,2954188000.0,3257308000.0,1206410000.0,1089944000.0,110.260684,-12.036015,12629.713786,34231.611819,37744.00927,13979.262693,-13.004332,22388.956865,21671.375749,1932167000.0,1870240000.0
2,2743171000.0,3065955000.0,1135539000.0,1012090000.0,111.766847,-7.142991,11602.142227,31446.47897,35146.738046,13017.310387,-8.136143,20567.359344,20151.309866,1794152000.0,1757859000.0
3,2693974000.0,3050780000.0,1129918000.0,993938500.0,113.244595,-1.793436,11275.280156,30560.551094,34608.172248,12817.841573,-2.817256,19987.924139,19987.924139,1761975000.0,1761975000.0
4,2802259000.0,3252725000.0,1204713000.0,1033890000.0,116.075112,4.019542,11607.745344,31461.665675,36519.163794,13525.61622,2.94862,20577.292107,20946.700835,1832799000.0,1865702000.0


In [2]:
def r2(y_true, y_predict):
    from sklearn.metrics import r2_score
    return r2_score(y_true, y_predict)

In [3]:
def tabla_base(paises,years,indicadores,datos):
    tab = pd.DataFrame.pivot_table(datos, values='Value', index=['CountryName', 'Year'], columns=['IndicatorCode']).loc[(paises,years),indicadores].sortlevel(["CountryName","Year"], ascending=[True,False])
    return tab

In [4]:
def zeros(tab):
    df = pd.DataFrame(tab)
    #Eliminamos las columnas de NaN descartando Indicadores que no tienen regristros para ningún pais y año deseados
    df = df.dropna(how='all',axis=1)
    #Asignamos a los NaN el valor de 0    
    df = df.fillna(0)
    # Elimina Indicadores con todos sus registros en Zero
    df = df.loc[:, (df != 0).any(axis=0)]
    return np.array(df)

In [5]:
def imputar(tab):
    from sklearn.preprocessing import Imputer
    impute=Imputer(missing_values="NaN",strategy='mean',axis=0)
    impute.fit(tab)
    tab_imputada=impute.transform(tab)
    return np.array(tab_imputada)

In [6]:
def normalizar(tab):
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    return sc.fit_transform(tab)

In [7]:
def correlacion(tab,y_indicator):
    # Buscamos los indicadores que se correlacionen más (coeficiente > 0.7)
    eliminar = []
    tabla_correlacion = np.array(pd.DataFrame(tab).corr())
    for i in range(np.shape(tab)[1]):
        if abs(tabla_correlacion[i,y_indicator]) > 0.7:
            eliminar.append(i)

    # Eliminamos los indicadores que se correlacionan más (Coeficiente > 0.7)
    temporal = tab[:,:]

    for i in range(len(eliminar)):
        temporal = np.delete(temporal[:,:], eliminar[i]-i, 1)

    return temporal

In [8]:
def Splitter(tab1,tab2,y_indicator):
    from sklearn.model_selection import train_test_split
    #Fusionamos la tabla_1 y el indicador y de la tabla_2
    tab_fusion = np.column_stack((tab1[:,:],tab2[:,y_indicator]))
    
    # Asignamos X e y, eliminando los indicadores que se correlacionen más (coeficiente > 0.7)  
    X = correlacion(tab_fusion,-1)
    y = tab_fusion[:,-1]
    
    # Separamos Train y Test respectivamente para X e y
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test

In [9]:
def estimator_Universal(estimador, preprocess, search, X_train, X_test, y_train, y_test):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.svm import SVR
    
    start_time = time.time()
    best_params = SearchCV_Universal(estimador, search, X_train, y_train)       
    
    if(estimador=='DTR'):
        estimator = DecisionTreeRegressor().set_params(**best_params)
    if(estimador=='RFR'):
        estimator = RandomForestRegressor(n_jobs=-1).set_params(**best_params)
    if(estimador=='SVR'):
        estimator = SVR().set_params(**best_params)
        
    estimator.fit(X_train,y_train)
    y_predict = estimator.predict(X_test)
    
    log_detail.append([estimador, preprocess, search, "%s" % (time.time() - start_time), best_params])
        
    return r2(y_test,y_predict)

In [10]:
def SearchCV_Universal(estimador, search, X_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.model_selection import RandomizedSearchCV
    from sklearn.model_selection import ShuffleSplit
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.svm import SVR
        
    if(estimador=='DTR'):
        estimator  = DecisionTreeRegressor()
        param_grid = {  'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
                        'splitter': ['best', 'random']
                     }
        
    if(estimador=='RFR'):
        estimator  = RandomForestRegressor()       
        param_grid = { 
                        "n_estimators"      : [10,20,30,40],
                        "max_features"      : ["auto", "sqrt", "log2"],
                        "min_samples_split" : [2,4,8],
                        "bootstrap": [True, False],
                     }
    if(estimador=='SVR'):
        estimator  = SVR()
        param_grid ={
                        'gamma'  : ['auto', 1e-3, 1e-4],
                        'C'      : [1, 10, 100, 1000],            
                    }

    if (search=='random'):
        grid = RandomizedSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
    if (search=='grid'):
        grid = GridSearchCV(estimator, param_grid, n_jobs=-1, cv=ShuffleSplit(test_size=0.2))
    grid.fit(X_train, y_train)
    
    return grid.best_params_

In [11]:
def iter_Splitter_Optimus(tab, preprocess, search):    
    R2_global = list()
    for i in range(0,np.shape(zeros(tab2))[1]):
        
        if (preprocess=='zeros'):
            X_train, X_test, y_train, y_test = Splitter(normalizar(zeros(tab)), i)
        if (preprocess=='imput'):
            X_train, X_test, y_train, y_test = Splitter(normalizar(imputar(tab)), i)

        result = estimator_Universal('DTR', preprocess, search, X_train, X_test, y_train, y_test)

        if(result < 0.9): 
            temp = estimator_Universal('SVR', preprocess, search, X_train, X_test, y_train, y_test)
            if(temp < 0.9): 
                temp2 = estimator_Universal('RFR', preprocess, search, X_train, X_test, y_train, y_test)
                if (temp2 > temp): 
                    result = temp2
            if(temp > result): 
                result = temp

        R2_global.append(result)
        pd.DataFrame(log_detail, columns=["Estimator","Preprocess","Search","Time","Best Params"]).to_csv('log_base_detail.csv')    

    return R2_global

In [12]:
def porcent_result(df):
    df_temp = np.array(df)
    x = 0.0
    for i in range(len(df_temp)):
        if(df_temp[i] > 0.9):
            x=x+1
    porcentaje = x/len(df)
    return porcentaje , df

## Rendimiento Normalizar Zeros vs Normalizar Imputado

In [13]:
log_detail = []

iter_Zeros = iter_Splitter_Optimus(datos,'zeros','random')
iter_Imput = iter_Splitter_Optimus(datos,'imput','random')

porcent_zeros , result_zeros = porcent_result(iter_Zeros)
porcent_imput , result_imput = porcent_result(iter_Imput)

print "Zeros" , porcent_zeros
print "Imput" , porcent_imput

pd.DataFrame(np.column_stack([iter_Zeros,iter_Imput]),columns=['Zeros','Imputado'])

TypeError: iter_Splitter_Optimus() takes exactly 4 arguments (3 given)

## Rendimiento RandomizedSearch vs GridSearch

In [None]:
log_detail = []

iter_random = iter_Splitter_Optimus(datos,'zeros','random')
iter_grid   = iter_Splitter_Optimus(datos,'zeros','grid')

porcent_random , result_random = porcent_result(iter_random)
porcent_grid , result_grid = porcent_result(iter_grid)

print "Random" , porcent_zeros
print "Grid" , porcent_imput

pd.DataFrame(np.column_stack([iter_random,iter_grid]),columns=['Random','Grid'])

## Cargamos resultados

In [17]:
current = pd.read_csv("log.csv").drop('Unnamed: 0', 1)
current

Unnamed: 0,Base,Target,Look Back,%,Time,Indicators >90%
0,Agricultura,Agricultura,1,0.590909,606.989,Indica...
1,Agricultura,Agricultura,2,0.613636,556.706,Indica...


In [18]:
optimus = pd.read_csv("log_base.csv").drop('Unnamed: 0', 1)
optimus

Unnamed: 0,Base,Target,Look Back,%,Time,Results
0,Agricultura,Agricultura,1,0.545455,564.243,"[0.15643444150326735, 0.049425973594666162, 0...."
1,Agricultura,Agricultura,2,0.636364,574.454,"[0.4474346786034068, 0.4243037686278408, 0.423..."
2,Agricultura,Agricultura,3,0.454545,707.747,"[0.73481763173455406, 0.66178940131207709, 0.9..."
3,Agricultura,Ayuda,1,0.181818,1225.504,"[1.0, 0.60168480611699982, 0.08192142663539248..."
4,Agricultura,Ayuda,2,0.242424,1232.591,"[1.0, 0.55010499953005199, 0.88628137043969135..."
5,Agricultura,Ayuda,3,0.166667,1466.036,"[1.0, 0.57771240836484328, 0.91914047340930738..."
6,Agricultura,Clima,1,0.523077,1076.463,"[0.34659092882129983, 0.89717141284706869, 0.9..."
7,Agricultura,Clima,2,0.492308,1031.602,"[0.70820173539776432, 0.81491424608952356, 0.9..."
8,Agricultura,Clima,3,0.523077,1144.477,"[0.6957243549314851, 0.71638672030648465, 0.99..."
9,Agricultura,Economia,1,0.559524,3425.863,"[0.47939850678703388, 0.96690062633883045, 0.9..."


## Rendimiento Current WorkFlow Vs Optimus WorkFlow

In [19]:
pd.DataFrame.pivot_table(current, index=["Base","Target"], columns=["Look Back"] )

Unnamed: 0_level_0,Unnamed: 1_level_0,%,%,Time,Time
Unnamed: 0_level_1,Look Back,1,2,1,2
Base,Target,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Agricultura,Agricultura,0.590909,0.613636,606.989,556.706


In [20]:
pd.DataFrame.pivot_table(optimus, index=["Base","Target"], columns=["Look Back"] )

Unnamed: 0_level_0,Unnamed: 1_level_0,%,%,%,Time,Time,Time
Unnamed: 0_level_1,Look Back,1,2,3,1,2,3
Base,Target,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Agricultura,Agricultura,0.545455,0.636364,0.454545,564.243,574.454,707.747
Agricultura,Ambiente,0.471264,0.413793,0.413793,1182.392,1585.165,1617.623
Agricultura,Ayuda,0.181818,0.242424,0.166667,1225.504,1232.591,1466.036
Agricultura,Clima,0.523077,0.492308,0.523077,1076.463,1031.602,1144.477
Agricultura,Deuda,0.28125,0.245536,0.209821,3974.728,6299.709,5420.656
Agricultura,Economia,0.559524,0.492063,0.52381,3425.863,3626.461,4268.061
Agricultura,Educacion,0.12931,0.12069,0.137931,2603.257,2809.773,2331.567
Agricultura,Energia,0.480769,0.519231,0.5,738.504,816.706,777.198
Agricultura,Finanzas,0.433333,0.416667,0.433333,1185.953,1801.854,1177.102
Agricultura,Genero,0.205674,,,3557.816,,
