# **Modelamiento**

## Librerías

In [64]:
!pip install scikit-learn==0.24.2



In [65]:
!pip install unidecode



In [66]:
#Generales
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Proceso de Modelación
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,r2_score,median_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, ShuffleSplit

## Lectura Información

In [67]:
#Descargar datasets desde github
!git clone https://github.com/andres-soto-h/monografia-udea-eacd.git

fatal: destination path 'monografia-udea-eacd' already exists and is not an empty directory.


In [68]:
#Descargar stopwords español
!wget https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt

--2021-10-11 12:24:44--  https://raw.githubusercontent.com/Alir3z4/stop-words/master/spanish.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4244 (4.1K) [text/plain]
Saving to: ‘spanish.txt.3’


2021-10-11 12:24:44 (46.5 MB/s) - ‘spanish.txt.3’ saved [4244/4244]



In [69]:
#Lectura del dataset transformado
df_propiedades=pd.read_csv('/content/monografia-udea-eacd/df_prop_clean_12082021.csv', delimiter=';', encoding='latin1')

In [70]:
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd '/content/drive/My Drive/Colab Notebooks/Seminario/DATASETS'
# # !ls
# df_propiedades=pd.read_csv('df_prop_clean_12082021.csv', delimiter=';', encoding='latin1') 
# print(df_propiedades.shape)

In [71]:
df_propiedades.rename(columns={"municipio":"ubicacion"}, inplace=True)

**NOTA:** se decide eliminar la información del municipio de **La Unión**.

In [72]:
mask_union = df_propiedades['ubicacion']!='la uniÃ³n'
df_propiedades = df_propiedades[mask_union]

In [73]:
df_propiedades['ubicacion'].value_counts()

rionegro                  1241
el retiro                  439
la ceja                    342
guarne                     196
marinilla                  117
el carmen de viboral        62
san antonio de pereira      38
llanogrande                 31
santuario                   15
Name: ubicacion, dtype: int64

**Nota2:** Se convierte la variable Estrato en ordinal

In [74]:
df_propiedades['estrato'] = df_propiedades['estrato'].replace('Campestre', '0')
df_propiedades['estrato'] = df_propiedades['estrato'].astype(int)
df_propiedades['estrato'].value_counts()

4    947
5    636
3    584
0    162
6     89
2     58
1      5
Name: estrato, dtype: int64

## **Ajuste del Modelo**

**Funciones**

In [75]:
#Métricas datos transformados
def median_absolute_error2(y_true, y_pred):
  return median_absolute_error(np.expm1(y_true), np.expm1(y_pred))

def mean_squared_error2(y_true, y_pred):
  return mean_squared_error(np.expm1(y_true), np.expm1(y_pred), squared=False)


def mean_absolute_percentage_error2(y_true, y_pred):
  return mean_absolute_percentage_error(np.expm1(y_true), np.expm1(y_pred))
  
def r2_score2(y_true, y_pred):
  return r2_score(np.expm1(y_true), np.expm1(y_pred))

In [76]:
def div_train_test(X, y, var_stratify = ''):
  
  if var_stratify != '': 
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9, stratify = X[var_stratify],  random_state = 42)

    var_stratify_train = x_train[var_stratify]
    #ubicacion_test = X_test['ubicacion']

    x_train.drop(var_stratify,axis=1, inplace=True)
    x_test.drop(var_stratify,axis=1, inplace=True)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)

    return x_train, x_test, y_train, y_test, y_train_t, y_test_t, var_stratify_train

  else:
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9,  random_state = 42)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)
  
  return x_train, x_test, y_train, y_test, y_train_t, y_test_t

def busqueda_hiperparametros(x_train, y_train, var_stratify_train, estimator,param_grid = [] , scoring_med = 'neg_median_absolute_error', transf=False):

  if len(var_stratify_train) != 0:
    
    print('Seccion Stratify')
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42)
    stratified = sss.split(x_train, var_stratify_train)
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, cv = stratified, scoring = scoring_grid , refit = 'metrica',  return_train_score=True, n_jobs=-1, verbose=8)
    return grid.fit(x_train, y_train)
  
  else:
    print('OutStratify')
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring = scoring_grid, refit = 'metrica', return_train_score=True, n_jobs=-1, verbose=8 )
    return grid.fit(X_train, y_train)

def metricas(model,y_train, p_train, y_test, p_test): 

  resultados = {
      'parametros' : [model.get_params()['steps'][1][1]],
      'MeAE_train' : [median_absolute_error(y_train,p_train)],
      'MeAE_test'  : [median_absolute_error(y_test, p_test)],
      'MAPE_train' : [mean_absolute_percentage_error(y_train,p_train)],
      'MAPE_test'  : [mean_absolute_percentage_error(y_test, p_test)],
      'r2_train'   : [r2_score(y_train,p_train)],
      'r2_test'    : [r2_score(y_test, p_test)],
      'rmse_train' : [mean_squared_error(y_train,p_train,squared = False)],
      'rmse_test'  : [mean_squared_error(y_test,p_test,squared = False)]
  }
  return pd.DataFrame(resultados)

**División Covariables  y Variable Objetivo**

In [77]:
data_model=df_propiedades.copy()
# data_model = data_model[(data_model['estrato']==0) | (data_model['estrato']==1) | (data_model['estrato']==2)]
# data_model = data_model[data_model['estrato']==3]
# data_model = data_model[data_model['estrato']==4]
data_model = data_model[(data_model['estrato']==5) | (data_model['estrato']==6)]

print(data_model['estrato'].unique())
#columnas_quitar=['tipo','url','titulo','descripcion','caractint','caractext','caractsec']
columnas_quitar=['url','titulo','descripcion','caractint','caractext','caractsec','otros_datos','estrato']
data_model.drop(columnas_quitar, axis=1, inplace=True)
data_model.shape

[5 6]


(725, 176)

In [78]:
X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

In [79]:
var_stratify_train = pd.Series()
len(var_stratify_train)

  """Entry point for launching an IPython kernel.


0

**OneHotEncoder Variables Categóricas**

In [80]:
enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(enc.fit_transform(X[['tipo','tipo_propiedad','antiguedad','ubicacion']]).toarray(), columns= enc.get_feature_names(['tipo','tipo_propiedad','antiguedad','ubicacion']))
data_aux['fila'] = range(0, X.shape[0])
data_aux.head()

X.drop(['tipo','tipo_propiedad','antiguedad','ubicacion'],axis=1,inplace=True)
X['fila'] = range(0, X.shape[0])

X = pd.merge(data_aux,X, on = 'fila', how='inner')
X.drop(['fila'], axis=1, inplace=True)
X.shape

(725, 190)

**División train y Test**

In [81]:
X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t = div_train_test(X, y, var_stratify = '')

### **Random Forest**

In [None]:
pipe = Pipeline(steps = [('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

####**Sin transformar la Y**

**Búsqueda de Hiperparámetros**

In [None]:
para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
# para_grid = {'rf__n_estimators':[120], 'rf__max_depth':[5,10], 'rf__min_samples_split':[3,5],'rf__min_samples_leaf':[2,5]}
# para_grid = {'rf__max_depth':[3,5,10,12,15], 'rf__min_samples_split':[3,5,10,15,20],'rf__min_samples_leaf':[2,3,5,10]}

modelo_rf = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe, param_grid = para_grid, transf=False)
modelo_rf

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 400 candidates, totalling 2000 fits


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [3, 5, 10, 20, 25],
                         'rf__min_samples_leaf': [1, 2, 5, 8],
                         'rf__min_samples_split': [2, 3, 5, 10, 15],
                         'rf__n_estimators': [80, 100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error),
                      'meae': make_scorer(median_absolute_error),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score),
                      'rmse': make_scorer(mean_squared_error)},
             verbose=8)

In [None]:
metricas_rf = metricas(modelo_rf.best_estimator_,Y_train, modelo_rf.predict(X_train), Y_test, modelo_rf.predict(X_test))
metricas_rf

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=10, max_featu...",50370380.0,83285310.0,0.090549,0.183594,0.948438,0.821335,149114800.0,258662100.0


In [None]:
modelo_rf.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=10, min_samples_leaf=2,
                                       n_estimators=80, random_state=42))])

In [None]:
dd=pd.DataFrame(modelo_rf.cv_results_)
dd[dd['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
180,"{'rf__max_depth': 10, 'rf__min_samples_leaf': ...",-104766600.0,5586634.0,-49315990.0,1398213.0,104766600.0,5586634.0,399,49315990.0,1398213.0,0.791913,0.029403,11,0.949648,0.00309,8.80896e+16,6540152000000000.0,382,2.16807e+16,988665200000000.0,0.18718,0.012643,372,0.090221,0.001387
184,"{'rf__max_depth': 10, 'rf__min_samples_leaf': ...",-104766600.0,5586634.0,-49315990.0,1398213.0,104766600.0,5586634.0,399,49315990.0,1398213.0,0.791913,0.029403,11,0.949648,0.00309,8.80896e+16,6540152000000000.0,382,2.16807e+16,988665200000000.0,0.18718,0.012643,372,0.090221,0.001387


####**Usando Y transformada**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[20,25], 'rf__max_depth':[3,5], 'rf__min_samples_split':[2,3],'rf__min_samples_leaf':[1,2]}
para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
# para_grid = {'rf__n_estimators':[120], 'rf__max_depth':[5,10], 'rf__min_samples_split':[3,5],'rf__min_samples_leaf':[2,5]}
# para_grid = {'rf__max_depth':[3,5,10,12,15], 'rf__min_samples_split':[3,5,10,15,20],'rf__min_samples_leaf':[2,3,5,10]}

modelo_rf_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe, param_grid = para_grid, transf = True)
modelo_rf_t

OutStratify
**para datos transformados**
Fitting 5 folds for each of 400 candidates, totalling 2000 fits


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [3, 5, 10, 20, 25],
                         'rf__min_samples_leaf': [1, 2, 5, 8],
                         'rf__min_samples_split': [2, 3, 5, 10, 15],
                         'rf__n_estimators': [80, 100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error2),
                      'meae': make_scorer(median_absolute_error2),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score2),
                      'rmse': make_scorer(mean_squared_error2)},
             verbose=8)

In [None]:
metricas_rf_t = metricas(modelo_rf_t.best_estimator_,Y_train, np.expm1(modelo_rf_t.predict(X_train)), Y_test, np.expm1(modelo_rf_t.predict(X_test)))
metricas_rf_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=25, max_featu...",39344940.0,64846890.0,0.066362,0.167071,0.963317,0.806674,125773600.0,269065100.0


In [None]:
modelo_rf_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=25, min_samples_split=3,
                                       n_estimators=80, random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_rf_t.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
324,"{'rf__max_depth': 25, 'rf__min_samples_leaf': ...",-0.11973,0.006229,-0.046766,0.000815,107316100.0,7693660.0,344,39863310.0,1316932.0,0.780866,0.035248,36,0.959474,0.002686,304053100.0,16029630.0,372,132055100.0,3291549.0,0.179743,0.014293,380,0.068826,0.000752


### **Gradient Boosting Regression**

####**Sin tranformar la Y**

In [None]:
pipe_gbt = Pipeline(steps = [('scaler', StandardScaler()), ('gbt', GradientBoostingRegressor(random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[20,25], 'gbt__max_depth':[3,5], 'gbt__min_samples_split':[2,3]}
para_grid = {'gbt__n_estimators':[80,100,120,180], 'gbt__max_depth':[3,5,10,15,20], 'gbt__min_samples_split':[5,10,15,17,25], 'gbt__min_samples_leaf':[1,3,5]}
# para_grid = {'gbt__n_estimators':[120], 'gbt__max_depth':[5,10], 'gbt__min_samples_split':[3,5],'gbt__min_samples_leaf':[2,5]}
# para_grid = {'gbt__max_depth':[3,5,10,12,15], 'gbt__min_samples_split':[3,5,10,15,20],'gbt__min_samples_leaf':[2,3,5,10]}

modelo_gbt = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_gbt, param_grid = para_grid,transf=False )

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [None]:
metricas_gbt = metricas(modelo_gbt.best_estimator_,Y_train, modelo_gbt.predict(X_train), Y_test, modelo_gbt.predict(X_test))
metricas_gbt

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,175321.715409,97374880.0,0.000398,0.199556,1.0,0.743856,457531.989451,309709700.0


In [None]:
modelo_gbt.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=15, min_samples_split=10,
                                           n_estimators=120,
                                           random_state=42))])

####**Usando Y transformada**

**Búsqueda de hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[20,25], 'gbt__max_depth':[3,5], 'gbt__min_samples_split':[2,3],'gbt__min_samples_leaf':[1,2]}
para_grid = {'gbt__n_estimators':[80,100,120,180], 'gbt__max_depth':[3,5,10,15,20], 'gbt__min_samples_split':[5,10,15,17,25], 'gbt__min_samples_leaf':[1,3,5]}
# para_grid = {'gbt__n_estimators':[120], 'gbt__max_depth':[5,10], 'gbt__min_samples_split':[3,5],'gbt__min_samples_leaf':[2,5]}
# para_grid = {'gbt__max_depth':[3,5,10,12,15], 'gbt__min_samples_split':[3,5,10,15,20],'gbt__min_samples_leaf':[2,3,5,10]}

modelo_gbt_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_gbt, param_grid = para_grid, transf=True )

OutStratify
**para datos transformados**
Fitting 5 folds for each of 300 candidates, totalling 1500 fits


In [None]:
metricas_gbt_t = metricas(modelo_gbt_t.best_estimator_,Y_train, np.expm1(modelo_gbt_t.predict(X_train)), Y_test, np.expm1(modelo_gbt_t.predict(X_test)))
metricas_gbt_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,1067338.0,71725640.0,0.001894,0.180233,0.999973,0.762898,3419166.0,297975500.0


In [None]:
modelo_gbt_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=20, min_samples_split=17,
                                           random_state=42))])

### **XG Boost**

#### Sin transformar la Y

In [82]:
import xgboost as xgb
pipe_xgb = Pipeline(steps = [
    ('scaler', StandardScaler()), 
    ('xgb',  xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, 
    ))
])

**Búsqueda de Hiperparámetros**

In [83]:
para_grid = {'xgb__n_estimators':[100,120,150], 'xgb__max_depth':[5,6], 'xgb__learning_rate':[0.05,0.1], 'xgb__reg_alpha':[0.01,0.5,0.1]}
modelo_xgb = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_xgb, param_grid = para_grid )

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [84]:
metricas_xgb = metricas(modelo_xgb.best_estimator_,Y_train, modelo_xgb.predict(X_train), Y_test, modelo_xgb.predict(X_test))
metricas_xgb

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(learning_rate=0.05, max_depth=6, ...",40303136.0,92885600.0,0.066315,0.179523,0.981036,0.823013,90431560.0,257444400.0


In [85]:
modelo_xgb.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(learning_rate=0.05, max_depth=6, n_estimators=150,
                              n_jobs=-1, objective='reg:squarederror',
                              reg_alpha=0.01))])

### **Red Neuronal**

####**Sin tranformar la Y**

**Búsqueda de hiperparámetros**

In [None]:
from sklearn.neural_network import MLPRegressor
pipe_rnn = Pipeline(steps = [('scaler', StandardScaler()), ('rnn', MLPRegressor(activation='relu',max_iter=100,solver='lbfgs',random_state=42))])

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': np.logspace(-3, 3, 10),
#     'rnn__learning_rate_init': [0.001, 0.01]}
para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
    'rnn__alpha': [0.001,0.004,0.01,0.1],
    'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
    'rnn__max_iter':[200,500]}

modelo_rnn = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_rnn, param_grid = para_grid,transf=False)

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 160 candidates, totalling 800 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
metricas_rnn = metricas(modelo_rnn.best_estimator_,Y_train, modelo_rnn.predict(X_train), Y_test, modelo_rnn.predict(X_test))
metricas_rnn

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"MLPRegressor(alpha=0.004, hidden_layer_sizes=(...",9710.673277,182236000.0,0.000244,0.443642,0.999996,0.442881,1316209.0,456758600.0


In [None]:
modelo_rnn.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rnn',
                 MLPRegressor(alpha=0.004, hidden_layer_sizes=(150, 75),
                              max_iter=500, random_state=42, solver='lbfgs'))])

In [None]:
dd_t=pd.DataFrame(modelo_rnn.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
65,"{'rnn__alpha': 0.004, 'rnn__hidden_layer_sizes...",-267719200.0,30955900.0,-4126.92474,3005.468145,267719200.0,30955900.0,157,4126.92474,3005.468145,0.146671,0.192026,81,0.999997,2e-06,3.688404e+17,9.941891e+16,77,1346766000000.0,929297700000.0,0.461206,0.070831,77,0.000195,7.5e-05
67,"{'rnn__alpha': 0.004, 'rnn__hidden_layer_sizes...",-267719200.0,30955900.0,-4126.92474,3005.468145,267719200.0,30955900.0,157,4126.92474,3005.468145,0.146671,0.192026,81,0.999997,2e-06,3.688404e+17,9.941891e+16,77,1346766000000.0,929297700000.0,0.461206,0.070831,77,0.000195,7.5e-05
69,"{'rnn__alpha': 0.004, 'rnn__hidden_layer_sizes...",-267719200.0,30955900.0,-4126.92474,3005.468145,267719200.0,30955900.0,157,4126.92474,3005.468145,0.146671,0.192026,81,0.999997,2e-06,3.688404e+17,9.941891e+16,77,1346766000000.0,929297700000.0,0.461206,0.070831,77,0.000195,7.5e-05
71,"{'rnn__alpha': 0.004, 'rnn__hidden_layer_sizes...",-267719200.0,30955900.0,-4126.92474,3005.468145,267719200.0,30955900.0,157,4126.92474,3005.468145,0.146671,0.192026,81,0.999997,2e-06,3.688404e+17,9.941891e+16,77,1346766000000.0,929297700000.0,0.461206,0.070831,77,0.000195,7.5e-05


#### Usando Y transformada

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': np.logspace(-3, 3, 10),
#     'rnn__learning_rate_init': [0.001, 0.01]}
para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
    'rnn__alpha': [0.001,0.004,0.01,0.1],
    'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
    'rnn__max_iter':[200,500]}
modelo_rnn_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_rnn, param_grid = para_grid,transf= True )

OutStratify
**para datos transformados**
Fitting 5 folds for each of 160 candidates, totalling 800 fits


In [None]:
metricas_rnn_t = metricas(modelo_rnn_t.best_estimator_,Y_train, modelo_rnn_t.predict(X_train), Y_test, modelo_rnn_t.predict(X_test))
metricas_rnn_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",845000000.0,780000000.0,1.0,1.0,-2.607469,-2.444343,1247266000.0,1135706000.0


In [None]:
modelo_rnn_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rnn',
                 MLPRegressor(alpha=0.001, hidden_layer_sizes=(180, 90),
                              max_iter=500, random_state=42, solver='lbfgs'))])

In [None]:
dd_t=pd.DataFrame(modelo_rnn_t.cv_results_)
res = dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]
res

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
33,"{'rnn__alpha': 0.001, 'rnn__hidden_layer_sizes...",-0.693142,0.048531,-0.000959,0.0002,478989900.0,38693950.0,149,798559.728607,110786.654239,-535643000.0,1071228000.0,117,0.999957,1.8e-05,7204519000000.0,14269690000000.0,41,4192902.0,857409.954258,951.750456,1884.219394,17,0.002037,0.000397
35,"{'rnn__alpha': 0.001, 'rnn__hidden_layer_sizes...",-0.693142,0.048531,-0.000959,0.0002,478989900.0,38693950.0,149,798559.728607,110786.654239,-535643000.0,1071228000.0,117,0.999957,1.8e-05,7204519000000.0,14269690000000.0,41,4192902.0,857409.954258,951.750456,1884.219394,17,0.002037,0.000397
37,"{'rnn__alpha': 0.001, 'rnn__hidden_layer_sizes...",-0.693142,0.048531,-0.000959,0.0002,478989900.0,38693950.0,149,798559.728607,110786.654239,-535643000.0,1071228000.0,117,0.999957,1.8e-05,7204519000000.0,14269690000000.0,41,4192902.0,857409.954258,951.750456,1884.219394,17,0.002037,0.000397
39,"{'rnn__alpha': 0.001, 'rnn__hidden_layer_sizes...",-0.693142,0.048531,-0.000959,0.0002,478989900.0,38693950.0,149,798559.728607,110786.654239,-535643000.0,1071228000.0,117,0.999957,1.8e-05,7204519000000.0,14269690000000.0,41,4192902.0,857409.954258,951.750456,1884.219394,17,0.002037,0.000397


Referencia: https://www.cienciadedatos.net/documentos/py35-redes-neuronales-python.html

## Resumen

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados


In [None]:
import os
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf = cross_validation_rf[cross_validation_rf['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft = cross_validation_rft[cross_validation_rft['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt = cross_validation_gbt[cross_validation_gbt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt = cross_validation_gbtt[cross_validation_gbtt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
cross_validation_rnn = cross_validation_rnn[cross_validation_rnn['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rnn['escenario']='modelo_rnn'

cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
cross_validation_rnnt = cross_validation_rnnt[cross_validation_rnnt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rnnt['escenario']='modelo_rnnt'

resultado_cross = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_rnn,cross_validation_rnnt],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross['params'])

resultado_cross.drop(['params'],axis=1, inplace=True)
resultado_cross = resultado_cross.drop_duplicates()
resultado_cross.reset_index(inplace=True, drop=False)
resultado_cross

Unnamed: 0,index,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,-104766600.0,5586634.0,-49315990.0,1398213.0,49315990.0,104766600.0,0.090221,0.18718,0.949648,0.7919129,2.16807e+16,8.80896e+16,modelo_rf
1,2,-0.1197304,0.006228798,-0.04676569,0.0008145096,39863310.0,107316100.0,0.068826,0.179743,0.959474,0.7808663,132055100.0,304053100.0,modelo_rft
2,3,-99121810.0,12066300.0,-133321.7,26988.59,133321.7,99121810.0,0.000309,0.190605,1.0,0.7774227,140925100000.0,9.489643e+16,modelo_gbt
3,4,-0.119104,0.009519969,-0.001416225,0.0001564929,1202147.0,114632600.0,0.002062,0.184457,0.99997,0.7650026,3556217.0,315175200.0,modelo_gbtt
4,5,-267719200.0,30955900.0,-4126.925,3005.468,4126.925,267719200.0,0.000195,0.461206,0.999997,0.1466713,1346766000000.0,3.688404e+17,modelo_rnn
5,9,-0.6931419,0.04853103,-0.0009587826,0.0002004012,798559.7,478989900.0,0.002037,951.750456,0.999957,-535643000.0,4192902.0,7204519000000.0,modelo_rnnt


In [None]:
resultado_metricas = pd.concat([metricas_rf, metricas_rf_t, metricas_gbt, metricas_gbt_t, metricas_rnn, metricas_rnn_t],
                            ignore_index = True)
resultado_metricas.drop_duplicates(inplace=True)
resultado_metricas.reset_index(inplace=True, drop=False)
resultado_metricas

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,0,"(DecisionTreeRegressor(max_depth=10, max_featu...",50370380.0,83285310.0,0.090549,0.183594,0.948438,0.821335,149114800.0,258662100.0
1,1,"(DecisionTreeRegressor(max_depth=25, max_featu...",39344940.0,64846890.0,0.066362,0.167071,0.963317,0.806674,125773600.0,269065100.0
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,175321.7,97374880.0,0.000398,0.199556,1.0,0.743856,457532.0,309709700.0
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,1067338.0,71725640.0,0.001894,0.180233,0.999973,0.762898,3419166.0,297975500.0
4,4,"MLPRegressor(alpha=0.004, hidden_layer_sizes=(...",9710.673,182236000.0,0.000244,0.443642,0.999996,0.442881,1316209.0,456758600.0
5,5,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",845000000.0,780000000.0,1.0,1.0,-2.607469,-2.444343,1247266000.0,1135706000.0


In [None]:
resultados =  pd.concat([resultado_metricas, resultado_cross], axis=1)
resultados

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test,index.1,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,"(DecisionTreeRegressor(max_depth=10, max_featu...",50370380.0,83285310.0,0.090549,0.183594,0.948438,0.821335,149114800.0,258662100.0,0,-104766600.0,5586634.0,-49315990.0,1398213.0,49315990.0,104766600.0,0.090221,0.18718,0.949648,0.7919129,2.16807e+16,8.80896e+16,modelo_rf
1,1,"(DecisionTreeRegressor(max_depth=25, max_featu...",39344940.0,64846890.0,0.066362,0.167071,0.963317,0.806674,125773600.0,269065100.0,2,-0.1197304,0.006228798,-0.04676569,0.0008145096,39863310.0,107316100.0,0.068826,0.179743,0.959474,0.7808663,132055100.0,304053100.0,modelo_rft
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,175321.7,97374880.0,0.000398,0.199556,1.0,0.743856,457532.0,309709700.0,3,-99121810.0,12066300.0,-133321.7,26988.59,133321.7,99121810.0,0.000309,0.190605,1.0,0.7774227,140925100000.0,9.489643e+16,modelo_gbt
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,1067338.0,71725640.0,0.001894,0.180233,0.999973,0.762898,3419166.0,297975500.0,4,-0.119104,0.009519969,-0.001416225,0.0001564929,1202147.0,114632600.0,0.002062,0.184457,0.99997,0.7650026,3556217.0,315175200.0,modelo_gbtt
4,4,"MLPRegressor(alpha=0.004, hidden_layer_sizes=(...",9710.673,182236000.0,0.000244,0.443642,0.999996,0.442881,1316209.0,456758600.0,5,-267719200.0,30955900.0,-4126.925,3005.468,4126.925,267719200.0,0.000195,0.461206,0.999997,0.1466713,1346766000000.0,3.688404e+17,modelo_rnn
5,5,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",845000000.0,780000000.0,1.0,1.0,-2.607469,-2.444343,1247266000.0,1135706000.0,9,-0.6931419,0.04853103,-0.0009587826,0.0002004012,798559.7,478989900.0,0.002037,951.750456,0.999957,-535643000.0,4192902.0,7204519000000.0,modelo_rnnt


In [None]:
# resultados.to_csv('resultados_train_test_estrato_0_1_2.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_estrato_3.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_estrato_4.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_estrato_5_6.csv',sep=";",decimal=",")

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
cross_validation_rnn['escenario']='modelo_rnn'

cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
cross_validation_rnnt['escenario']='modelo_rnnt'

resultado_cross_total = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_rnn,cross_validation_rnnt],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross_total['params'])

resultado_cross_total.drop(['params'],axis=1, inplace=True)
resultado_cross_total = resultado_cross_total.drop_duplicates()
resultado_cross_total.reset_index(inplace=True, drop=False)

In [None]:
# resultados.to_csv('resultados_crossvalidation_estrato_0_1_2.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_estrato_3.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_estrato_4.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_estrato_5_6.csv',sep=";",decimal=",")