# **Modelamiento**

## Librerías

In [86]:
!pip install scikit-learn==0.24.2



In [87]:
!pip install unidecode



In [88]:
#Generales
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Proceso de Modelación
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,r2_score,median_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, ShuffleSplit

## Lectura Información

In [89]:
#Descargar datasets desde github
!git clone https://github.com/andres-soto-h/monografia-udea-eacd.git

fatal: destination path 'monografia-udea-eacd' already exists and is not an empty directory.


In [90]:
#Lectura del dataset transformado
df_propiedades=pd.read_csv('/content/monografia-udea-eacd/df_prop_clean_12082021.csv', delimiter=';', encoding='latin1')

In [91]:
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd '/content/drive/My Drive/Colab Notebooks/Seminario/DATASETS'
# # !ls
# df_propiedades=pd.read_csv('df_prop_clean_12082021.csv', delimiter=';', encoding='latin1') 
# print(df_propiedades.shape)

In [92]:
df_propiedades.rename(columns={"municipio":"ubicacion"}, inplace=True)

**NOTA:** se decide eliminar la información del municipio de **La Unión**.

In [93]:
mask_union = df_propiedades['ubicacion']!='la uniÃ³n'
df_propiedades = df_propiedades[mask_union]

In [94]:
df_propiedades['ubicacion'].value_counts()

rionegro                  1241
el retiro                  439
la ceja                    342
guarne                     196
marinilla                  117
el carmen de viboral        62
san antonio de pereira      38
llanogrande                 31
santuario                   15
Name: ubicacion, dtype: int64

**Nota2:** Se convierte la variable Estrato en ordinal

In [95]:
df_propiedades['estrato'] = df_propiedades['estrato'].replace('Campestre', '0')
df_propiedades['estrato'] = df_propiedades['estrato'].astype(int)
df_propiedades['estrato'].value_counts()

4    947
5    636
3    584
0    162
6     89
2     58
1      5
Name: estrato, dtype: int64

## **Ajuste del Modelo**

**Funciones**

In [96]:
#Métricas datos transformados
def median_absolute_error2(y_true, y_pred):
  return median_absolute_error(np.expm1(y_true), np.expm1(y_pred))

def mean_squared_error2(y_true, y_pred):
  return mean_squared_error(np.expm1(y_true), np.expm1(y_pred), squared=False)

def mean_absolute_percentage_error2(y_true, y_pred):
  return mean_absolute_percentage_error(np.expm1(y_true), np.expm1(y_pred))
  
def r2_score2(y_true, y_pred):
  return r2_score(np.expm1(y_true), np.expm1(y_pred))

In [97]:
def div_train_test(X, y, var_stratify = ''):
  
  if var_stratify != '': 
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9, stratify = X[var_stratify],  random_state = 42)

    var_stratify_train = x_train[var_stratify]
    #ubicacion_test = X_test['ubicacion']

    x_train.drop(var_stratify,axis=1, inplace=True)
    x_test.drop(var_stratify,axis=1, inplace=True)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)

    return x_train, x_test, y_train, y_test, y_train_t, y_test_t, var_stratify_train

  else:
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9,  random_state = 42)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)
  
  return x_train, x_test, y_train, y_test, y_train_t, y_test_t

def busqueda_hiperparametros(x_train, y_train, var_stratify_train, estimator,param_grid = [] , scoring_med = 'neg_median_absolute_error', transf=False):

  if len(var_stratify_train) != 0:
    
    print('Seccion Stratify')
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42)
    stratified = sss.split(x_train, var_stratify_train)
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, cv = stratified, scoring = scoring_grid , refit = 'metrica',  return_train_score=True, n_jobs=-1, verbose=8)
    return grid.fit(x_train, y_train)
  
  else:
    print('OutStratify')
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring = scoring_grid, refit = 'metrica', return_train_score=True, n_jobs=-1, verbose=8 )
    return grid.fit(X_train, y_train)

def metricas(model,y_train, p_train, y_test, p_test): 

  resultados = {
      'parametros' : [model.get_params()['steps'][1][1]],
      'MeAE_train' : [median_absolute_error(y_train,p_train)],
      'MeAE_test'  : [median_absolute_error(y_test, p_test)],
      'MAPE_train' : [mean_absolute_percentage_error(y_train,p_train)],
      'MAPE_test'  : [mean_absolute_percentage_error(y_test, p_test)],
      'r2_train'   : [r2_score(y_train,p_train)],
      'r2_test'    : [r2_score(y_test, p_test)],
      'rmse_train' : [mean_squared_error(y_train,p_train,squared = False)],
      'rmse_test'  : [mean_squared_error(y_test,p_test,squared = False)]
  }
  return pd.DataFrame(resultados)

**División Covariables  y Variable Objetivo**

In [98]:
data_model=df_propiedades.copy()
# data_model = data_model[(data_model['ubicacion']=='el carmen de viboral') | (data_model['ubicacion']=='guarne')]
# data_model = data_model[(data_model['ubicacion']=='el retiro') | (data_model['ubicacion']=='llanogrande')]
# data_model = data_model[(data_model['ubicacion']=='la ceja') | (data_model['ubicacion']=='san antonio de pereira')]
# data_model = data_model[(data_model['ubicacion']=='marinilla') | (data_model['ubicacion']=='santuario')]
data_model = data_model[data_model['ubicacion']=='rionegro']

print(data_model['ubicacion'].unique())
#columnas_quitar=['tipo','url','titulo','descripcion','caractint','caractext','caractsec']
columnas_quitar=['url','titulo','descripcion','caractint','caractext','caractsec','otros_datos','ubicacion']
data_model.drop(columnas_quitar, axis=1, inplace=True)
data_model.shape

['rionegro']


(1241, 176)

In [99]:
X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

In [100]:
var_stratify_train = pd.Series()
len(var_stratify_train)

  """Entry point for launching an IPython kernel.


0

**OneHotEncoder Variables Categóricas**

In [101]:
enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(enc.fit_transform(X[['tipo','tipo_propiedad','antiguedad']]).toarray(), columns= enc.get_feature_names(['tipo','tipo_propiedad','antiguedad']))
data_aux['fila'] = range(0, X.shape[0])
data_aux.head()

X.drop(['tipo','tipo_propiedad','antiguedad'],axis=1,inplace=True)
X['fila'] = range(0, X.shape[0])

X = pd.merge(data_aux,X, on = 'fila', how='inner')
X.drop(['fila'], axis=1, inplace=True)
X.shape

(1241, 184)

**División train y Test**

In [102]:
X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t = div_train_test(X, y, var_stratify = '')

### **Random Forest**

In [None]:
pipe = Pipeline(steps = [('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

####**Sin transformar la Y**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
# para_grid = {'rf__n_estimators':[120], 'rf__max_depth':[5,10], 'rf__min_samples_split':[3,5],'rf__min_samples_leaf':[2,5]}
# para_grid = {'rf__max_depth':[3,5,10,12,15], 'rf__min_samples_split':[3,5,10,15,20],'rf__min_samples_leaf':[2,3,5,10]}
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

modelo_rf = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe, param_grid = para_grid, transf=False)
modelo_rf

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error),
                      'meae': make_scorer(median_absolute_error),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score),
                      'rmse': make_scorer(mean_squared_error)},
             verbose=8)

In [None]:
metricas_rf = metricas(modelo_rf.best_estimator_,Y_train, modelo_rf.predict(X_train), Y_test, modelo_rf.predict(X_test))
metricas_rf

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",37273960.0,64491380.0,0.115688,0.188712,0.949778,0.76875,121746600.0,302811700.0


In [None]:
modelo_rf.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=2,
                                       min_samples_split=4, random_state=42))])

In [None]:
modelo_rf.best_estimator_.get_params()

{'memory': None,
 'rf': RandomForestRegressor(max_depth=9, min_samples_leaf=2, min_samples_split=4,
                       random_state=42),
 'rf__bootstrap': True,
 'rf__ccp_alpha': 0.0,
 'rf__criterion': 'mse',
 'rf__max_depth': 9,
 'rf__max_features': 'auto',
 'rf__max_leaf_nodes': None,
 'rf__max_samples': None,
 'rf__min_impurity_decrease': 0.0,
 'rf__min_impurity_split': None,
 'rf__min_samples_leaf': 2,
 'rf__min_samples_split': 4,
 'rf__min_weight_fraction_leaf': 0.0,
 'rf__n_estimators': 100,
 'rf__n_jobs': None,
 'rf__oob_score': False,
 'rf__random_state': 42,
 'rf__verbose': 0,
 'rf__warm_start': False,
 'scaler': StandardScaler(),
 'scaler__copy': True,
 'scaler__with_mean': True,
 'scaler__with_std': True,
 'steps': [('scaler', StandardScaler()),
  ('rf',
   RandomForestRegressor(max_depth=9, min_samples_leaf=2, min_samples_split=4,
                         random_state=42))],
 'verbose': False}

In [None]:
dd=pd.DataFrame(modelo_rf.cv_results_)
dd[dd['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
81,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 2...",-56037900.0,5215776.0,-36108370.0,415907.256904,56037900.0,5215776.0,108,36108370.0,415907.256904,0.801317,0.02938,18,0.950647,0.002795,5.818671e+16,1.016968e+16,90,1.455257e+16,795443900000000.0,0.196498,0.00817,104,0.112048,0.00157


####**Usando Y transformada**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[20,25], 'rf__max_depth':[3,5], 'rf__min_samples_split':[2,3],'rf__min_samples_leaf':[1,2]}
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
# para_grid = {'rf__n_estimators':[120], 'rf__max_depth':[5,10], 'rf__min_samples_split':[3,5],'rf__min_samples_leaf':[2,5]}
# para_grid = {'rf__max_depth':[3,5,10,12,15], 'rf__min_samples_split':[3,5,10,15,20],'rf__min_samples_leaf':[2,3,5,10]}
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

modelo_rf_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe, param_grid = para_grid, transf = True)
modelo_rf_t

OutStratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error2),
                      'meae': make_scorer(median_absolute_error2),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score2),
                      'rmse': make_scorer(mean_squared_error2)},
             verbose=8)

In [None]:
metricas_rf_t = metricas(modelo_rf_t.best_estimator_,Y_train, np.expm1(modelo_rf_t.predict(X_train)), Y_test, np.expm1(modelo_rf_t.predict(X_test)))
metricas_rf_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=8, max_featur...",35718710.0,56648740.0,0.113645,0.200789,0.923797,0.726303,149966600.0,329432500.0


In [None]:
modelo_rf_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=8, min_samples_leaf=2,
                                       min_samples_split=4, n_estimators=150,
                                       random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_rf_t.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
56,"{'rf__max_depth': 8, 'rf__min_samples_leaf': 2...",-0.123757,0.009043,-0.078353,0.002024,52458110.0,1660973.0,96,34391300.0,870260.1398,0.794527,0.029653,13,0.926654,0.00323,244483700.0,22858890.0,97,147032300.0,3311338.0,0.182837,0.005104,96,0.109212,0.001781


### **Gradient Boosting Regression**

####**Sin tranformar la Y**

In [None]:
pipe_gbt = Pipeline(steps = [('scaler', StandardScaler()), ('gbt', GradientBoostingRegressor(random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[20,25], 'gbt__max_depth':[3,5], 'gbt__min_samples_split':[2,3]}
# para_grid = {'gbt__n_estimators':[80,100,120,180], 'gbt__max_depth':[3,5,10,15,20], 'gbt__min_samples_split':[5,10,15,17,25], 'gbt__min_samples_leaf':[1,3,5]}
# para_grid = {'gbt__n_estimators':[120], 'gbt__max_depth':[5,10], 'gbt__min_samples_split':[3,5],'gbt__min_samples_leaf':[2,5]}
# para_grid = {'gbt__max_depth':[3,5,10,12,15], 'gbt__min_samples_split':[3,5,10,15,20],'gbt__min_samples_leaf':[2,3,5,10]}
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}

modelo_gbt = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_gbt, param_grid = para_grid,transf=False )

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
metricas_gbt = metricas(modelo_gbt.best_estimator_,Y_train, modelo_gbt.predict(X_train), Y_test, modelo_gbt.predict(X_test))
metricas_gbt

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,15771070.0,63093180.0,0.049952,0.195774,0.996087,0.776236,33981320.0,297869600.0


In [None]:
modelo_gbt.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=6, min_samples_leaf=2,
                                           min_samples_split=6,
                                           n_estimators=150,
                                           random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_gbt.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
8,"{'gbt__max_depth': 6, 'gbt__min_samples_leaf':...",-54618470.0,2781603.0,-11126080.0,672391.951576,54618470.0,2781603.0,108,11126080.0,672391.951576,0.799192,0.022832,22,0.997969,0.000196,5.94565e+16,1.278956e+16,87,599529000000000.0,65337460000000.0,0.188945,0.009566,108,0.037937,0.001686


####**Usando Y transformada**

**Búsqueda de hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[20,25], 'gbt__max_depth':[3,5], 'gbt__min_samples_split':[2,3],'gbt__min_samples_leaf':[1,2]}
# para_grid = {'gbt__n_estimators':[80,100,120,180], 'gbt__max_depth':[3,5,10,15,20], 'gbt__min_samples_split':[5,10,15,17,25], 'gbt__min_samples_leaf':[1,3,5]}
# para_grid = {'gbt__n_estimators':[120], 'gbt__max_depth':[5,10], 'gbt__min_samples_split':[3,5],'gbt__min_samples_leaf':[2,5]}
# para_grid = {'gbt__max_depth':[3,5,10,12,15], 'gbt__min_samples_split':[3,5,10,15,20],'gbt__min_samples_leaf':[2,3,5,10]}
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}

modelo_gbt_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_gbt, param_grid = para_grid, transf=True )

OutStratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
metricas_gbt_t = metricas(modelo_gbt_t.best_estimator_,Y_train, np.expm1(modelo_gbt_t.predict(X_train)), Y_test, np.expm1(modelo_gbt_t.predict(X_test)))
metricas_gbt_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,9449993.0,56868070.0,0.030508,0.190507,0.995968,0.749338,34497720.0,315264900.0


In [None]:
modelo_gbt_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=7, min_samples_leaf=2,
                                           min_samples_split=6,
                                           random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_gbt_t.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
33,"{'gbt__max_depth': 7, 'gbt__min_samples_leaf':...",-0.115714,0.005311,-0.014719,0.000797,53797400.0,3530814.0,68,7266495.0,473654.08468,0.796151,0.035555,47,0.997438,0.000231,243650700.0,29564520.0,65,27458300.0,1195071.0,0.177473,0.006786,70,0.023521,0.001019


### **XG Boost**

#### Sin transformar la Y

In [103]:
import xgboost as xgb
pipe_xgb = Pipeline(steps = [
    ('scaler', StandardScaler()), 
    ('xgb',  xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, 
    ))
])

**Búsqueda de Hiperparámetros**

In [104]:
para_grid = {'xgb__n_estimators':[100,120,150], 'xgb__max_depth':[5,6], 'xgb__learning_rate':[0.05,0.1], 'xgb__reg_alpha':[0.01,0.5,0.1]}
modelo_xgb = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_xgb, param_grid = para_grid )

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [105]:
metricas_xgb = metricas(modelo_xgb.best_estimator_,Y_train, modelo_xgb.predict(X_train), Y_test, modelo_xgb.predict(X_test))
metricas_xgb

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(learning_rate=0.05, max_depth=6, ...",24951536.0,60452320.0,0.076532,0.203144,0.9883,0.742149,58762370.0,319754300.0


In [106]:
modelo_xgb.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(learning_rate=0.05, max_depth=6, n_estimators=150,
                              n_jobs=-1, objective='reg:squarederror',
                              reg_alpha=0.01))])

### **Red Neuronal**

####**Sin tranformar la Y**

In [None]:
from sklearn.neural_network import MLPRegressor
pipe_rnn = Pipeline(steps = [('scaler', StandardScaler()), ('rnn', MLPRegressor(activation='relu',random_state=42))])

In [None]:
para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
    'rnn__alpha': [0.001,0.004,0.01,0.1],
    'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
    'rnn__max_iter':[200,500]}
#'rnn__alpha': np.logspace(-3, 3, 10),
modelo_rnn = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_rnn, param_grid = para_grid,transf=False)

OutStratify
**para datos sin transformar**
Fitting 5 folds for each of 160 candidates, totalling 800 fits




In [None]:
metricas_rnn = metricas(modelo_rnn.best_estimator_,Y_train, modelo_rnn.predict(X_train), Y_test, modelo_rnn.predict(X_test))
metricas_rnn

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",78362100.0,130687800.0,0.244411,0.344494,0.824061,0.661662,227871500.0,366274900.0


In [None]:
modelo_rnn.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rnn',
                 MLPRegressor(alpha=0.001, hidden_layer_sizes=(180, 90),
                              learning_rate_init=0.1, random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_rnn.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
38,"{'rnn__alpha': 0.001, 'rnn__hidden_layer_sizes...",-123142400.0,11388260.0,-79703730.0,3408058.0,123142400.0,11388260.0,160,79703730.0,3408058.0,-1019.098866,2039.409175,141,0.831119,0.004798,2.459503e+20,4.9166e+20,20,4.980722e+16,1660052000000000.0,0.643999,0.547811,136,0.244583,0.005144


#### Usando Y transformada

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': np.logspace(-3, 3, 10),
#     'rnn__learning_rate_init': [0.001, 0.01]}
para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
    'rnn__alpha': [0.001,0.004,0.01,0.1],
    'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
    'rnn__max_iter':[200,500]}
modelo_rnn_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_rnn, param_grid = para_grid,transf= True )

OutStratify
**para datos transformados**
Fitting 5 folds for each of 160 candidates, totalling 800 fits


         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan -0.46828212 -0.46828212
         nan         nan         nan         nan         nan         nan
 -0.63578904 -0.63578904         nan         nan         nan         nan
         nan         nan -0.46972022 -0.46972022         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan         nan         nan         nan         nan
         nan         nan -0.46734087 -0.46734087         nan         nan
         nan         nan         nan         nan -0.46479139 -0.46479139
         nan         nan         nan         nan         nan         nan
 -0.46872297 -0.46872297         nan         nan         nan         nan
         nan         nan         nan         nan   

In [None]:
metricas_rnn_t = metricas(modelo_rnn_t.best_estimator_,Y_train, modelo_rnn_t.predict(X_train), Y_test, modelo_rnn_t.predict(X_test))
metricas_rnn_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"MLPRegressor(alpha=0.01, hidden_layer_sizes=(1...",440000000.0,490000000.0,1.0,1.0,-1.45062,-1.412764,850446300.0,978112800.0


In [None]:
modelo_rnn_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rnn',
                 MLPRegressor(alpha=0.01, hidden_layer_sizes=(150, 75),
                              learning_rate_init=0.1, max_iter=500,
                              random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_rnn_t.cv_results_)
res = dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]
res

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
111,"{'rnn__alpha': 0.01, 'rnn__hidden_layer_sizes'...",-0.458359,0.02046,-0.458548,0.009557,207171000.0,12019770.0,19,207348900.0,8010809.0,-0.091886,0.028942,17,-0.08999,0.012419,565641200.0,44555680.0,6,567050100.0,12091230.0,0.585677,0.010852,19,0.585064,0.02202


Referencia: https://www.cienciadedatos.net/documentos/py35-redes-neuronales-python.html

## Resumen

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados


In [None]:
import os
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

In [None]:
# resultados_rnn =  pd.concat([metricas_rnn, metricas_rnn_t],ignore_index= True)
# metricas_rnn.to_csv('resultados_train_test_rionegro_rnn.csv',sep=";",decimal=",")

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf = cross_validation_rf[cross_validation_rf['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft = cross_validation_rft[cross_validation_rft['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt = cross_validation_gbt[cross_validation_gbt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt = cross_validation_gbtt[cross_validation_gbtt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
cross_validation_rnn = cross_validation_rnn[cross_validation_rnn['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rnn['escenario']='modelo_rnn'

cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
cross_validation_rnnt = cross_validation_rnnt[cross_validation_rnnt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rnnt['escenario']='modelo_rnnt'

resultado_cross = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_rnn,cross_validation_rnnt],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross['params'])

resultado_cross.drop(['params'],axis=1, inplace=True)
resultado_cross = resultado_cross.drop_duplicates()
resultado_cross.reset_index(inplace=True, drop=False)
resultado_cross

Unnamed: 0,index,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,-56037900.0,5215776.0,-36108370.0,415907.3,36108370.0,56037900.0,0.112048,0.196498,0.950647,0.801317,1.455257e+16,5.818671e+16,modelo_rf
1,1,-0.123757,0.009042972,-0.07835286,0.002024113,34391300.0,52458110.0,0.109212,0.182837,0.926654,0.794527,147032300.0,244483700.0,modelo_rft
2,2,-54618470.0,2781603.0,-11126080.0,672392.0,11126080.0,54618470.0,0.037937,0.188945,0.997969,0.799192,599529000000000.0,5.94565e+16,modelo_gbt
3,3,-0.1157135,0.005311469,-0.01471878,0.0007971955,7266495.0,53797400.0,0.023521,0.177473,0.997438,0.796151,27458300.0,243650700.0,modelo_gbtt
4,4,-123142400.0,11388260.0,-79703730.0,3408058.0,79703730.0,123142400.0,0.244583,0.643999,0.831119,-1019.098866,4.980722e+16,2.459503e+20,modelo_rnn
5,5,-0.4583593,0.02045977,-0.4585478,0.00955665,207348900.0,207171000.0,0.585064,0.585677,-0.08999,-0.091886,567050100.0,565641200.0,modelo_rnnt


In [None]:
resultado_metricas = pd.concat([metricas_rf, metricas_rf_t, metricas_gbt, metricas_gbt_t, metricas_rnn, metricas_rnn_t],
                            ignore_index = True)
resultado_metricas.drop_duplicates(inplace=True)
resultado_metricas.reset_index(inplace=True, drop=False)
resultado_metricas

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,0,"(DecisionTreeRegressor(max_depth=9, max_featur...",37273960.0,64491380.0,0.115688,0.188712,0.949778,0.76875,121746600.0,302811700.0
1,1,"(DecisionTreeRegressor(max_depth=8, max_featur...",35718710.0,56648740.0,0.113645,0.200789,0.923797,0.726303,149966600.0,329432500.0
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,15771070.0,63093180.0,0.049952,0.195774,0.996087,0.776236,33981320.0,297869600.0
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,9449993.0,56868070.0,0.030508,0.190507,0.995968,0.749338,34497720.0,315264900.0
4,4,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",78362100.0,130687800.0,0.244411,0.344494,0.824061,0.661662,227871500.0,366274900.0
5,5,"MLPRegressor(alpha=0.01, hidden_layer_sizes=(1...",440000000.0,490000000.0,1.0,1.0,-1.45062,-1.412764,850446300.0,978112800.0


In [None]:
resultados =  pd.concat([resultado_metricas, resultado_cross], axis=1)
resultados

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test,index.1,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,"(DecisionTreeRegressor(max_depth=9, max_featur...",37273960.0,64491380.0,0.115688,0.188712,0.949778,0.76875,121746600.0,302811700.0,0,-56037900.0,5215776.0,-36108370.0,415907.3,36108370.0,56037900.0,0.112048,0.196498,0.950647,0.801317,1.455257e+16,5.818671e+16,modelo_rf
1,1,"(DecisionTreeRegressor(max_depth=8, max_featur...",35718710.0,56648740.0,0.113645,0.200789,0.923797,0.726303,149966600.0,329432500.0,1,-0.123757,0.009042972,-0.07835286,0.002024113,34391300.0,52458110.0,0.109212,0.182837,0.926654,0.794527,147032300.0,244483700.0,modelo_rft
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,15771070.0,63093180.0,0.049952,0.195774,0.996087,0.776236,33981320.0,297869600.0,2,-54618470.0,2781603.0,-11126080.0,672392.0,11126080.0,54618470.0,0.037937,0.188945,0.997969,0.799192,599529000000000.0,5.94565e+16,modelo_gbt
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,9449993.0,56868070.0,0.030508,0.190507,0.995968,0.749338,34497720.0,315264900.0,3,-0.1157135,0.005311469,-0.01471878,0.0007971955,7266495.0,53797400.0,0.023521,0.177473,0.997438,0.796151,27458300.0,243650700.0,modelo_gbtt
4,4,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",78362100.0,130687800.0,0.244411,0.344494,0.824061,0.661662,227871500.0,366274900.0,4,-123142400.0,11388260.0,-79703730.0,3408058.0,79703730.0,123142400.0,0.244583,0.643999,0.831119,-1019.098866,4.980722e+16,2.459503e+20,modelo_rnn
5,5,"MLPRegressor(alpha=0.01, hidden_layer_sizes=(1...",440000000.0,490000000.0,1.0,1.0,-1.45062,-1.412764,850446300.0,978112800.0,5,-0.4583593,0.02045977,-0.4585478,0.00955665,207348900.0,207171000.0,0.585064,0.585677,-0.08999,-0.091886,567050100.0,565641200.0,modelo_rnnt


In [None]:
# resultados.to_csv('resultados_train_test_carmen_guarne.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_retiro_llanogrande.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_ceja_sanantonio.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_marinilla_santuario.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_train_test_rionegro.csv',sep=";",decimal=",")

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
cross_validation_rnn['escenario']='modelo_rnn'

cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
cross_validation_rnnt['escenario']='modelo_rnnt'

resultado_cross_total = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_rnn,cross_validation_rnnt],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross_total['params'])

resultado_cross_total.drop(['params'],axis=1, inplace=True)
resultado_cross_total = resultado_cross_total.drop_duplicates()
resultado_cross_total.reset_index(inplace=True, drop=False)

In [None]:
# resultado_cross_total.to_csv('resultados_crossvalidation_carmen_guarne.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_retiro_llanogrande.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_ceja_sanantonio.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_marinilla_santuario.csv',sep=";",decimal=",")
# resultados.to_csv('resultados_crossvalidation_rionegro.csv',sep=";",decimal=",")