# **Modelamiento**

## Librerías

In [None]:
!pip install scikit-learn==0.24.2

Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 1.6 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0


In [None]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 24.0 MB/s eta 0:00:01[K     |██▉                             | 20 kB 28.1 MB/s eta 0:00:01[K     |████▏                           | 30 kB 20.7 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 15.4 MB/s eta 0:00:01[K     |███████                         | 51 kB 9.2 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 7.9 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 8.7 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 9.7 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 10.4 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 8.5 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 8.5 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 8.5 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 8.5 MB/s eta 0:00:

In [None]:
#Generales
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Proceso de Modelación
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,r2_score,median_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, ShuffleSplit

## Lectura Información

In [None]:
#Descargar datasets desde github
!git clone https://github.com/andres-soto-h/monografia-udea-eacd.git

Cloning into 'monografia-udea-eacd'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 11 (delta 3), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [None]:
# Read the transformed dataset (semicolon-delimited CSV, decoded as latin1).
df_propiedades=pd.read_csv('/content/monografia-udea-eacd/df_prop_clean_12082021.csv', delimiter=';', encoding='latin1')

In [None]:
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd '/content/drive/My Drive/Colab Notebooks/Seminario/DATASETS'
# # !ls
# df_propiedades=pd.read_csv('df_prop_clean_12082021.csv', delimiter=';', encoding='latin1') 
# print(df_propiedades.shape)

In [None]:
df_propiedades.rename(columns={"municipio":"ubicacion"}, inplace=True)

**NOTA:** se decide eliminar la información del municipio de **La Unión**.

In [None]:
# Keep every row whose location is not "La Unión".
# NOTE(review): the literal below is the mojibake form of 'la unión'; it presumably matches only
# because the CSV is decoded with encoding='latin1' — confirm the file's real encoding.
mask_union = df_propiedades['ubicacion']!='la uniÃ³n'
df_propiedades = df_propiedades[mask_union]

In [None]:
df_propiedades['ubicacion'].value_counts()

rionegro                  1241
el retiro                  439
la ceja                    342
guarne                     196
marinilla                  117
el carmen de viboral        62
san antonio de pereira      38
llanogrande                 31
santuario                   15
Name: ubicacion, dtype: int64

**Nota2:** Se convierte la variable Estrato en ordinal

In [None]:
# Make 'estrato' ordinal: the 'Campestre' label is mapped to '0' and the column is cast to int.
df_propiedades['estrato'] = (
    df_propiedades['estrato']
    .replace('Campestre', '0')
    .astype(int)
)
df_propiedades['estrato'].value_counts()

4    947
5    636
3    584
0    162
6     89
2     58
1      5
Name: estrato, dtype: int64

## **Ajuste del Modelo**

**Funciones**

In [None]:
#Métricas datos transformados
def median_absolute_error2(y_true, y_pred):
  """Median absolute error computed after undoing the log1p transform.

  Both arguments are mapped through ``np.expm1`` before being handed to
  ``median_absolute_error``.
  """
  real = np.expm1(y_true)
  estimado = np.expm1(y_pred)
  return median_absolute_error(real, estimado)

def mean_squared_error2(y_true, y_pred):
  """Root mean squared error computed after undoing the log1p transform.

  Despite the name, ``squared=False`` makes this return the RMSE, not the MSE.
  Both arguments are mapped through ``np.expm1`` first.
  """
  real = np.expm1(y_true)
  estimado = np.expm1(y_pred)
  return mean_squared_error(real, estimado, squared=False)


def mean_absolute_percentage_error2(y_true, y_pred):
  """Mean absolute percentage error computed after undoing the log1p transform.

  Both arguments are mapped through ``np.expm1`` before being handed to
  ``mean_absolute_percentage_error``.
  """
  real = np.expm1(y_true)
  estimado = np.expm1(y_pred)
  return mean_absolute_percentage_error(real, estimado)
  
def r2_score2(y_true, y_pred):
  """R² computed after undoing the log1p transform.

  Both arguments are mapped through ``np.expm1`` before being handed to
  ``r2_score``.
  """
  real = np.expm1(y_true)
  estimado = np.expm1(y_pred)
  return r2_score(real, estimado)

In [None]:
def div_train_test(X, y, var_stratify = ''):
  """Split covariates and target 90/10 and add a log1p copy of the target.

  Args:
    X: covariate DataFrame. When ``var_stratify`` is given, it must contain
      that column.
    y: target values, row-aligned with ``X``.
    var_stratify: name of the column of ``X`` used to stratify the split.
      The empty string (default) disables stratification.

  Returns:
    Without stratification, the 6-tuple
    ``(x_train, x_test, y_train, y_test, y_train_t, y_test_t)`` where the
    ``*_t`` values are ``np.log1p`` of the corresponding target.
    With stratification, the same six values followed by
    ``var_stratify_train`` (the stratification column of the training rows);
    in that case the column is removed from ``x_train`` and ``x_test``.
  """
  estratificar = var_stratify != ''
  estratos = X[var_stratify] if estratificar else None

  # stratify=None is train_test_split's default, so one call serves both modes.
  x_train, x_test, y_train, y_test = train_test_split(
      X, y, train_size=0.9, stratify=estratos, random_state=42)

  # Log-transformed copy of the target.
  y_train_t = np.log1p(y_train)
  y_test_t = np.log1p(y_test)

  if not estratificar:
    return x_train, x_test, y_train, y_test, y_train_t, y_test_t

  var_stratify_train = x_train[var_stratify]

  # Non in-place drop: dropping in place on the frames returned by the split
  # raised pandas' SettingWithCopyWarning.
  x_train = x_train.drop(columns=var_stratify)
  x_test = x_test.drop(columns=var_stratify)

  return x_train, x_test, y_train, y_test, y_train_t, y_test_t, var_stratify_train

def busqueda_hiperparametros(x_train, y_train, var_stratify_train, estimator,param_grid = [] , scoring_med = 'neg_median_absolute_error', transf=False):
  """Run a multi-metric GridSearchCV and return the fitted search object.

  Args:
    x_train: training covariates.
    y_train: training target (log1p-transformed when ``transf`` is True).
    var_stratify_train: labels used to stratify the CV splits. When empty
      (or None), GridSearchCV's default cross-validation is used instead.
    estimator: estimator or pipeline to tune.
    param_grid: parameter grid forwarded to GridSearchCV.
    scoring_med: scorer stored under the key 'metrica'; it drives ``refit``
      and therefore the choice of the best candidate.
    transf: True when ``y_train`` is in log1p space; the auxiliary metrics
      are then computed after ``np.expm1`` via the ``*2`` wrappers.

  Returns:
    The GridSearchCV instance after ``fit``.

  Note:
    The auxiliary error scorers use make_scorer's default
    ``greater_is_better=True``, so their values stay positive in
    ``cv_results_`` but their ``rank_test_*`` columns put the largest error
    first. Model selection is unaffected because it relies on 'metrica'.
  """
  estratificar = var_stratify_train is not None and len(var_stratify_train) != 0
  print('Seccion Stratify' if estratificar else 'OutStratify')

  if transf:
    print('**para datos transformados**')
    scoring_grid = {
        'metrica': scoring_med,
        'meae': make_scorer(median_absolute_error2),
        'r2': make_scorer(r2_score2),
        'rmse': make_scorer(mean_squared_error2),
        'mape': make_scorer(mean_absolute_percentage_error2),
    }
  else:
    print('**para datos sin transformar**')
    scoring_grid = {
        'metrica': scoring_med,
        'meae': make_scorer(median_absolute_error),
        'r2': make_scorer(r2_score),
        # squared=False so the column labelled 'rmse' really is an RMSE
        # (it used to hold the plain MSE for untransformed targets).
        'rmse': make_scorer(mean_squared_error, squared=False),
        'mape': make_scorer(mean_absolute_percentage_error),
    }

  opciones = dict(param_grid=param_grid, scoring=scoring_grid, refit='metrica',
                  return_train_score=True, n_jobs=-1, verbose=8)

  if estratificar:
    # Five stratified 90/10 shuffles driven by the supplied labels.
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42)
    opciones['cv'] = sss.split(x_train, var_stratify_train)

  grid = GridSearchCV(estimator, **opciones)
  # Always fit on the function's own arguments; the non-stratified branch
  # previously read the notebook-level X_train by mistake.
  return grid.fit(x_train, y_train)

def metricas(model,y_train, p_train, y_test, p_test):
  """Build a one-row DataFrame with train/test metrics for a fitted pipeline.

  Args:
    model: object whose ``get_params()['steps']`` lists at least two steps;
      the estimator of the second step is stored in the 'parametros' column.
    y_train, p_train: observed and predicted targets for the training set.
    y_test, p_test: observed and predicted targets for the test set.

  Returns:
    DataFrame with the columns parametros, MeAE_*, MAPE_*, r2_* and rmse_*
    (one ``_train`` and one ``_test`` column per metric).
  """
  particiones = (('train', y_train, p_train), ('test', y_test, p_test))
  medidas = (
      ('MeAE', median_absolute_error),
      ('MAPE', mean_absolute_percentage_error),
      ('r2', r2_score),
      ('rmse', lambda real, pred: mean_squared_error(real, pred, squared = False)),
  )

  fila = {'parametros': [model.get_params()['steps'][1][1]]}
  for etiqueta, funcion in medidas:
    for particion, real, pred in particiones:
      fila[f'{etiqueta}_{particion}'] = [funcion(real, pred)]

  return pd.DataFrame(fila)

**División Covariables  y Variable Objetivo**

In [None]:
# Columns excluded from the modelling table.
# (An earlier variant of this list also removed 'tipo' and kept 'otros_datos'.)
columnas_quitar = [
    'url', 'titulo', 'descripcion',
    'caractint', 'caractext', 'caractsec',
    'otros_datos',
]
data_model = df_propiedades.drop(columns=columnas_quitar)
data_model.shape

(2481, 177)

In [None]:
X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

**OneHotEncoder Variables Categóricas**

In [None]:
# One-hot encode the categorical covariates and put the dummy columns in front of
# the remaining ones.
columnas_categoricas = ['tipo','tipo_propiedad','ubicacion','antiguedad']

enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(
    enc.fit_transform(X[columnas_categoricas]).toarray(),
    columns=enc.get_feature_names(columnas_categoricas),
)

# 'ubicacion' is deliberately kept as a raw column next to its dummies: the split
# call further down stratifies on it. The index is reset so that rows pair up
# positionally with data_aux, which replaces the former merge on a helper 'fila' key.
X_resto = X.drop(['tipo','tipo_propiedad','antiguedad'], axis=1).reset_index(drop=True)

X = pd.concat([data_aux, X_resto], axis=1)
X.shape

(2481, 194)

**División train y Test**

In [None]:
X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t, var_stratify_train = div_train_test(X, y, var_stratify = 'ubicacion')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# hasta aquí ("up to here") — stop marker left by the author. Kept as a comment because
# bare text in a code cell is a SyntaxError and aborts Restart & Run All at this point.

### **Random Forest**

In [None]:
pipe = Pipeline(steps = [('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

#### **Sin transformar la Y**

**Búsqueda de Hiperparámetros**

In [None]:
# Earlier, wider grid kept for reference:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
# Grid actually searched: 3 x 4 x 3 x 3 = 108 candidate combinations.
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

# Random Forest search on the untransformed target (transf=False).
modelo_rf = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe, param_grid = para_grid, transf=False)
modelo_rf

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7f55756b4550>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error),
                      'meae': make_scorer(median_absolute_error),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score),
                      'rmse': make_scorer(mean_squared_error)},
             verbose=8)

In [None]:
# Train and test metrics for the best Random Forest pipeline found by the search
# (target and predictions on the original scale).
metricas_rf = metricas(modelo_rf.best_estimator_,Y_train, modelo_rf.predict(X_train), Y_test, modelo_rf.predict(X_test))
metricas_rf

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",53483620.0,73400180.0,0.163045,0.220709,0.893049,0.771647,184540200.0,304639900.0


In [None]:
modelo_rf.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=4,
                                       min_samples_split=4, n_estimators=150,
                                       random_state=42))])

In [None]:
# Cross-validation summary of the candidate(s) ranked first on the refit metric.
dd = pd.DataFrame(modelo_rf.cv_results_)

columnas_resumen = ['params', 'mean_test_metrica', 'std_test_metrica', 'mean_train_metrica', 'std_train_metrica']
for nombre in ('meae', 'r2', 'rmse', 'mape'):
  columnas_resumen += [
      f'mean_test_{nombre}', f'std_test_{nombre}', f'rank_test_{nombre}',
      f'mean_train_{nombre}', f'std_train_{nombre}',
  ]

dd.loc[dd['rank_test_metrica'] == 1, columnas_resumen]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
101,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 4...",-71239320.0,7119693.0,-53164680.0,712722.460535,71239320.0,7119693.0,106,53164680.0,712722.460535,0.729726,0.034436,24,0.89631,0.001892,9.178554e+16,1.145901e+16,84,3.272395e+16,614409800000000.0,0.230457,0.010604,88,0.159293,0.001785
104,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 4...",-71239320.0,7119693.0,-53164680.0,712722.460535,71239320.0,7119693.0,106,53164680.0,712722.460535,0.729726,0.034436,24,0.89631,0.001892,9.178554e+16,1.145901e+16,84,3.272395e+16,614409800000000.0,0.230457,0.010604,88,0.159293,0.001785
107,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 4...",-71239320.0,7119693.0,-53164680.0,712722.460535,71239320.0,7119693.0,106,53164680.0,712722.460535,0.729726,0.034436,24,0.89631,0.001892,9.178554e+16,1.145901e+16,84,3.272395e+16,614409800000000.0,0.230457,0.010604,88,0.159293,0.001785


#### **Usando Y transformada**

**Búsqueda de Hiperparámetros**

In [None]:
# Earlier, wider grid kept for reference:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
# Grid actually searched: 3 x 4 x 3 x 3 = 108 candidate combinations.
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

# Random Forest search on the log1p-transformed target (transf=True).
modelo_rf_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe, param_grid = para_grid, transf = True)
modelo_rf_t

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7f55755eccd0>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error2),
                      'meae': make_scorer(median_absolute_error2),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score2),
                      'rmse': make_scorer(mean_squared_error2)},
             verbose=8)

In [None]:
# The model was fitted on the log1p target, so its predictions are mapped back with
# np.expm1 and compared against the untransformed Y_train / Y_test.
metricas_rf_t = metricas(modelo_rf_t.best_estimator_,Y_train, np.expm1(modelo_rf_t.predict(X_train)), Y_test, np.expm1(modelo_rf_t.predict(X_test)))
metricas_rf_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",46309850.0,75026190.0,0.135886,0.205783,0.886589,0.743525,190032500.0,322853900.0


In [None]:
modelo_rf_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=2,
                                       min_samples_split=5, n_estimators=120,
                                       random_state=42))])

In [None]:
# Cross-validation summary of the candidate(s) ranked first on the refit metric
# (search run on the log1p target).
dd_t = pd.DataFrame(modelo_rf_t.cv_results_)

columnas_resumen = ['params', 'mean_test_metrica', 'std_test_metrica', 'mean_train_metrica', 'std_train_metrica']
for nombre in ('meae', 'r2', 'rmse', 'mape'):
  columnas_resumen += [
      f'mean_test_{nombre}', f'std_test_{nombre}', f'rank_test_{nombre}',
      f'mean_train_{nombre}', f'std_train_{nombre}',
  ]

dd_t.loc[dd_t['rank_test_metrica'] == 1, columnas_resumen]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
85,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 2...",-0.150624,0.014653,-0.094544,0.001878,72091650.0,10926610.0,92,44533100.0,711963.505776,0.703439,0.032058,13,0.890133,0.002761,317186600.0,20456090.0,95,186194200.0,2566814.0,0.213649,0.010387,100,0.132163,0.001725


### **Gradient Boosting Regression**

#### **Sin transformar la Y**

In [None]:
pipe_gbt = Pipeline(steps = [('scaler', StandardScaler()), ('gbt', GradientBoostingRegressor(random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# Earlier, wider grid kept for reference:
# para_grid = {'gbt__n_estimators':[70,90,100,120,180], 'gbt__max_depth':[3,5,10,12,15,20,25], 'gbt__min_samples_split':[5,10,15,17,25,30,40], 'gbt__min_samples_leaf':[1,3,5]}
# Grid actually searched: 3 x 4 x 3 x 3 = 108 candidate combinations.
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}

# Gradient Boosting search on the untransformed target (transf defaults to False).
modelo_gbt = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_gbt, param_grid = para_grid )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Train and test metrics for the best Gradient Boosting pipeline found by the search
# (target and predictions on the original scale).
metricas_gbt = metricas(modelo_gbt.best_estimator_,Y_train, modelo_gbt.predict(X_train), Y_test, modelo_gbt.predict(X_test))
metricas_gbt

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,7774991.0,74172440.0,0.02737,0.206195,0.998624,0.785648,20929510.0,295153400.0


In [None]:
modelo_gbt.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=9, min_samples_leaf=3,
                                           min_samples_split=4,
                                           n_estimators=150,
                                           random_state=42))])

#### **Usando Y transformada**

**Búsqueda de hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[70,90,100,120,180], 'gbt__max_depth':[3,5,10,12,15,20,25], 'gbt__min_samples_split':[5,10,15,17,25,30,40], 'gbt__min_samples_leaf':[1,3,5]}
# Grid searched: 3 x 4 x 3 x 3 = 108 candidate combinations.
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}
# Gradient Boosting search on the log1p-transformed target (transf=True).
modelo_gbt_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_gbt, param_grid = para_grid, transf=True )

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# The model was fitted on the log1p target, so its predictions are mapped back with
# np.expm1 and compared against the untransformed Y_train / Y_test.
metricas_gbt_t = metricas(modelo_gbt_t.best_estimator_,Y_train, np.expm1(modelo_gbt_t.predict(X_train)), Y_test, np.expm1(modelo_gbt_t.predict(X_test)))
metricas_gbt_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,9519395.0,75271240.0,0.028575,0.189862,0.994003,0.790916,43699920.0,291503600.0


In [None]:
modelo_gbt_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=9, min_samples_leaf=4,
                                           min_samples_split=4,
                                           n_estimators=120,
                                           random_state=42))])

### **XG Boost**

#### Sin transformar la Y

In [None]:
# Pipeline: StandardScaler followed by an XGBoost regressor with squared-error objective.
import xgboost as xgb
pipe_xgb = Pipeline(steps = [
    ('scaler', StandardScaler()), 
    ('xgb',  xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, 
    ))
])

**Búsqueda de Hiperparámetros**

In [None]:
# Grid for the XGBoost search: 3 x 2 x 2 x 3 = 36 candidate combinations.
# NOTE(review): the saved cell output reports 12 candidates, so it was presumably
# produced with a different grid — re-run to refresh it.
para_grid = {'xgb__n_estimators':[100,120,150], 'xgb__max_depth':[5,6], 'xgb__learning_rate':[0.05,0.1], 'xgb__reg_alpha':[0.01,0.5,0.1]}
# Search on the untransformed target (transf defaults to False).
modelo_xgb = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_xgb, param_grid = para_grid )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
# Train and test metrics for the best XGBoost pipeline found by the search
# (target and predictions on the original scale).
metricas_xgb = metricas(modelo_xgb.best_estimator_,Y_train, modelo_xgb.predict(X_train), Y_test, modelo_xgb.predict(X_test))
metricas_xgb

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(max_depth=6, n_jobs=-1, objective...",43208272.0,85208512.0,0.118753,0.21386,0.961429,0.783696,110822400.0,296494100.0


In [None]:
modelo_xgb.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(max_depth=6, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

### **Red Neuronal**

#### **Sin transformar la Y**

In [None]:
# Pipeline: StandardScaler followed by a ReLU multi-layer perceptron regressor.
# max_iter=100 is only a starting value: the grids used below set 'rnn__max_iter' themselves.
from sklearn.neural_network import MLPRegressor
pipe_rnn = Pipeline(steps = [('scaler', StandardScaler()), ('rnn', MLPRegressor(activation='relu',max_iter=100,random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rnn__hidden_layer_sizes':[(45,25),(90,45),(95,50),(110,60)],'rnn__alpha':[0.001,0.01,0.1]}

# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60)],
#     'rnn__alpha': np.logspace(-3, 3, 10),
#     'rnn__learning_rate_init': [0.001, 0.01,0.1,0.2],
#     'rnn__solver':['adam']}

# Grid searched: 5 architectures x 4 alphas x 4 learning rates x 2 iteration caps = 160 candidates.
para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
    'rnn__alpha': [0.001,0.004,0.01,0.1],
    'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
    'rnn__max_iter':[200,500]}

# MLP search on the untransformed target (transf=False).
modelo_rnn = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_rnn, param_grid = para_grid,transf=False )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 160 candidates, totalling 800 fits




In [None]:
# Train and test metrics for the best MLP pipeline found by the search
# (target and predictions on the original scale).
metricas_rnn = metricas(modelo_rnn.best_estimator_,Y_train, modelo_rnn.predict(X_train), Y_test, modelo_rnn.predict(X_test))
metricas_rnn

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",81790920.0,159229000.0,0.240227,0.359388,0.812771,0.604068,244166600.0,401138200.0


In [None]:
modelo_rnn.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rnn',
                 MLPRegressor(alpha=0.001, hidden_layer_sizes=(45, 25),
                              learning_rate_init=0.1, max_iter=500,
                              random_state=42))])

#### **Usando Y transformada**

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': np.logspace(-3, 3, 10),
#     'rnn__learning_rate_init': [0.001, 0.01]}
# Grid searched: 5 architectures x 4 alphas x 4 learning rates x 2 iteration caps = 160 candidates.
para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
    'rnn__alpha': [0.001,0.004,0.01,0.1],
    'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
    'rnn__max_iter':[200,500]}
    
# MLP search on the log1p-transformed target (transf=True).
modelo_rnn_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_rnn, param_grid = para_grid, transf=True )

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 160 candidates, totalling 800 fits


In [None]:
# The model was fitted on the log1p target, so its predictions are mapped back with
# np.expm1 and compared against the untransformed Y_train / Y_test.
metricas_rnn_t = metricas(modelo_rnn_t.best_estimator_,Y_train, np.expm1(modelo_rnn_t.predict(X_train)), Y_test, np.expm1(modelo_rnn_t.predict(X_test)))
metricas_rnn_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"MLPRegressor(alpha=0.1, hidden_layer_sizes=(18...",20089350.0,218686100.0,0.067313,135263.763282,0.966192,-38722130000000.0,103755100.0,3967011000000000.0


In [None]:
modelo_rnn_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rnn',
                 MLPRegressor(alpha=0.1, hidden_layer_sizes=(180, 90),
                              random_state=42))])

Referencia: https://www.cienciadedatos.net/documentos/py35-redes-neuronales-python.html

In [None]:
# Stack the MLP metrics for the untransformed and the log1p target and persist both rows.
# (The export used to write metricas_rnn only, leaving the transformed-target row out.)
resultados_rnn =  pd.concat([metricas_rnn, metricas_rnn_t],ignore_index= True)
resultados_rnn.to_csv('resultados_train_test_general_rnn.csv',sep=";",decimal=",")

Red neuronal con Keras

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

# The original cell referenced `input_dim` and `nb_classes`, neither of which is defined
# in this notebook, and ended in a softmax (classification) head.
# The number of inputs is now taken from the training covariates, and the output is a
# single linear unit because the target of this notebook (price) is continuous.
# NOTE(review): the architecture is an untrained sketch — confirm the intended design
# before compiling and fitting it.
input_dim = X_train.shape[1]

model = Sequential([
    Dense(32, input_shape=(input_dim,)),
    Activation('tanh'),
    Dense(1),
    Activation('linear'),
])

## Resumen

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados


In [None]:
import os
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

In [None]:
# Best cross-validation row(s) of every grid search, stacked into a single table.
# NOTE(review): the XGBoost search (modelo_xgb) is not part of this summary — confirm
# whether that is intentional.
columnas_cv = ['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']

# Scenario label -> fitted search; insertion order fixes the row order of the summary.
busquedas = {
    'modelo_rf': modelo_rf,
    'modelo_rft': modelo_rf_t,
    'modelo_gbt': modelo_gbt,
    'modelo_gbtt': modelo_gbt_t,
    'modelo_rnn': modelo_rnn,
    'modelo_rnnt': modelo_rnn_t,
}

mejores = []
for escenario, busqueda in busquedas.items():
  cv = pd.DataFrame(busqueda.cv_results_)
  # Candidates tied for first place on the refit metric.
  mejor = cv.loc[cv['rank_test_metrica'] == 1, columnas_cv].copy()
  mejor['escenario'] = escenario
  mejores.append(mejor)

resultado_cross = pd.concat(mejores, ignore_index = True)

columna = pd.DataFrame(resultado_cross['params'])

# Once 'params' is gone, candidates tied at rank 1 with identical scores collapse into one
# row; the retained 'index' column records their position before de-duplication.
resultado_cross = (
    resultado_cross
    .drop(['params'], axis=1)
    .drop_duplicates()
    .reset_index(drop=False)
)
resultado_cross

Unnamed: 0,index,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,-71239320.0,7119693.0,-53164680.0,712722.5,53164680.0,71239320.0,0.159293,0.2304569,0.89631,0.7297261,3.272395e+16,9.178554e+16,modelo_rf
1,3,-0.1506244,0.01465277,-0.09454374,0.001877897,44533100.0,72091650.0,0.132163,0.2136494,0.890133,0.7034389,186194200.0,317186600.0,modelo_rft
2,4,-61423280.0,7260291.0,-6570505.0,879440.3,6570505.0,61423280.0,0.024161,0.2148322,0.998964,0.7172251,326365700000000.0,9.634683e+16,modelo_gbt
3,7,-0.12989,0.01458501,-0.01590032,0.001141182,8282135.0,60966630.0,0.025676,0.1989429,0.995258,0.7154888,38667270.0,310406100.0,modelo_gbtt
4,10,-127801200.0,8317124.0,-92152810.0,3327533.0,92152810.0,127801200.0,0.261387,0.3495676,0.790765,0.5990284,6.6047e+16,1.37256e+17,modelo_rnn
5,11,-0.4479736,0.0623044,-0.0866925,0.01315596,42845840.0,195777700.0,0.15525,9538777000000.0,0.241742,-7.872653e+29,413668900.0,2.426977e+23,modelo_rnnt


In [None]:
# Train/test metric rows of the six scenarios, stacked in the same order as the
# cross-validation summary.
resultado_metricas = (
    pd.concat(
        [metricas_rf, metricas_rf_t, metricas_gbt, metricas_gbt_t, metricas_rnn, metricas_rnn_t],
        ignore_index = True,
    )
    .drop_duplicates()
    .reset_index(drop=False)
)
resultado_metricas

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,0,"(DecisionTreeRegressor(max_depth=9, max_featur...",53483620.0,73400180.0,0.163045,0.220709,0.893049,0.7716474,184540200.0,304639900.0
1,1,"(DecisionTreeRegressor(max_depth=9, max_featur...",46309850.0,75026190.0,0.135886,0.205783,0.886589,0.7435254,190032500.0,322853900.0
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,7774991.0,74172440.0,0.02737,0.206195,0.998624,0.7856478,20929510.0,295153400.0
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,9519395.0,75271240.0,0.028575,0.189862,0.994003,0.7909163,43699920.0,291503600.0
4,4,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",81790920.0,159229000.0,0.240227,0.359388,0.812771,0.6040681,244166600.0,401138200.0
5,5,"MLPRegressor(alpha=0.1, hidden_layer_sizes=(18...",20089350.0,218686100.0,0.067313,135263.763282,0.966192,-38722130000000.0,103755100.0,3967011000000000.0


In [None]:
# Column-wise join of the train/test metrics with the cross-validation summary.
# Rows are paired through the index that both frames had reset.
# NOTE(review): this assumes both frames hold exactly one row per scenario, in the same
# order — verify after any change to the searches or to the de-duplication.
resultados =  pd.concat([resultado_metricas, resultado_cross], axis=1)
resultados

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test,index.1,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,"(DecisionTreeRegressor(max_depth=9, max_featur...",53483620.0,73400180.0,0.163045,0.220709,0.893049,0.7716474,184540200.0,304639900.0,0,-71239320.0,7119693.0,-53164680.0,712722.5,53164680.0,71239320.0,0.159293,0.2304569,0.89631,0.7297261,3.272395e+16,9.178554e+16,modelo_rf
1,1,"(DecisionTreeRegressor(max_depth=9, max_featur...",46309850.0,75026190.0,0.135886,0.205783,0.886589,0.7435254,190032500.0,322853900.0,3,-0.1506244,0.01465277,-0.09454374,0.001877897,44533100.0,72091650.0,0.132163,0.2136494,0.890133,0.7034389,186194200.0,317186600.0,modelo_rft
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,7774991.0,74172440.0,0.02737,0.206195,0.998624,0.7856478,20929510.0,295153400.0,4,-61423280.0,7260291.0,-6570505.0,879440.3,6570505.0,61423280.0,0.024161,0.2148322,0.998964,0.7172251,326365700000000.0,9.634683e+16,modelo_gbt
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,9519395.0,75271240.0,0.028575,0.189862,0.994003,0.7909163,43699920.0,291503600.0,7,-0.12989,0.01458501,-0.01590032,0.001141182,8282135.0,60966630.0,0.025676,0.1989429,0.995258,0.7154888,38667270.0,310406100.0,modelo_gbtt
4,4,"MLPRegressor(alpha=0.001, hidden_layer_sizes=(...",81790920.0,159229000.0,0.240227,0.359388,0.812771,0.6040681,244166600.0,401138200.0,10,-127801200.0,8317124.0,-92152810.0,3327533.0,92152810.0,127801200.0,0.261387,0.3495676,0.790765,0.5990284,6.6047e+16,1.37256e+17,modelo_rnn
5,5,"MLPRegressor(alpha=0.1, hidden_layer_sizes=(18...",20089350.0,218686100.0,0.067313,135263.763282,0.966192,-38722130000000.0,103755100.0,3967011000000000.0,11,-0.4479736,0.0623044,-0.0866925,0.01315596,42845840.0,195777700.0,0.15525,9538777000000.0,0.241742,-7.872653e+29,413668900.0,2.426977e+23,modelo_rnnt


In [None]:
resultados.to_csv('resultados_train_test_general.csv',sep=";",decimal=",")

In [None]:
# Full cross-validation results (every candidate) of every grid search, stacked into one table.
# Scenario label -> fitted search; insertion order fixes the row order of the export.
busquedas = {
    'modelo_rf': modelo_rf,
    'modelo_rft': modelo_rf_t,
    'modelo_gbt': modelo_gbt,
    'modelo_gbtt': modelo_gbt_t,
    'modelo_rnn': modelo_rnn,
    'modelo_rnnt': modelo_rnn_t,
}

todas = []
for escenario, busqueda in busquedas.items():
  cv = pd.DataFrame(busqueda.cv_results_)
  cv['escenario'] = escenario
  todas.append(cv)

resultado_cross_total = pd.concat(todas, ignore_index = True)

columna = pd.DataFrame(resultado_cross_total['params'])

# The 'index' column kept by reset_index records each row's position before de-duplication.
resultado_cross_total = (
    resultado_cross_total
    .drop(['params'], axis=1)
    .drop_duplicates()
    .reset_index(drop=False)
)

In [None]:
resultado_cross_total.to_csv('resultados_crossvalidacion_general.csv',sep=";",decimal=",")