# **Modelamiento**

## Librerías

In [1]:
!pip install scikit-learn==0.24.2

Collecting scikit-learn==0.24.2
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 58.5 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.1
    Uninstalling scikit-learn-1.0.1:
      Successfully uninstalled scikit-learn-1.0.1
Successfully installed scikit-learn-0.24.2


In [2]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[?25l[K     |█▍                              | 10 kB 22.2 MB/s eta 0:00:01[K     |██▉                             | 20 kB 28.1 MB/s eta 0:00:01[K     |████▏                           | 30 kB 31.2 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 33.8 MB/s eta 0:00:01[K     |███████                         | 51 kB 35.0 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 37.5 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 28.2 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 27.9 MB/s eta 0:00:01[K     |████████████▌                   | 92 kB 28.3 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 30.0 MB/s eta 0:00:01[K     |███████████████▎                | 112 kB 30.0 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 30.0 MB/s eta 0:00:01[K     |██████████████████              | 133 kB 30.0 MB/s e

In [3]:
#Generales
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Proceso de Modelación
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_percentage_error,r2_score,median_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit, ShuffleSplit

## Lectura Información

In [4]:
#Descargar datasets desde github
!git clone https://github.com/andres-soto-h/monografia-udea-eacd.git

Cloning into 'monografia-udea-eacd'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 11 (delta 3), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (11/11), done.


In [5]:
#Lectura del dataset transformado
df_propiedades=pd.read_csv('/content/monografia-udea-eacd/df_prop_clean_12082021.csv', delimiter=';', encoding='latin1')

In [6]:
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd '/content/drive/My Drive/Colab Notebooks/Seminario/DATASETS'
# # !ls
# df_propiedades=pd.read_csv('df_prop_clean_12082021.csv', delimiter=';', encoding='latin1') 
# print(df_propiedades.shape)

In [7]:
df_propiedades.rename(columns={"municipio":"ubicacion"}, inplace=True)

**NOTA:** se decide eliminar la información del municipio de **La Unión**.

In [8]:
mask_union = df_propiedades['ubicacion']!='la uniÃ³n'
df_propiedades = df_propiedades[mask_union]

In [9]:
df_propiedades['ubicacion'].value_counts()

rionegro                  1241
el retiro                  439
la ceja                    342
guarne                     196
marinilla                  117
el carmen de viboral        62
san antonio de pereira      38
llanogrande                 31
santuario                   15
Name: ubicacion, dtype: int64

**Nota2:** Se convierte la variable Estrato en ordinal

In [10]:
df_propiedades['estrato'] = df_propiedades['estrato'].replace('Campestre', '0')
df_propiedades['estrato'] = df_propiedades['estrato'].astype(int)
df_propiedades['estrato'].value_counts()

4    947
5    636
3    584
0    162
6     89
2     58
1      5
Name: estrato, dtype: int64

## **Ajuste del Modelo**

**Funciones**

In [11]:
#Métricas datos transformados
def median_absolute_error2(y_true, y_pred):
  return median_absolute_error(np.expm1(y_true), np.expm1(y_pred))

def mean_squared_error2(y_true, y_pred):
  return mean_squared_error(np.expm1(y_true), np.expm1(y_pred), squared=False)


def mean_absolute_percentage_error2(y_true, y_pred):
  return mean_absolute_percentage_error(np.expm1(y_true), np.expm1(y_pred))
  
def r2_score2(y_true, y_pred):
  return r2_score(np.expm1(y_true), np.expm1(y_pred))

In [12]:
def div_train_test(X, y, var_stratify = ''):
  
  if var_stratify != '': 
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9, stratify = X[var_stratify],  random_state = 42)

    var_stratify_train = x_train[var_stratify]
    #ubicacion_test = X_test['ubicacion']

    x_train.drop(var_stratify,axis=1, inplace=True)
    x_test.drop(var_stratify,axis=1, inplace=True)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)

    return x_train, x_test, y_train, y_test, y_train_t, y_test_t, var_stratify_train

  else:
    x_train, x_test, y_train, y_test = train_test_split(X,y, train_size=0.9,  random_state = 42)

    #Variable Objetivo Transformada
    y_train_t = np.log1p(y_train)
    y_test_t = np.log1p(y_test)
  
  return x_train, x_test, y_train, y_test, y_train_t, y_test_t

def busqueda_hiperparametros(x_train, y_train, var_stratify_train, estimator,param_grid = [] , scoring_med = 'neg_median_absolute_error', transf=False):

  if len(var_stratify_train) != 0:
    
    print('Seccion Stratify')
    sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=42)
    stratified = sss.split(x_train, var_stratify_train)
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, cv = stratified, scoring = scoring_grid , refit = 'metrica',  return_train_score=True, n_jobs=-1, verbose=8)
    return grid.fit(x_train, y_train)
  
  else:
    print('OutStratify')
    
    if transf: 
      print('**para datos transformados**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error2), 'r2':make_scorer(r2_score2) , 'rmse':make_scorer(mean_squared_error2), 'mape':make_scorer(mean_absolute_percentage_error2)}
    else:
      print('**para datos sin transformar**')
      scoring_grid={'metrica':scoring_med,'meae':make_scorer(median_absolute_error), 'r2':make_scorer(r2_score) , 'rmse':make_scorer(mean_squared_error), 'mape':make_scorer(mean_absolute_percentage_error)}
    
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring = scoring_grid, refit = 'metrica', return_train_score=True, n_jobs=-1, verbose=8 )
    return grid.fit(X_train, y_train)

def metricas(model,y_train, p_train, y_test, p_test): 

  resultados = {
      'parametros' : [model.get_params()['steps'][1][1]],
      'MeAE_train' : [median_absolute_error(y_train,p_train)],
      'MeAE_test'  : [median_absolute_error(y_test, p_test)],
      'MAPE_train' : [mean_absolute_percentage_error(y_train,p_train)],
      'MAPE_test'  : [mean_absolute_percentage_error(y_test, p_test)],
      'r2_train'   : [r2_score(y_train,p_train)],
      'r2_test'    : [r2_score(y_test, p_test)],
      'rmse_train' : [mean_squared_error(y_train,p_train,squared = False)],
      'rmse_test'  : [mean_squared_error(y_test,p_test,squared = False)]
  }
  return pd.DataFrame(resultados)

**División Covariables  y Variable Objetivo**

In [13]:
data_model=df_propiedades.copy()
#columnas_quitar=['tipo','url','titulo','descripcion','caractint','caractext','caractsec']
# varibles_considerar = ['tipo','precio','area_m2','banos','garajes','antiguedad','latitud','longitud','estrato','ubicacion','tipo_propiedad','balcon','zonas_verdes','en_conjunto_cerrado','zona_infantil','supermercados_ccomerciales','colegios_universidades','trans_publico_cercano']
varibles_considerar = ['tipo','precio','area_m2','banos','garajes','antiguedad','estrato','ubicacion','tipo_propiedad','balcon','zonas_verdes','en_conjunto_cerrado','zona_infantil','supermercados_ccomerciales','colegios_universidades','trans_publico_cercano']

# columnas_quitar=['url','titulo','descripcion','caractint','caractext','caractsec','otros_datos']
# data_model.drop(columnas_quitar, axis=1, inplace=True)
data_model = data_model.loc[:,varibles_considerar]
data_model.shape

(2481, 16)

In [14]:
data_model.head()

Unnamed: 0,tipo,precio,area_m2,banos,garajes,antiguedad,estrato,ubicacion,tipo_propiedad,balcon,zonas_verdes,en_conjunto_cerrado,zona_infantil,supermercados_ccomerciales,colegios_universidades,trans_publico_cercano
0,Nueva,512262900.0,89.49,3.0,1.0,0,0,rionegro,apartamento,1.0,1.0,1.0,1.0,1.0,0.0,1.0
1,Usada,534000000.0,182.0,4.0,2.0,1 a 8 aÃ±os,4,la ceja,casa,1.0,0.0,0.0,1.0,1.0,0.0,1.0
2,Usada,270000000.0,4070.0,1.0,9.0,1 a 8 aÃ±os,2,guarne,finca,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Usada,1389622000.0,274.0,5.0,0.0,16 a 30 aÃ±os,1,rionegro,finca,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Usada,199000000.0,49.0,2.0,1.0,0,4,el retiro,apartamento,0.0,0.0,1.0,1.0,1.0,0.0,1.0


In [15]:
X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

**OneHotEncoder Variables Categóricas**

In [16]:
enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(enc.fit_transform(X[['tipo','tipo_propiedad','ubicacion','antiguedad']]).toarray(), columns= enc.get_feature_names(['tipo','tipo_propiedad','ubicacion','antiguedad']))
data_aux['fila'] = range(0, X.shape[0])
data_aux.head()

X.drop(['tipo','tipo_propiedad','antiguedad'],axis=1,inplace=True)
X['fila'] = range(0, X.shape[0])

X = pd.merge(data_aux,X, on = 'fila', how='inner')
X.drop(['fila'], axis=1, inplace=True)
X.shape

(2481, 33)

**División train y Test**

In [17]:
X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t, var_stratify_train = div_train_test(X, y, var_stratify = 'ubicacion')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### **Random Forest**

In [None]:
pipe = Pipeline(steps = [('scaler', StandardScaler()), ('rf', RandomForestRegressor(random_state=42))])

####**Sin transformar la Y**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

modelo_rf = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe, param_grid = para_grid, transf=False)
modelo_rf

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7fb6da75a3d0>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error),
                      'meae': make_scorer(median_absolute_error),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score),
                      'rmse': make_scorer(mean_squared_error)},
             verbose=8)

In [None]:
metricas_rf = metricas(modelo_rf.best_estimator_,Y_train, modelo_rf.predict(X_train), Y_test, modelo_rf.predict(X_test))
metricas_rf

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",59264530.0,73215840.0,0.172622,0.215121,0.885647,0.778355,190820000.0,300132600.0


In [None]:
modelo_rf.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=3,
                                       min_samples_split=4, n_estimators=150,
                                       random_state=42))])

In [None]:
dd=pd.DataFrame(modelo_rf.cv_results_)
dd[dd['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
92,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 3...",-76145670.0,4939623.0,-58639400.0,772135.365423,76145670.0,4939623.0,106,58639400.0,772135.365423,0.718674,0.034645,36,0.889917,0.002912,9.546208e+16,1.070703e+16,70,3.47399e+16,867618700000000.0,0.238447,0.009312,91,0.16853,0.001707
95,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 3...",-76145670.0,4939623.0,-58639400.0,772135.365423,76145670.0,4939623.0,106,58639400.0,772135.365423,0.718674,0.034645,36,0.889917,0.002912,9.546208e+16,1.070703e+16,70,3.47399e+16,867618700000000.0,0.238447,0.009312,91,0.16853,0.001707
98,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 3...",-76145670.0,4939623.0,-58639400.0,772135.365423,76145670.0,4939623.0,106,58639400.0,772135.365423,0.718674,0.034645,36,0.889917,0.002912,9.546208e+16,1.070703e+16,70,3.47399e+16,867618700000000.0,0.238447,0.009312,91,0.16853,0.001707


####**Usando Y transformada**

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rf__n_estimators':[80,100,120,150], 'rf__max_depth':[3,5,10,20,25], 'rf__min_samples_split':[2,3,5,10,15],'rf__min_samples_leaf':[1,2,5,8]}
para_grid = {'rf__n_estimators':[100,120,150], 'rf__max_depth':[6,7,8,9], 'rf__min_samples_split':[4,5,6], 'rf__min_samples_leaf':[2,3,4]}

modelo_rf_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe, param_grid = para_grid, transf = True)
modelo_rf_t

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=<generator object BaseShuffleSplit.split at 0x7fb6d9ed9a50>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf',
                                        RandomForestRegressor(random_state=42))]),
             n_jobs=-1,
             param_grid={'rf__max_depth': [6, 7, 8, 9],
                         'rf__min_samples_leaf': [2, 3, 4],
                         'rf__min_samples_split': [4, 5, 6],
                         'rf__n_estimators': [100, 120, 150]},
             refit='metrica', return_train_score=True,
             scoring={'mape': make_scorer(mean_absolute_percentage_error2),
                      'meae': make_scorer(median_absolute_error2),
                      'metrica': 'neg_median_absolute_error',
                      'r2': make_scorer(r2_score2),
                      'rmse': make_scorer(mean_squared_error2)},
             verbose=8)

In [None]:
metricas_rf_t = metricas(modelo_rf_t.best_estimator_,Y_train, np.expm1(modelo_rf_t.predict(X_train)), Y_test, np.expm1(modelo_rf_t.predict(X_test)))
metricas_rf_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"(DecisionTreeRegressor(max_depth=9, max_featur...",52954920.0,72558250.0,0.150728,0.20504,0.867465,0.754846,205430200.0,315648100.0


In [None]:
modelo_rf_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf',
                 RandomForestRegressor(max_depth=9, min_samples_leaf=2,
                                       min_samples_split=5, random_state=42))])

In [None]:
dd_t=pd.DataFrame(modelo_rf_t.cv_results_)
dd_t[dd_t['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica',
'mean_test_meae', 'std_test_meae', 'rank_test_meae','mean_train_meae','std_train_meae',
'mean_test_r2', 'std_test_r2','rank_test_r2','mean_train_r2', 'std_train_r2',
'mean_test_rmse','std_test_rmse', 'rank_test_rmse','mean_train_rmse', 'std_train_rmse',
'mean_test_mape','std_test_mape', 'rank_test_mape', 'mean_train_mape', 'std_train_mape']]

Unnamed: 0,params,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_test_meae,std_test_meae,rank_test_meae,mean_train_meae,std_train_meae,mean_test_r2,std_test_r2,rank_test_r2,mean_train_r2,std_train_r2,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_train_rmse,std_train_rmse,mean_test_mape,std_test_mape,rank_test_mape,mean_train_mape,std_train_mape
84,"{'rf__max_depth': 9, 'rf__min_samples_leaf': 2...",-0.152194,0.014723,-0.108394,0.001861,71848200.0,9763920.0,103,50954130.0,941200.60534,0.700873,0.033118,4,0.874067,0.003085,318443800.0,19273130.0,105,199339800.0,2270625.0,0.217674,0.008665,105,0.146663,0.001712


### **Gradient Boosting Regression**

####**Sin tranformar la Y**

In [None]:
pipe_gbt = Pipeline(steps = [('scaler', StandardScaler()), ('gbt', GradientBoostingRegressor(random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[70,90,100,120,180], 'gbt__max_depth':[3,5,10,12,15,20,25], 'gbt__min_samples_split':[5,10,15,17,25,30,40], 'gbt__min_samples_leaf':[1,3,5]}
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}

modelo_gbt = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_gbt, param_grid = para_grid )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
metricas_gbt = metricas(modelo_gbt.best_estimator_,Y_train, modelo_gbt.predict(X_train), Y_test, modelo_gbt.predict(X_test))
metricas_gbt

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,36762560.0,73080390.0,0.100509,0.215789,0.975973,0.77814,87467380.0,300278100.0


In [None]:
modelo_gbt.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=7, min_samples_leaf=2,
                                           min_samples_split=4,
                                           random_state=42))])

####**Usando Y transformada**

**Búsqueda de hiperparámetros**

In [None]:
# para_grid = {'gbt__n_estimators':[70,90,100,120,180], 'gbt__max_depth':[3,5,10,12,15,20,25], 'gbt__min_samples_split':[5,10,15,17,25,30,40], 'gbt__min_samples_leaf':[1,3,5]}
para_grid = {'gbt__n_estimators':[100,120,150], 'gbt__max_depth':[6,7,8,9], 'gbt__min_samples_split':[4,5,6], 'gbt__min_samples_leaf':[2,3,4]}
modelo_gbt_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_gbt, param_grid = para_grid, transf=True )

Seccion Stratify
**para datos transformados**
Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
metricas_gbt_t = metricas(modelo_gbt_t.best_estimator_,Y_train, np.expm1(modelo_gbt_t.predict(X_train)), Y_test, np.expm1(modelo_gbt_t.predict(X_test)))
metricas_gbt_t

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,([DecisionTreeRegressor(criterion='friedman_ms...,17295960.0,74871650.0,0.049577,0.195328,0.989129,0.764,58834800.0,309698800.0


In [None]:
modelo_gbt_t.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('gbt',
                 GradientBoostingRegressor(max_depth=9, min_samples_leaf=3,
                                           min_samples_split=4,
                                           n_estimators=120,
                                           random_state=42))])

### **XG Boost**

#### Sin transformar la Y

In [None]:
import xgboost as xgb
# pipe_xgb = Pipeline(steps = [
#     ('scaler', StandardScaler()), 
#     ('xgb',  xgb.XGBRegressor(
#         objective='reg:squarederror', reg_alpha = 3, reg_lambda = 3, max_depth=5, learning_rate=0.05, n_jobs=-1, n_estimators = 175, 
#     ))
# ])

pipe_xgb = Pipeline(steps = [
    ('scaler', StandardScaler()), 
    ('xgb',  xgb.XGBRegressor(
        objective='reg:squarederror', n_jobs=-1, 
    ))
])

**Búsqueda de Hiperparámetros**

In [None]:
para_grid = { 'xgb__max_depth':[5,6], 'xgb__learning_rate':[0.05,0.1], 'xgb__reg_alpha':[0.01,0.5,0.1]}
#'xgb__n_estimators':[100,120,150],
modelo_xgb = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_xgb, param_grid = para_grid )

Seccion Stratify
**para datos sin transformar**
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
metricas_xgb = metricas(modelo_xgb.best_estimator_,Y_train, modelo_xgb.predict(X_train), Y_test, modelo_xgb.predict(X_test))
metricas_xgb

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(max_depth=5, n_jobs=-1, objective...",59268584.0,74319680.0,0.162174,0.210494,0.913513,0.782425,165948600.0,297364200.0


In [None]:
modelo_xgb.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(max_depth=5, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

### **Red Neuronal**

####**Sin tranformar la Y**

In [None]:
# from sklearn.neural_network import MLPRegressor
# pipe_rnn = Pipeline(steps = [('scaler', StandardScaler()), ('rnn', MLPRegressor(activation='relu',max_iter=100,random_state=42))])

**Búsqueda de Hiperparámetros**

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': [0.001,0.004,0.01,0.1],
#     'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
#     'rnn__max_iter':[200,500]}

# modelo_rnn = busqueda_hiperparametros(X_train, Y_train,var_stratify_train, pipe_rnn, param_grid = para_grid,transf=False )

In [None]:
# metricas_rnn = metricas(modelo_rnn.best_estimator_,Y_train, modelo_rnn.predict(X_train), Y_test, modelo_rnn.predict(X_test))
# metricas_rnn

In [None]:
# modelo_rnn.best_estimator_

####**Usando Y transformada**

In [None]:
# para_grid = {'rnn__hidden_layer_sizes': [(45,25),(95,50),(110,60),(150,75),(180,90)],
#     'rnn__alpha': [0.001,0.004,0.01,0.1],
#     'rnn__learning_rate_init': [0.001,0.004,0.01,0.1],
#     'rnn__max_iter':[200,500]}
    
# modelo_rnn_t = busqueda_hiperparametros(X_train, Y_train_t,var_stratify_train, pipe_rnn, param_grid = para_grid, transf=True )

In [None]:
# metricas_rnn_t = metricas(modelo_rnn_t.best_estimator_,Y_train, np.expm1(modelo_rnn_t.predict(X_train)), Y_test, np.expm1(modelo_rnn_t.predict(X_test)))
# metricas_rnn_t

In [None]:
# modelo_rnn_t.best_estimator_

Referencia: https://www.cienciadedatos.net/documentos/py35-redes-neuronales-python.html

In [None]:
# resultados_rnn =  pd.concat([metricas_rnn, metricas_rnn_t],ignore_index= True)
# metricas_rnn.to_csv('resultados_train_test_general_rnn.csv',sep=";",decimal=",")

KNN con Keras

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Activation

# model = Sequential([
#     Dense(32, input_shape=(input_dim,)),
#     Activation('tanh'),
#     Dense(nb_classes),
#     Activation('softmax'),
# ])

## Resumen

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd '/content/drive/My Drive/Colab Notebooks/Seminario/Etapa Modelamiento/Resultados'

KeyboardInterrupt: ignored

In [None]:
import os
os.getcwd()

'/content'

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf = cross_validation_rf[cross_validation_rf['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft = cross_validation_rft[cross_validation_rft['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt = cross_validation_gbt[cross_validation_gbt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt = cross_validation_gbtt[cross_validation_gbtt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_xgb=pd.DataFrame(modelo_xgb.cv_results_)
cross_validation_xgb = cross_validation_xgb[cross_validation_xgb['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
cross_validation_xgb['escenario']='modelo_xgb'

# cross_validation_rnn=pd.DataFrame(modelo_rnn.cv_results_)
# cross_validation_rnn = cross_validation_rnn[cross_validation_rnn['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
# cross_validation_rnn['escenario']='modelo_rnn'

# cross_validation_rnnt=pd.DataFrame(modelo_rnn_t.cv_results_)
# cross_validation_rnnt = cross_validation_rnnt[cross_validation_rnnt['rank_test_metrica']==1][['params','mean_test_metrica', 'std_test_metrica','mean_train_metrica', 'std_train_metrica','mean_train_meae','mean_test_meae','mean_train_mape','mean_test_mape','mean_train_r2','mean_test_r2','mean_train_rmse','mean_test_rmse']]
# cross_validation_rnnt['escenario']='modelo_rnnt'

# resultado_cross = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_rnn,cross_validation_rnnt],
#                             ignore_index = True)
resultado_cross = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_xgb],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross['params'])

resultado_cross.drop(['params'],axis=1, inplace=True)
resultado_cross = resultado_cross.drop_duplicates()
resultado_cross.reset_index(inplace=True, drop=False)
resultado_cross

Unnamed: 0,index,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0,-76145670.0,4939623.0,-58639400.0,772135.4,58639400.0,76145670.0,0.16853,0.238447,0.889917,0.718674,3.47399e+16,9.546208e+16,modelo_rf
1,3,-0.1521939,0.0147229,-0.1083944,0.001860813,50954130.0,71848200.0,0.146663,0.217674,0.874067,0.700873,199339800.0,318443800.0,modelo_rft
2,4,-64299970.0,7342527.0,-34513410.0,1022537.0,34513410.0,64299970.0,0.093753,0.210542,0.979256,0.719483,6547787000000000.0,9.521331e+16,modelo_gbt
3,5,-0.1319696,0.01978878,-0.03026754,0.002505411,15754030.0,59866690.0,0.045006,0.208542,0.990035,0.697333,55911480.0,320894000.0,modelo_gbtt
4,8,-72098430.0,9336828.0,-57138520.0,1068107.0,57138520.0,72098430.0,0.155515,0.219528,0.922193,0.741451,2.455513e+16,8.808756e+16,modelo_xgb
5,9,-72098430.0,9336828.0,-57138520.0,1068107.0,57138520.0,72098430.0,0.155515,0.219528,0.922193,0.741451,2.455513e+16,8.808756e+16,modelo_xgb
6,10,-72098430.0,9336828.0,-57138520.0,1068107.0,57138520.0,72098430.0,0.155515,0.219528,0.922193,0.741451,2.455513e+16,8.808756e+16,modelo_xgb


In [None]:
resultado_metricas = pd.concat([metricas_rf, metricas_rf_t, metricas_gbt, metricas_gbt_t, metricas_xgb],
                            ignore_index = True)
resultado_metricas.drop_duplicates(inplace=True)
resultado_metricas.reset_index(inplace=True, drop=False)
resultado_metricas

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,0,"(DecisionTreeRegressor(max_depth=9, max_featur...",59264530.0,73215840.0,0.172622,0.215121,0.885647,0.778355,190820000.0,300132600.0
1,1,"(DecisionTreeRegressor(max_depth=9, max_featur...",52954920.0,72558250.0,0.150728,0.20504,0.867465,0.754846,205430200.0,315648100.0
2,2,([DecisionTreeRegressor(criterion='friedman_ms...,36762560.0,73080390.0,0.100509,0.215789,0.975973,0.77814,87467380.0,300278100.0
3,3,([DecisionTreeRegressor(criterion='friedman_ms...,17295960.0,74871650.0,0.049577,0.195328,0.989129,0.764,58834800.0,309698800.0
4,4,"XGBRegressor(max_depth=5, n_jobs=-1, objective...",59268580.0,74319680.0,0.162174,0.210494,0.913513,0.782425,165948600.0,297364200.0


In [None]:
resultados =  pd.concat([resultado_metricas, resultado_cross], axis=1)
resultados

Unnamed: 0,index,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test,index.1,mean_test_metrica,std_test_metrica,mean_train_metrica,std_train_metrica,mean_train_meae,mean_test_meae,mean_train_mape,mean_test_mape,mean_train_r2,mean_test_r2,mean_train_rmse,mean_test_rmse,escenario
0,0.0,"(DecisionTreeRegressor(max_depth=9, max_featur...",59264530.0,73215840.0,0.172622,0.215121,0.885647,0.778355,190820000.0,300132600.0,0,-76145670.0,4939623.0,-58639400.0,772135.4,58639400.0,76145670.0,0.16853,0.238447,0.889917,0.718674,3.47399e+16,9.546208e+16,modelo_rf
1,1.0,"(DecisionTreeRegressor(max_depth=9, max_featur...",52954920.0,72558250.0,0.150728,0.20504,0.867465,0.754846,205430200.0,315648100.0,3,-0.1521939,0.0147229,-0.1083944,0.001860813,50954130.0,71848200.0,0.146663,0.217674,0.874067,0.700873,199339800.0,318443800.0,modelo_rft
2,2.0,([DecisionTreeRegressor(criterion='friedman_ms...,36762560.0,73080390.0,0.100509,0.215789,0.975973,0.77814,87467380.0,300278100.0,4,-64299970.0,7342527.0,-34513410.0,1022537.0,34513410.0,64299970.0,0.093753,0.210542,0.979256,0.719483,6547787000000000.0,9.521331e+16,modelo_gbt
3,3.0,([DecisionTreeRegressor(criterion='friedman_ms...,17295960.0,74871650.0,0.049577,0.195328,0.989129,0.764,58834800.0,309698800.0,5,-0.1319696,0.01978878,-0.03026754,0.002505411,15754030.0,59866690.0,0.045006,0.208542,0.990035,0.697333,55911480.0,320894000.0,modelo_gbtt
4,4.0,"XGBRegressor(max_depth=5, n_jobs=-1, objective...",59268580.0,74319680.0,0.162174,0.210494,0.913513,0.782425,165948600.0,297364200.0,8,-72098430.0,9336828.0,-57138520.0,1068107.0,57138520.0,72098430.0,0.155515,0.219528,0.922193,0.741451,2.455513e+16,8.808756e+16,modelo_xgb
5,,,,,,,,,,,9,-72098430.0,9336828.0,-57138520.0,1068107.0,57138520.0,72098430.0,0.155515,0.219528,0.922193,0.741451,2.455513e+16,8.808756e+16,modelo_xgb
6,,,,,,,,,,,10,-72098430.0,9336828.0,-57138520.0,1068107.0,57138520.0,72098430.0,0.155515,0.219528,0.922193,0.741451,2.455513e+16,8.808756e+16,modelo_xgb


In [None]:
resultados.to_csv('resultados_train_test_general_varibles_basicas2.csv',sep=";",decimal=",")

In [None]:
cross_validation_rf = pd.DataFrame(modelo_rf.cv_results_)
cross_validation_rf['escenario']='modelo_rf'

cross_validation_rft=pd.DataFrame(modelo_rf_t.cv_results_)
cross_validation_rft['escenario']='modelo_rft'

cross_validation_gbt=pd.DataFrame(modelo_gbt.cv_results_)
cross_validation_gbt['escenario']='modelo_gbt'

cross_validation_gbtt=pd.DataFrame(modelo_gbt_t.cv_results_)
cross_validation_gbtt['escenario']='modelo_gbtt'

cross_validation_xgb=pd.DataFrame(modelo_xgb.cv_results_)
cross_validation_xgb['escenario']='modelo_xgb'

resultado_cross_total = pd.concat([cross_validation_rf,cross_validation_rft, cross_validation_gbt, cross_validation_gbtt,cross_validation_xgb],
                            ignore_index = True)

columna = pd.DataFrame(resultado_cross_total['params'])

resultado_cross_total.drop(['params'],axis=1, inplace=True)
resultado_cross_total = resultado_cross_total.drop_duplicates()
resultado_cross_total.reset_index(inplace=True, drop=False)

In [None]:
resultado_cross_total.to_csv('resultados_crossvalidacion_general_varibles_basicas2.csv',sep=";",decimal=",")

##Exportar modelos

In [38]:
from joblib import dump, load
import xgboost as xgb

In [39]:
#Exportar modelo básico 15 variables

data_model=df_propiedades.copy()
varibles_considerar = ['tipo','precio','area_m2','banos','garajes','antiguedad','estrato','ubicacion','tipo_propiedad','balcon','zonas_verdes','en_conjunto_cerrado','zona_infantil','supermercados_ccomerciales','colegios_universidades','trans_publico_cercano','habitaciones']
data_model = data_model.loc[:,varibles_considerar]

X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(enc.fit_transform(X[['tipo','tipo_propiedad','ubicacion','antiguedad']]).toarray(), columns= enc.get_feature_names(['tipo','tipo_propiedad','ubicacion','antiguedad']))
data_aux['fila'] = range(0, X.shape[0])

X.drop(['tipo','tipo_propiedad','antiguedad'],axis=1,inplace=True)
X['fila'] = range(0, X.shape[0])

X = pd.merge(data_aux,X, on = 'fila', how='inner')
X.drop(['fila'], axis=1, inplace=True)

X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t, var_stratify_train = div_train_test(X, y, var_stratify = 'ubicacion')

pipe_xgb = Pipeline(steps = [('scaler', StandardScaler()), ('xgb', xgb.XGBRegressor(max_depth=7, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

pipe_xgb.fit(X_train, Y_train)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(max_depth=7, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

In [40]:
metricas_ = metricas(pipe_xgb,Y_train, pipe_xgb.predict(X_train), Y_test, pipe_xgb.predict(X_test))
metricas_

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(max_depth=7, n_jobs=-1, objective...",39303600.0,83417216.0,0.109669,0.208713,0.967241,0.789518,102132700.0,292476600.0


In [41]:
dump(pipe_gbt, 'export_model_xgb_15112021.joblib') 

['export_model_xgb_15112021.joblib']

In [42]:
X_train.head(1).to_csv('x_labels_xgb_15V.csv')

In [43]:
#Exportar modelo 7 variables

data_model=df_propiedades.copy()
varibles_considerar = ['tipo','tipo_propiedad','precio','area_m2','banos','estrato','habitaciones','ubicacion']


data_model = data_model.loc[:,varibles_considerar]

X = data_model.drop(['precio'], axis=1)
y = data_model['precio']

In [44]:
enc = OneHotEncoder(handle_unknown='ignore')
data_aux = pd.DataFrame(enc.fit_transform(X[['tipo','tipo_propiedad','ubicacion']]).toarray(), columns= enc.get_feature_names(['tipo','tipo_propiedad','ubicacion']))
data_aux['fila'] = range(0, X.shape[0])

X.drop(['tipo','tipo_propiedad'],axis=1,inplace=True)
X['fila'] = range(0, X.shape[0])

X = pd.merge(data_aux,X, on = 'fila', how='inner')
X.drop(['fila'], axis=1, inplace=True)

X_train, X_test, Y_train, Y_test, Y_train_t, Y_test_t, var_stratify_train = div_train_test(X, y, var_stratify = 'ubicacion')

pipe_xgb2 = Pipeline(steps = [('scaler', StandardScaler()), ('xgb', xgb.XGBRegressor(max_depth=7, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

pipe_xgb2.fit(X_train, Y_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(max_depth=7, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

In [45]:
pipe_gbt2.fit(X_train, Y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('xgb',
                 XGBRegressor(max_depth=7, n_jobs=-1,
                              objective='reg:squarederror', reg_alpha=0.01))])

In [46]:
metricas_2 = metricas(pipe_xgb2,Y_train, pipe_xgb2.predict(X_train), Y_test, pipe_xgb2.predict(X_test))
metricas_2

Unnamed: 0,parametros,MeAE_train,MeAE_test,MAPE_train,MAPE_test,r2_train,r2_test,rmse_train,rmse_test
0,"XGBRegressor(max_depth=7, n_jobs=-1, objective...",45078528.0,80152320.0,0.128365,0.214738,0.949141,0.760602,127257400.0,311920700.0


In [47]:
dump(pipe_xgb2, 'export_model_xgb_7V_15112021.joblib') 

['export_model_xgb_7V_15112021.joblib']

In [48]:
X_train.head(1).to_csv('x_labels_xgb_7V.csv')