## Entrenamiento de modelos usando PyCaret


In [41]:
import pandas as pd
import joblib
import os

# Move to the parent directory, where later cells expect the relative
# "datos/" and "modelos/" folders to live.
# The guard keeps this cell idempotent: an unconditional chdir would climb one
# more directory level every time the cell is re-run in the same kernel.
if not os.path.isdir("datos"):
    os.chdir('../')

#### Cargar datos de entrenamiento

In [43]:
# Deserialize the training table persisted with joblib; the path is relative
# to the current working directory.
df_train = joblib.load(filename="datos/data_train.joblib")

In [44]:
# Preview the first five rows to sanity-check the loaded columns.
df_train.head(n=5)

Unnamed: 0,motivo,departamento_inmueble,municipio_inmueble,barrio,vias_pavimentadas,sardineles_en_las_vias,andenes_en_las_vias,estrato,topografia_sector,demanda_interes,...,calidad_acabados_banos,estado_acabados_cocina,calidad_acabados_cocina,tipo_garaje,numero_total_de_garajes,total_cupos_parquedaro,tipo_deposito,numero_total_depositos,area_libre,clean_valor_total_avaluo
4279,Empleados,ANTIOQUIA,ITAGUI,Ditares,Si,No,No,4.0,Ligera,2.0,...,2.0,3.0,2.0,2.0,1.0,1.0,0,0.0,No,194906580.0
8278,Crédito hipotecario de vivienda,META,PUERTO LOPEZ,SANTANDER,Si,Si,Si,3.0,Plano,2.0,...,1.0,3.0,1.0,0.0,0.0,0.0,0,0.0,0,122794500.0
6684,Crédito hipotecario de vivienda,NORTE DE SANTANDER,CUCUTA,PRADOS DEL ESTE,Si,Si,Si,3.0,Plano,2.0,...,2.0,3.0,1.0,3.0,1.0,1.0,0,0.0,Si,161650000.0
12351,Crédito hipotecario de vivienda,ANTIOQUIA,RIONEGRO,Sector San Antonio de Pereira,Si,Si,Si,4.0,Plano,3.0,...,2.0,3.0,2.0,0.0,0.0,0.0,0,0.0,No,226860000.0
10770,Crédito hipotecario de vivienda,SANTANDER,GIRON,URB. MARIANELA,No,No,Si,2.0,Ligera,2.0,...,1.0,3.0,3.0,0.0,0.0,0.0,0,0.0,No,111707200.0


In [45]:
# Display the (rows, columns) dimensions of the training frame.
df_train.shape

(10155, 90)

#### Cargar encoder y selector de variables

In [46]:
import joblib
# Load the two persisted preprocessing artifacts used by the next cells:
#   cat_econder -> the encoder (the "econder" spelling matches the saved file name)
#   fwiz        -> the feature selector
# NOTE: joblib.load is pickle-based and can execute arbitrary code, so only
# load artifacts that come from a trusted source.
cat_econder = joblib.load(filename="modelos/cat_econder.joblib")
fwiz = joblib.load(filename="modelos/fwiz.joblib")

#### Aplicar encoder y selector de variables

In [47]:
# Split off the target so the encoder only receives predictor columns,
# then apply the loaded encoder to those predictors.
predictors_raw = df_train.drop(columns=['clean_valor_total_avaluo'])
X_train_t = cat_econder.transform(predictors_raw)

In [48]:
# Run the feature selector on the encoded predictors and re-attach the target
# column; pd.concat with axis=1 aligns both pieces on the row index.
selected_predictors = fwiz.transform(X_train_t)
target_values = df_train['clean_valor_total_avaluo']
X_train_selected = pd.concat([selected_predictors, target_values], axis=1)

#### Realizar experimento con PyCaret

In [49]:
from pycaret.regression import *

In [50]:
# Initialise the PyCaret regression experiment on the selected features plus
# the target column.
# session_id pins the random seed so the train/test split and the CV folds are
# reproducible across kernel restarts; 559 is the seed reported in the
# recorded output of this cell.
# NOTE(review): train_size=0.2 keeps only 20% of the rows for training/CV
# (2,031 of 10,155 in the recorded output). Confirm this is intentional and
# not a mix-up with a 20% hold-out (which would be train_size=0.8).
# log_experiment / log_plots record the runs and plots in MLflow under
# `experiment_name`.
reg = setup(
    data=X_train_selected,
    target='clean_valor_total_avaluo',
    train_size=0.2,
    session_id=559,
    log_experiment=True,
    experiment_name='reg_experiments',
    log_plots=True,
)

Unnamed: 0,Description,Value
0,Session id,559
1,Target,clean_valor_total_avaluo
2,Target type,Regression
3,Original data shape,"(10155, 104)"
4,Transformed data shape,"(10155, 104)"
5,Transformed train set shape,"(2031, 104)"
6,Transformed test set shape,"(8124, 104)"
7,Numeric features,103
8,Preprocess,True
9,Imputation type,simple


2023/11/02 18:54:27 INFO mlflow.tracking.fluent: Experiment with name 'reg_experiments' does not exist. Creating a new experiment.


In [51]:
# Cross-validate the available regressors with 5 folds, rank them by MAPE and
# keep the five best; 'ransac' and 'knn' are left out of the comparison.
top5 = compare_models(
    exclude=['ransac', 'knn'],
    sort='MAPE',
    fold=5,
    n_select=5,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,38224086.3627,1.0040325892926634e+16,96852301.0325,0.8502,0.2165,0.1621,1.18
xgboost,Extreme Gradient Boosting,40774324.8,9254391587746612.0,94747179.2,0.8584,0.237,0.1733,0.152
lightgbm,Light Gradient Boosting Machine,41162216.5531,9192001017699348.0,93939430.5379,0.8581,0.2361,0.1736,0.128
et,Extra Trees Regressor,42780427.1562,9664976974867856.0,95856337.8392,0.851,0.2379,0.1787,0.312
rf,Random Forest Regressor,41507662.6435,1.0566499586375296e+16,101077549.3332,0.8382,0.2503,0.1794,0.452
gbr,Gradient Boosting Regressor,42735171.6789,8648254661906214.0,92015680.3124,0.8656,0.2455,0.1913,0.164
dt,Decision Tree Regressor,58128252.6493,1.712378766953389e+16,129329307.4651,0.7367,0.3516,0.2424,0.016
huber,Huber Regressor,65721485.1993,3.827592792268581e+16,191750651.6876,0.3797,0.4193,0.2472,0.072
omp,Orthogonal Matching Pursuit,82958084.8,3.905996782588395e+16,191769848.0,0.3306,0.5248,0.3869,0.01
br,Bayesian Ridge,84000520.0,3.825689269339422e+16,189953547.2,0.345,0.518,0.4024,0.014


#### Exportar mejor modelo

In [52]:
# Take the first entry of top5 — presumably the best model by MAPE, given
# sort='MAPE' in compare_models (confirm ordering) — and let PyCaret finalize
# it before export.
leading_candidate = top5[0]
final_best = finalize_model(leading_candidate)

In [53]:
# Persist the finalized model under modelos/; the confirmation message printed
# by this cell shows that the transformation pipeline is saved with the model.
save_model(final_best, model_name='modelos/best_model-pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['barrio_50', 'barrio_90',
                                              'area_valorada', 'barrio_10',
                                              'cocina', 'estrato',
                                              'municipio_inmueble_50',
                                              'total_cupos_parquedaro',
                                              'vetustez', 'topografia_sector_75',
                                              'tipo_inmueble_25',
                                              'tipo_inmueble_10',
                                              'departamento_inmueble_10',
                                              'municipio_inmueble_10',
                                              'tipo_garaje'...
                                              'estado_acabados_cocina',
                                              'departamento_inmueble_