## Entrenamiento de modelos usando PyCaret


In [1]:
import pandas as pd
import joblib
import os

# Work from the project root so the relative paths used in later cells
# ("datos/...", "modelos/...") resolve.
# The guard keeps this cell idempotent: an unconditional chdir climbs one more
# directory on every re-run and silently breaks those relative paths.
# NOTE(review): assumes the notebook's own folder has no "datos" directory — confirm.
if not os.path.isdir('datos'):
    os.chdir('../')

#### Cargar datos de entrenamiento

In [2]:
# Training set serialized with joblib; the path is relative to the working
# directory selected in the first cell.
df_train = joblib.load(filename="datos/data_train.joblib")


In [3]:
# Peek at the first five rows to sanity-check the load.
df_train.head(n=5)

Unnamed: 0,motivo,departamento_inmueble,municipio_inmueble,barrio,vias_pavimentadas,sardineles_en_las_vias,andenes_en_las_vias,estrato,topografia_sector,demanda_interes,...,calidad_acabados_banos,estado_acabados_cocina,calidad_acabados_cocina,tipo_garaje,numero_total_de_garajes,total_cupos_parquedaro,tipo_deposito,numero_total_depositos,area_libre,clean_valor_total_avaluo
4279,Empleados,ANTIOQUIA,ITAGUI,Ditares,Si,No,No,4.0,Ligera,2.0,...,2.0,3.0,2.0,2.0,1.0,1.0,0,0.0,No,194906580.0
8278,Crédito hipotecario de vivienda,META,PUERTO LOPEZ,SANTANDER,Si,Si,Si,3.0,Plano,2.0,...,1.0,3.0,1.0,0.0,0.0,0.0,0,0.0,0,122794500.0
6684,Crédito hipotecario de vivienda,NORTE DE SANTANDER,CUCUTA,PRADOS DEL ESTE,Si,Si,Si,3.0,Plano,2.0,...,2.0,3.0,1.0,3.0,1.0,1.0,0,0.0,Si,161650000.0
12351,Crédito hipotecario de vivienda,ANTIOQUIA,RIONEGRO,Sector San Antonio de Pereira,Si,Si,Si,4.0,Plano,3.0,...,2.0,3.0,2.0,0.0,0.0,0.0,0,0.0,No,226860000.0
10770,Crédito hipotecario de vivienda,SANTANDER,GIRON,URB. MARIANELA,No,No,Si,2.0,Ligera,2.0,...,1.0,3.0,3.0,0.0,0.0,0.0,0,0.0,No,111707200.0


In [4]:
# (rows, columns) of the training frame, target column included.
df_train.shape

(10155, 90)

#### Cargar encoder y selector de variables

In [5]:
import joblib  # already imported in the first cell; kept so this cell also runs on its own

# Load the categorical encoder and the feature selector (the featurewiz import
# banner is printed while the selector is unpickled).
# NOTE(review): presumably both were fitted in an earlier preprocessing step — confirm.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
cat_econder = joblib.load(filename="modelos/cat_econder.joblib")
fwiz = joblib.load(filename="modelos/fwiz.joblib")

Imported 0.3.2 version. Select nrows to a small number when running on huge datasets.
output = featurewiz(dataname, target, corr_limit=0.90, verbose=2, sep=',', 
		header=0, test_data='',feature_engg='', category_encoders='',
		dask_xgboost_flag=False, nrows=None, skip_sulov=False, skip_xgboost=False)
Create new features via 'feature_engg' flag : ['interactions','groupby','target']



#### Aplicar encoder y selector de variables

In [6]:
# Encode the predictors only: the target column is dropped before the encoder
# sees the frame.
X_train_t = cat_econder.transform(
    df_train.drop('clean_valor_total_avaluo', axis='columns')
)

In [7]:
# Keep the features chosen by the selector and put the target back next to
# them. The column-wise concat aligns both pieces on the row index.
X_train_selected = pd.concat(
    (fwiz.transform(X_train_t), df_train['clean_valor_total_avaluo']),
    axis='columns',
)

#### Realizar experimento con PyCaret

In [8]:
# NOTE(review): wildcard import — the cells visible in this notebook rely on it
# for setup, compare_models, finalize_model and save_model.
from pycaret.regression import *

In [9]:
# Configure the PyCaret regression experiment: 80% of the rows form the
# training split, and the run and its plots are logged to the
# 'reg_experiments' experiment.
# session_id pins the random seed. Without it PyCaret draws a new seed on every
# run, so the train/test split and the model ranking change between runs.
# 1974 is the seed the recorded run happened to draw (the "Session id" row of
# the setup summary).
reg = setup(
    data=X_train_selected,
    target='clean_valor_total_avaluo',
    train_size=0.8,
    session_id=1974,
    log_experiment=True,
    experiment_name='reg_experiments',
    log_plots=True,
)

Unnamed: 0,Description,Value
0,Session id,1974
1,Target,clean_valor_total_avaluo
2,Target type,Regression
3,Original data shape,"(10155, 104)"
4,Transformed data shape,"(10155, 104)"
5,Transformed train set shape,"(8124, 104)"
6,Transformed test set shape,"(2031, 104)"
7,Numeric features,103
8,Preprocess,True
9,Imputation type,simple


2023/11/06 19:21:45 INFO mlflow.tracking.fluent: Experiment with name 'reg_experiments' does not exist. Creating a new experiment.


In [10]:
# Cross-validate the candidate regressors on 5 folds, leaving out 'ransac' and
# 'knn', rank them by MAPE and keep the five best.
top5 = compare_models(
    fold=5,
    sort='MAPE',
    exclude=['ransac', 'knn'],
    n_select=5,
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,36257653.2692,2.636678063384905e+16,148522177.4104,0.6787,0.2181,0.1485,2.142
xgboost,Extreme Gradient Boosting,38763691.2,3.992999537818993e+16,180529113.6,0.4611,0.2146,0.1576,0.746
et,Extra Trees Regressor,38795851.1611,3.5871284555126924e+16,167564564.766,0.5404,0.2202,0.1578,1.952
lightgbm,Light Gradient Boosting Machine,39049364.0505,2.4558265532933748e+16,141352422.044,0.7471,0.2324,0.1624,0.242
gbr,Gradient Boosting Regressor,42813214.5136,2.934670440052056e+16,156198974.5883,0.6295,0.2448,0.1912,0.792
dt,Decision Tree Regressor,53503836.6782,4.460118722793694e+16,200077341.3751,0.3512,0.3051,0.2047,0.084
huber,Huber Regressor,64234228.3792,3.3124458521091224e+16,178050197.2072,0.5749,0.4465,0.2474,0.466
en,Elastic Net,71776425.6,3.3281770725677464e+16,175978534.4,0.5825,0.5954,0.3554,0.824
omp,Orthogonal Matching Pursuit,75330064.0,3.5162320438244148e+16,182775497.6,0.5398,0.5213,0.3568,0.04
br,Bayesian Ridge,75108673.6,3.467580307857736e+16,181022483.2,0.5512,0.5278,0.3614,0.048


#### Exportar mejor modelo

In [11]:
# Finalize the first entry of the selection, i.e. the model at the top of the
# MAPE ranking. Per the PyCaret docs, finalize_model refits the estimator on
# the whole dataset, hold-out split included.
final_best = finalize_model(estimator=top5[0])

In [12]:
# Persist the finalized model together with its transformation pipeline under
# the 'modelos' folder.
save_model(model=final_best, model_name='modelos/best_model-pipeline')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['barrio_50', 'barrio_90',
                                              'area_valorada', 'barrio_10',
                                              'cocina', 'estrato',
                                              'municipio_inmueble_50',
                                              'total_cupos_parquedaro',
                                              'vetustez', 'topografia_sector_75',
                                              'tipo_inmueble_25',
                                              'tipo_inmueble_10',
                                              'departamento_inmueble_10',
                                              'municipio_inmueble_10',
                                              'tipo_garaje'...
                                              'estado_acabados_cocina',
                                              'departamento_inmueble_