# ElasticNet + LGBM

---

Importamos librerias

In [18]:
import os
import sys
import warnings
import joblib
import json

import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet

from lightgbm import LGBMRegressor as LGBM

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature

sys.path.append('../../../utils/')
from utils_yose import build_preprocessor
from utils_yose import MODEL_NAME
from mlflow_setup import setup_mlflow

sys.path.append('../../ensemble_2p/')
from model_ensemble.ensemble import WeightedEnsemble

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.5f' % x)

---

Cargamos los datos

In [19]:
train_data = "../../../../data/housing_data/train.csv"
test_data = "../../../../data/housing_data/test.csv"

if os.path.exists(train_data):
    df_train = pd.read_csv(train_data)
else:
    print("No se encuentra el archivo de entrenamiento")
    
if os.path.exists(test_data):
    df_test = pd.read_csv(test_data)
    X_test = df_test.drop(["Id"], axis=1)
else:
    print("No se encuentra el archivo de prueba")

---

Modelos base con los que se va a trabajar

In [20]:
y = np.log1p(df_train["SalePrice"]).astype(float)
X = df_train.drop(["SalePrice", "Id"], axis=1)

rstate = 42
base_models = {
    "elasticnet": ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=rstate),
    "lgbm": LGBM(n_estimators=3000, learning_rate=0.03, max_depth=-1,
                 num_leaves=31, subsample=0.8, colsample_bytree=0.8,
                 random_state=rstate, n_jobs=-1)
}

---

Entrenamiento del modelo con Cross-Validation

In [21]:
kf = KFold(n_splits=10, shuffle=True, random_state=rstate)
oof_preds = {name: np.zeros(len(X), dtype=float) for name in base_models}
oof_idx_mask = np.zeros(len(X), dtype=bool)
fold_metrics = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    pre = build_preprocessor(X_tr)
    fold_preds = {}
    for name, mdl in base_models.items():
        pipe = Pipeline([("pre", clone(pre)), ("model", clone(mdl))])
        pipe.fit(X_tr, y_tr)
        p = pipe.predict(X_va)
        fold_preds[name] = p
    p_ens = np.mean(np.column_stack([fold_preds[n] for n in base_models]), axis=1)
    rmse = root_mean_squared_error(y_va, p_ens)
    r2 = r2_score(y_va, p_ens)
    fold_metrics.append({"fold": fold, "rmse": float(rmse), "r2": float(r2)})
    for name in base_models:
        oof_preds[name][va_idx] = fold_preds[name]
    oof_idx_mask[va_idx] = True

cv_rmse_mean = float(np.mean([m["rmse"] for m in fold_metrics]))
cv_rmse_std  = float(np.std([m["rmse"] for m in fold_metrics]))
cv_r2_mean   = float(np.mean([m["r2"] for m in fold_metrics]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002922 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3373
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 183
[LightGBM] [Info] Start training from score 12.025324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3358
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 185
[LightGBM] [Info] Start training from score 12.028659
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002850 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

---

Optimizacion de los pesos del ensamble

In [39]:
grid_step = 0.05
best = {"weights": None, "rmse": np.inf, "r2": -np.inf}
M = np.column_stack([oof_preds["elasticnet"], oof_preds["lgbm"]])
y_true = y.values

for w1 in np.arange(0, 1 + 1e-9, grid_step):
            w2 = 1 - w1
            y_pred = w1 * oof_preds["elasticnet"] + w2 * oof_preds["lgbm"]
            rmse = root_mean_squared_error(y_true, y_pred)
            r2 = r2_score(y_true, y_pred)
            if rmse < best["rmse"]:
                best["weights"] = (w1, w2)
                best["rmse"] = rmse
                best["r2"] = r2

opt_weights = {"elasticnet": best["weights"][0], "lgbm": best["weights"][1]}

pre_final = build_preprocessor(X)
final_pipes = {}
for name, mdl in base_models.items():
    pipe = Pipeline([("pre", clone(pre_final)), ("model", clone(mdl))])
    pipe.fit(X, y)
    final_pipes[name] = pipe

for model in final_pipes:
    model_path = os.path.join('../model_ensemble/ensemble_model/', f"model_{model}.pkl")
    joblib.dump(final_pipes[model], model_path)

weights_json = "../model_ensemble/ensemble_weights/weights.json"
json.dump({
    "w_elnet": float(opt_weights["elasticnet"]),
    "w_lgbm":  float(opt_weights["lgbm"])
}, open(weights_json, "w"))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003638 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3461
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 187
[LightGBM] [Info] Start training from score 12.024057


---

Creamos y guardamos con MLFlow

In [33]:
exp_id = setup_mlflow(
    experiment="HousePrices-Competition", 
    description="ElasticNet+LGBM_Ensemble", 
    experiment_tags={"Kaggle-Model": "ElasticNet+LGBM_Ensemble"}
)

prediction_example = pd.read_csv("../../../../data/housing_data/sample_submission.csv").drop(["Id"], axis=1)
ensemble = WeightedEnsemble()
signature = infer_signature(X_test, prediction_example)

with mlflow.start_run(run_name="ElasticNet+LGBM_Ensemble", experiment_id=exp_id) as run:
    
    try:
        enet_est = getattr(final_pipes["elasticnet"], "named_steps", {}).get("model") \
            or final_pipes["elasticnet"].named_steps[list(final_pipes["elasticnet"].named_steps.keys())[-1]]
        lgbm_est = getattr(final_pipes["lgbm"], "named_steps", {}).get("model") \
            or final_pipes["lgbm"].named_steps[list(final_pipes["lgbm"].named_steps.keys())[-1]]
        mlflow.log_param("random_state", rstate)
        mlflow.log_param("elasticnet_alpha", getattr(enet_est, "alpha", None))
        mlflow.log_param("elasticnet_l1_ratio", getattr(enet_est, "l1_ratio", None))
        mlflow.log_param("lgbm_n_estimators", getattr(lgbm_est, "n_estimators", None))
        mlflow.log_param("lgbm_max_depth", getattr(lgbm_est, "max_depth", None))
        mlflow.log_param("lgbm_num_leaves", getattr(lgbm_est, "num_leaves", None))
        mlflow.log_param("lgbm_learning_rate", getattr(lgbm_est, "learning_rate", None))
        mlflow.log_param("lgbm_subsample", getattr(lgbm_est, "subsample", None))
        mlflow.log_param("lgbm_colsample_bytree", getattr(lgbm_est, "colsample_bytree", None))
    except Exception as e:
        print(e)
    
    train_ds = mlflow.data.from_pandas(df_train, source="data/housing_data/train.csv")
    mlflow.log_input(train_ds, context="training")
    
    test_ds = mlflow.data.from_pandas(df_test, source="data/housing_data/test.csv")
    mlflow.log_input(test_ds, context="inference")
    
    mlflow.log_param("n_features", X_test.shape[1])
    mlflow.log_param("ensemble_weights_elasticnet", opt_weights["elasticnet"])
    mlflow.log_param("ensemble_weights_lgbm", opt_weights["lgbm"])
    
    mlflow.log_metric("cv_rmse_mean", cv_rmse_mean)
    mlflow.log_metric("cv_rmse_std", cv_rmse_std)
    mlflow.log_metric("cv_r2_mean", cv_r2_mean)
    
    mlflow.pyfunc.log_model(
        python_model = ensemble,
        artifacts={
            "weights_json": weights_json,
            "model_elasticnet_pkl": "../model_ensemble/ensemble_model/model_elasticnet.pkl",
            "model_lgbm_pkl": "../model_ensemble/ensemble_model/model_lgbm.pkl"
        },
        signature=signature,
        artifact_path="ensemble_model",
        input_example=X_test.head(),
    )
    mlflow.log_artifact(train_data, artifact_path="data")
    mlflow.log_artifact(test_data, artifact_path="data")

    mlflow.log_artifact(weights_json, artifact_path="ensemble_weights")
    mlflow.log_artifact("../model_ensemble/ensemble_model/model_elasticnet.pkl", artifact_path="ensemble_model")
    mlflow.log_artifact("../model_ensemble/ensemble_model/model_lgbm.pkl", artifact_path="ensemble_model")

    mlflow.set_tags({
        "task": "regression",
        "dataset": "Kaggle-HousePrices-Competition",
        "target": "SalePrice",
        "target_transform": "log1p",
        "inference_transform": "expm1",
        "ensemble": "Weighted_ElasticNet+LGBM"
    })
    MODEL_URI = f"runs:/{run.info.run_id}/ensemble_model"
    print("URI del modelo:", f"runs:/{run.info.run_id}/ensemble_model")

client = MlflowClient()
remote_path = ['ensemble_model', 'ensemble_weights']
local_dir = "../model_ensemble/"
for path in remote_path:
    client.download_artifacts(run_id=run.info.run_id, path=path, dst_path=local_dir)

print("Guardado en:", local_dir)

[MLflow] Tracking URI: file:/Users/yosesotomayor/Code/retoCasas/.ML
[MLflow] Registry URI: file:/Users/yosesotomayor/Code/retoCasas/.ML
[MLflow] Experimento:  HousePrices-Competition (id=116606298190487712)
[MLflow] Artifact loc:  file:///Users/yosesotomayor/Code/retoCasas/.ML/116606298190487712


2025/08/27 17:30:38 INFO mlflow.pyfunc: Validating input example against model signature
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 871.27it/s] 
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 1468.08it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 229.05it/s]


URI del modelo: runs:/1650df3d3ff24d328ac0e365a0283534/ensemble_model


Downloading artifacts: 100%|██████████| 2/2 [00:00<00:00, 76.19it/s]  
Downloading artifacts: 100%|██████████| 10/10 [00:00<00:00, 1505.12it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 3013.15it/s]

Guardado en: ../model_ensemble/





---

Mostramos resultados

In [34]:
print("\n=== RESUMEN CV ===")
print(f"RMSE (mean ± std): {cv_rmse_mean:.5f} ± {cv_rmse_std:.5f}")
print(f"R2   (mean):       {cv_r2_mean:.5f}")
print("Pesos óptimos (OOF):", opt_weights)
print("Modelos y artefactos registrados en MLflow (ver en la UI del tracking server).")


=== RESUMEN CV ===
RMSE (mean ± std): 0.11708 ± 0.02141
R2   (mean):       0.90927
Pesos óptimos (OOF): {'elasticnet': np.float64(0.5), 'lgbm': np.float64(0.5)}
Modelos y artefactos registrados en MLflow (ver en la UI del tracking server).


---

### Deployment del modelo

Registrar versiones

In [35]:
mv = mlflow.register_model(
    model_uri = MODEL_URI,
    name = MODEL_NAME,
    tags = {
        "task": "regression",
        "dataset": "Kaggle-HousePrices-Competition",
        "target": "SalePrice",
        "target_transform": "log1p",
        "inference_transform": "expm1",
        "ensemble": "Weighted_ElasticNet+LGBM"
    },
)

client.set_registered_model_tag(mv.name, "created_by", "Yose Sotomayor")
client.set_registered_model_alias(mv.name, "challenger", version=mv.version)
print("Version registrada:", mv.version)

Registered model 'HousePrices_Ensemble' already exists. Creating a new version of this model...


Version registrada: 2


Created version '2' of model 'HousePrices_Ensemble'.


Transicionar versiones

In [36]:
client.delete_registered_model_alias(MODEL_NAME, "challenger")
client.set_registered_model_alias(MODEL_NAME, "champion", mv.version)

---

Guardar los datos para subir a Kaggle

In [38]:
model = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}@champion")

preds_orig  = model.predict(X_test)

sub_dir = "../../../data/housing_submissions/elnet_lgbm"
os.makedirs(sub_dir, exist_ok=True)
submission_path = os.path.join(sub_dir, "submission_elnet_lgbm.csv")


df_sub = pd.DataFrame({"Id": df_test["Id"], "SalePrice": preds_orig})
df_sub.to_csv(submission_path, index=False)

---