# ElasticNet + LGBM

---

Importamos librerias

In [1]:
import pandas as pd
import numpy as np

import os
import sys

sys.path.append('../../')
from utils_yose import build_preprocessor

from sklearn.base import clone
from sklearn.metrics import r2_score, root_mean_squared_error

from sklearn.linear_model import ElasticNet
from lightgbm import LGBMRegressor as LGBM

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.5f' % x)

---

Cargamos los datos

In [2]:
train_data = "../../../data/housing_data/train.csv"
test_data = "../../../data/housing_data/test.csv"

if os.path.exists(train_data):
    df_train = pd.read_csv(train_data)
else:
    print("No se encuentra el archivo de entrenamiento")
    
if os.path.exists(test_data):
    df_test = pd.read_csv(test_data)
else:
    print("No se encuentra el archivo de prueba")

---

Modelos base con los que se va a trabajar

In [3]:
y = np.log1p(df_train["SalePrice"]).astype(float)
X = df_train.drop(["SalePrice", "Id"], axis=1)

rstate = 42
base_models = {
    "elasticnet": ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=rstate),
    "lgbm": LGBM(n_estimators=3000, learning_rate=0.03, max_depth=-1,
                 num_leaves=31, subsample=0.8, colsample_bytree=0.8,
                 random_state=rstate, n_jobs=-1)
}

---

Entrenamiento del modelo con Cross-Validation

In [4]:
kf = KFold(n_splits=10, shuffle=True, random_state=rstate)
oof_preds = {name: np.zeros(len(X), dtype=float) for name in base_models}
oof_idx_mask = np.zeros(len(X), dtype=bool)
fold_metrics = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    pre = build_preprocessor(X_tr)
    fold_preds = {}
    for name, mdl in base_models.items():
        pipe = Pipeline([("pre", clone(pre)), ("model", clone(mdl))])
        pipe.fit(X_tr, y_tr)
        p = pipe.predict(X_va)
        fold_preds[name] = p
    p_ens = np.mean(np.column_stack([fold_preds[n] for n in base_models]), axis=1)
    rmse = root_mean_squared_error(y_va, p_ens)
    r2 = r2_score(y_va, p_ens)
    fold_metrics.append({"fold": fold, "rmse": float(rmse), "r2": float(r2)})
    for name in base_models:
        oof_preds[name][va_idx] = fold_preds[name]
    oof_idx_mask[va_idx] = True

cv_rmse_mean = float(np.mean([m["rmse"] for m in fold_metrics]))
cv_rmse_std  = float(np.std([m["rmse"] for m in fold_metrics]))
cv_r2_mean   = float(np.mean([m["r2"] for m in fold_metrics]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3373
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 183
[LightGBM] [Info] Start training from score 12.025324
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3358
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 185
[LightGBM] [Info] Start training from score 12.028659
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004877 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

---

Optimizacion de los pesos del ensamble

In [5]:
grid_step = 0.05
best = {"weights": None, "rmse": np.inf, "r2": -np.inf}
M = np.column_stack([oof_preds["elasticnet"], oof_preds["lgbm"]])
y_true = y.values

for w1 in np.arange(0, 1 + 1e-9, grid_step):
            w2 = 1 - w1
            y_pred = w1 * oof_preds["elasticnet"] + w2 * oof_preds["lgbm"]
            rmse = root_mean_squared_error(y_true, y_pred)
            r2 = r2_score(y_true, y_pred)
            if rmse < best["rmse"]:
                best["weights"] = (w1, w2)
                best["rmse"] = rmse
                best["r2"] = r2

opt_weights = {"elasticnet": best["weights"][0], "lgbm": best["weights"][1]}

pre_final = build_preprocessor(X)
final_pipes = {}
for name, mdl in base_models.items():
    pipe = Pipeline([("pre", clone(pre_final)), ("model", clone(mdl))])
    pipe.fit(X, y)
    final_pipes[name] = pipe

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3461
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 187
[LightGBM] [Info] Start training from score 12.024057


---

Creamos el experimento de MLFlow

In [15]:
mlflow.set_experiment("HousePrices-2Ensemble")

sub_dir = "../../../data/housing_submissions/elnet_lgbm"
os.makedirs(sub_dir, exist_ok=True)
submission_path = os.path.join(sub_dir, "submission_elnet_lgbm.csv")

meta = {
    "cv": {
        "folds": fold_metrics,
        "rmse_mean": cv_rmse_mean, "rmse_std": cv_rmse_std,
        "r2_mean": cv_r2_mean
    },
    "ensemble_weights": {k: float(v) for k, v in opt_weights.items()},
}

---

Predicciones OOF/TEST y submission

In [16]:
X_test = df_test.drop(["Id"], axis=1)
preds = np.column_stack([
    final_pipes["elasticnet"].predict(X_test),
    final_pipes["lgbm"].predict(X_test)
])

w_vec = np.array([opt_weights["elasticnet"], opt_weights["lgbm"]], dtype=float)
pred_log = preds @ w_vec
pred_orig = np.expm1(pred_log)

submission = pd.DataFrame({"Id": df_test["Id"], "SalePrice": pred_orig})
submission.to_csv(submission_path, index=False)

---

Firmamos el modelo para reproducibilidad

In [17]:
signature = infer_signature(df_test.drop(["Id"], axis=1), pred_orig)

with mlflow.start_run(run_name="ElasticNet+LGBM_Ensemble") as run:
    try:
        enet_est = getattr(final_pipes["elasticnet"], "named_steps", {}).get("model") \
            or final_pipes["elasticnet"].named_steps[list(final_pipes["elasticnet"].named_steps.keys())[-1]]
        lgbm_est = getattr(final_pipes["lgbm"], "named_steps", {}).get("model") \
            or final_pipes["lgbm"].named_steps[list(final_pipes["lgbm"].named_steps.keys())[-1]]
        mlflow.log_param("random_state", rstate)
        mlflow.log_param("elasticnet_alpha", getattr(enet_est, "alpha", None))
        mlflow.log_param("elasticnet_l1_ratio", getattr(enet_est, "l1_ratio", None))
        mlflow.log_param("lgbm_n_estimators", getattr(lgbm_est, "n_estimators", None))
        mlflow.log_param("lgbm_max_depth", getattr(lgbm_est, "max_depth", None))
        mlflow.log_param("lgbm_num_leaves", getattr(lgbm_est, "num_leaves", None))
        mlflow.log_param("lgbm_learning_rate", getattr(lgbm_est, "learning_rate", None))
        mlflow.log_param("lgbm_subsample", getattr(lgbm_est, "subsample", None))
        mlflow.log_param("lgbm_colsample_bytree", getattr(lgbm_est, "colsample_bytree", None))
    except Exception as e:
        print(e)
    
    mlflow.log_param("n_features", X_test.shape[1])
    mlflow.log_param("ensemble_weights_elasticnet", opt_weights["elasticnet"])
    mlflow.log_param("ensemble_weights_lgbm", opt_weights["lgbm"])
    
    mlflow.log_metric("cv_rmse_mean", cv_rmse_mean)
    mlflow.log_metric("cv_rmse_std", cv_rmse_std)
    mlflow.log_metric("cv_r2_mean", cv_r2_mean)

    mlflow.log_dict(meta, "meta.json")
    
    mlflow.sklearn.log_model(
        sk_model=final_pipes["elasticnet"],
        name="models_elasticnet",
        signature=signature,
        input_example=X_test.head()
    )
    
    mlflow.sklearn.log_model(
        sk_model=final_pipes["lgbm"],
        name="models_lgbm",
        signature=signature,
        input_example=X_test.head()
    )
    
    mlflow.log_artifact(submission_path, artifact_path="submissions")
    mlflow.set_experiment_tags({
        "task": "regression",
        "dataset": "Kaggle-HousePrices-Competition",
        "target": "SalePrice",
        "target_transform": "log1p",
        "inference_transform": "expm1",
        "ensemble": "Weighted_ElasticNet+LGBM"
    })
    

---

Mostramos resultados en consola

In [18]:
print("\n=== RESUMEN CV ===")
print(f"RMSE (mean ± std): {cv_rmse_mean:.5f} ± {cv_rmse_std:.5f}")
print(f"R2   (mean):       {cv_r2_mean:.5f}")
print("Pesos óptimos (OOF):", opt_weights)
print(f"Submission guardado en: {submission_path}")
print("Modelos y artefactos registrados en MLflow (ver en la UI del tracking server).")


=== RESUMEN CV ===
RMSE (mean ± std): 0.11708 ± 0.02141
R2   (mean):       0.90927
Pesos óptimos (OOF): {'elasticnet': np.float64(0.5), 'lgbm': np.float64(0.5)}
Submission guardado en: ../../../data/housing_submissions/elnet_lgbm/submission_elnet_lgbm.csv
Modelos y artefactos registrados en MLflow (ver en la UI del tracking server).
