# Random Forest Regressor

---

Importamos librerias

In [6]:
import os
import sys
import warnings
import joblib

import pandas as pd
import numpy as np

from sklearn.base import clone
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature

sys.path.append('../../../utils/')
from utils_yose import build_preprocessor, load_data
from utils_yose import train_data, test_data
from mlflow_setup import setup_mlflow

sys.path.append('../../ensemble_2p/')
from model_ensemble.ensemble import WeightedEnsemble

warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.5f' % x)

MODEL_NAME = "random_forest"

---

In [7]:
df_train, X_test, df_test = load_data()

---

In [8]:
y = np.log1p(df_train["SalePrice"]).astype(float)
X = df_train.drop(["SalePrice", "Id"], axis=1)

rstate = 42
base_models = {
    "rf": RandomForestRegressor(n_estimators=1000, random_state=rstate, max_depth=None, n_jobs=-1, oob_score=True, verbose=1, warm_start=True, criterion="friedman_mse", min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, max_features="sqrt", max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True)
}

---

Entrenamiento del modelo con Cross-Validation

In [9]:
kf = KFold(n_splits=10, shuffle=True, random_state=rstate)
oof_preds = {name: np.zeros(len(X), dtype=float) for name in base_models}
oof_idx_mask = np.zeros(len(X), dtype=bool)
fold_metrics = []


for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]
    pre = build_preprocessor(X_tr)
    fold_preds = {}
    for name, mdl in base_models.items():
        pipe = Pipeline([("pre", clone(pre)), ("model", clone(mdl))])
        pipe.fit(X_tr, y_tr)
        p = pipe.predict(X_va)
        fold_preds[name] = p
    p_ens = np.mean(np.column_stack([fold_preds[n] for n in base_models]), axis=1)
    rmse = root_mean_squared_error(y_va, p_ens)
    r2 = r2_score(y_va, p_ens)
    fold_metrics.append({"fold": fold, "rmse": float(rmse), "r2": float(r2)})
    for name in base_models:
        oof_preds[name][va_idx] = fold_preds[name]
    oof_idx_mask[va_idx] = True
    
best_fold = np.argmin([m["rmse"] for m in fold_metrics])
pipe = Pipeline([("pre", clone(pre)), ("model", clone(base_models["rf"]))])
pipe.fit(X, y)
rf = pipe

os.makedirs("../model_random_forest/random_forest_model/", exist_ok=True)
joblib.dump(rf, "../model_random_forest/random_forest_model/model_rf.pkl")

cv_rmse_mean = float(np.mean([m["rmse"] for m in fold_metrics]))
cv_rmse_std  = float(np.std([m["rmse"] for m in fold_metrics]))
cv_r2_mean   = float(np.mean([m["r2"] for m in fold_metrics]))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    0.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.

---

Creamos y guardamos con MLFlow

In [10]:
exp_id = setup_mlflow(
    experiment="HousePrices-Competition", 
    description="RandomForestRegressor", 
    experiment_tags={"Kaggle-Model": "RandomForestRegressor"}
)

prediction_example = pd.read_csv("../../../../data/housing_data/sample_submission.csv").drop(["Id"], axis=1)

signature = infer_signature(df_test, prediction_example)

with mlflow.start_run(run_name="RandomForestRegressor", experiment_id=exp_id) as run:
    
    try:
        rf_est = getattr(rf, "named_steps", {}).get("model") or rf.named_steps[list(rf.named_steps.keys())[-1]]

        
        mlflow.log_param("random_state", rstate)
        mlflow.log_param("n_estimators", rf_est.n_estimators)
        mlflow.log_param("max_depth", rf_est.max_depth)
        mlflow.log_param("criterion", rf_est.criterion)
        mlflow.log_param("min_samples_split", rf_est.min_samples_split)
        mlflow.log_param("min_samples_leaf", rf_est.min_samples_leaf)
        mlflow.log_param("min_weight_fraction_leaf", rf_est.min_weight_fraction_leaf)
        mlflow.log_param("max_features", rf_est.max_features)
        mlflow.log_param("max_leaf_nodes", rf_est.max_leaf_nodes)
        mlflow.log_param("min_impurity_decrease", rf_est.min_impurity_decrease)
        mlflow.log_param("bootstrap", rf_est.bootstrap)
    except Exception as e:
        print(e)
    
    train_ds = mlflow.data.from_pandas(df_train, source="data/housing_data/train.csv")
    mlflow.log_input(train_ds, context="training")
    
    test_ds = mlflow.data.from_pandas(pd.DataFrame(df_test), source="data/housing_data/test.csv")
    mlflow.log_input(test_ds, context="inference")
    
    mlflow.log_param("n_features", df_test.shape[1])
    mlflow.log_param("n_samples", df_train.shape[0])
    mlflow.log_param("n_folds", kf.get_n_splits())
    
    mlflow.log_metric("cv_rmse_mean", cv_rmse_mean)
    mlflow.log_metric("cv_rmse_std", cv_rmse_std)
    mlflow.log_metric("cv_r2_mean", cv_r2_mean)
    
    mlflow.sklearn.log_model(
        sk_model=rf,
        name="random_forest_model",
        signature=signature
    )
    
    mlflow.log_artifact(train_data, artifact_path="data")
    mlflow.log_artifact(test_data, artifact_path="data")

    mlflow.log_artifact("../model_random_forest/random_forest_model/model_rf.pkl", artifact_path="random_forest_model")

    mlflow.set_tags({
        "task": "regression",
        "dataset": "Kaggle-HousePrices-Competition",
        "target": "SalePrice",
        "target_transform": "log1p",
        "inference_transform": "expm1",
        "model":"RandomForestRegressor"
    })
    MODEL_URI = f"runs:/{run.info.run_id}/random_forest_model"
    print("URI del modelo:", f"runs:/{run.info.run_id}/random_forest_model")

client = MlflowClient()
remote_path = ['random_forest_model']
local_dir = "../model_random_forest/"
for path in remote_path:
    client.download_artifacts(run_id=run.info.run_id, path=path, dst_path=local_dir)

print("Guardado en:", local_dir)

[MLflow] Tracking URI: file:/Users/yosesotomayor/Code/retoCasas/.ML
[MLflow] Registry URI: file:/Users/yosesotomayor/Code/retoCasas/.ML
[MLflow] Experimento:  HousePrices-Competition (id=116606298190487712)
[MLflow] Artifact loc:  file:///Users/yosesotomayor/Code/retoCasas/.ML/116606298190487712
URI del modelo: runs:/1f5cdce58bd44b1289c1d6a4423e2ea1/random_forest_model


Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 35.77it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 25.42it/s]  

Guardado en: ../model_random_forest/





---

Mostrar resultados

In [11]:
print("\n=== RESUMEN CV ===")
print(f"RMSE (mean ± std): {cv_rmse_mean:.5f} ± {cv_rmse_std:.5f}")
print(f"R2   (mean):       {cv_r2_mean:.5f}")
print("Modelos y artefactos registrados en MLflow (ver en la UI del tracking server).")


=== RESUMEN CV ===
RMSE (mean ± std): 0.14216 ± 0.02032
R2   (mean):       0.87134
Modelos y artefactos registrados en MLflow (ver en la UI del tracking server).


---

Deployment del modelo

In [12]:
mv = mlflow.register_model(
    model_uri = MODEL_URI,
    name = MODEL_NAME,
    tags = {
        "task": "regression",
        "dataset": "Kaggle-HousePrices-Competition",
        "target": "SalePrice",
        "target_transform": "log1p",
        "inference_transform": "expm1",
        "model": "RandomForestRegressor"
    },
)

client.set_registered_model_tag(mv.name, "created_by", "Yose Sotomayor")
client.set_registered_model_alias(mv.name, "challenger", version=mv.version)
print("Version registrada:", mv.version)

Registered model 'random_forest' already exists. Creating a new version of this model...


Version registrada: 2


Created version '2' of model 'random_forest'.


Transicionar versiones

In [13]:
client.delete_registered_model_alias(MODEL_NAME, "champion")
client.set_registered_model_alias(MODEL_NAME, "challenger", mv.version)

---

Guardar archivo para Kaggle

In [19]:
model = mlflow.pyfunc.load_model(f"models:/{MODEL_NAME}@challenger")

preds_orig  = model.predict(df_test)

sub_dir = "../../../../data/housing_submissions/random_forest/"
os.makedirs(sub_dir, exist_ok=True)
submission_path = os.path.join(sub_dir, "submission_random_forest.csv")


df_sub = pd.DataFrame({"Id": df_test['Id'], "SalePrice": preds_orig})
df_sub.to_csv(submission_path, index=False)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.1s finished


---