## Análisis de la Encuesta Continua de Hogares 2024

- Dataset: https://www4.ine.gub.uy/Anda5/index.php/catalog/767/get-microdata
- Diccionario: https://www4.ine.gub.uy/Anda5/index.php/catalog/767/data-dictionary/F4?file_name=ECH_implantacion_2024

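## Comienza el notebook de Simulación Montecarlo.
Se evalúa la estabilidad de los modelos finalistas: random forest simple y stacking. También se ejecuta la simulación para los modelos base del stacking (random forest optimizado y XGBoost optimizado).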

## Carga del dataset depurado

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from tqdm import tqdm

In [3]:
# Load the cleaned datasets. The data directory is defined once so that moving
# the project only requires changing this single path.
DATA_DIR = Path(r'D:\ut603933\Tesis-MCD\data_processed')

X = pd.read_csv(DATA_DIR / 'X_clean.csv')
# y_clean.csv is expected to hold a single column: squeeze("columns") turns it
# into a Series and, unlike a bare squeeze(), never collapses a one-row file
# into a scalar.
y = pd.read_csv(DATA_DIR / 'y_clean.csv').squeeze("columns")

# Features and target must be row-aligned before any train/test split.
if len(X) != len(y):
    raise ValueError(
        f"X e y tienen distinto número de filas: {len(X)} != {len(y)}"
    )

print("Shapes cargados:")
print("X:", X.shape)
print("y:", y.shape)

Shapes cargados:
X: (55923, 2846)
y: (55923,)


## Definición de modelos finalistas

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor

In [5]:
# Simple random forest: 150 trees, no depth limit.
rf_simple = RandomForestRegressor(
    random_state=42, n_jobs=-1,
    n_estimators=150, max_depth=None, min_samples_leaf=1,
)

# Optimized random forest: more trees and larger split/leaf minimums.
rf_best = RandomForestRegressor(
    random_state=42, n_jobs=-1,
    n_estimators=600, max_depth=None,
    min_samples_split=5, min_samples_leaf=2,
)

# Optimized XGBoost: 600 boosting rounds with row and column subsampling.
xgb_best = XGBRegressor(
    random_state=42, n_jobs=-1,
    n_estimators=600, max_depth=5, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8,
)

# Final stacking: optimized RF and optimized XGB as base estimators,
# combined by a linear regression fitted with 3-fold cross-validation.
stacking_model = StackingRegressor(
    estimators=[("RF", rf_best), ("XGB", xgb_best)],
    final_estimator=LinearRegression(),
    n_jobs=-1,
    cv=3,
)

## Función de Simulación Montecarlo

In [6]:
def run_montecarlo(model, X, y, n_iter=200, test_size=0.20):
    """
    Measure how stable a model's test metrics are across random splits.

    For every seed in ``range(n_iter)`` the data is split with
    ``train_test_split(..., random_state=seed)``, a fresh unfitted copy of
    ``model`` is trained on the training part and evaluated on the held-out
    part.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``
        Template estimator. It is never fitted here: each iteration works on
        ``clone(model, safe=False)``, which copies the hyper-parameters
        (including any fixed ``random_state``) but no fitted state. With a
        fixed model ``random_state`` only the split changes between
        iterations.
    X, y : array-like
        Features and target, as accepted by ``train_test_split`` and by the
        estimator's ``fit``.
    n_iter : int, default 200
        Number of random splits; seeds ``0 .. n_iter - 1`` are used.
    test_size : float or int, default 0.20
        Forwarded to ``train_test_split``.

    Returns
    -------
    dict
        Keys ``"RMSE"``, ``"MAE"`` and ``"R2"``, each mapping to a NumPy
        array with one value per iteration.
    """

    rmse_list = []
    mae_list = []
    r2_list = []

    for seed in tqdm(range(n_iter), desc="Montecarlo"):

        # A different random split on every iteration.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed
        )

        # Train from scratch on an independent copy so that the caller's
        # estimator is not mutated and no state is shared between iterations.
        # safe=False falls back to a deep copy for non-scikit-learn objects.
        modelo = clone(model, safe=False)
        modelo.fit(X_train, y_train)

        y_pred = modelo.predict(X_test)

        # Hold-out metrics for this split.
        rmse_list.append(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae_list.append(mean_absolute_error(y_test, y_pred))
        r2_list.append(r2_score(y_test, y_pred))

    return {
        "RMSE": np.array(rmse_list),
        "MAE": np.array(mae_list),
        "R2": np.array(r2_list)
    }


## Función resumen de métricas

In [7]:
def resumen_metricas(nombre, metricas):
    """Summarize the per-iteration metric samples of one model.

    Parameters
    ----------
    nombre : label stored under the ``"Modelo"`` key.
    metricas : mapping with ``"RMSE"``, ``"MAE"`` and ``"R2"`` entries, each
        an array exposing ``.mean()`` and ``.std()`` (for NumPy arrays the
        latter is the population standard deviation, ddof=0).

    Returns
    -------
    dict with ``"Modelo"`` followed, for each metric in the order RMSE, MAE,
    R2, by ``<metric>_media``, ``<metric>_std``, ``<metric>_p2.5`` and
    ``<metric>_p97.5``.
    """
    resumen = {"Modelo": nombre}

    for clave in ("RMSE", "MAE", "R2"):
        valores = metricas[clave]

        resumen[f"{clave}_media"] = valores.mean()
        resumen[f"{clave}_std"] = valores.std()

        # Empirical 2.5th and 97.5th percentiles of the sampled metric.
        for etiqueta, cuantil in (("p2.5", 2.5), ("p97.5", 97.5)):
            resumen[f"{clave}_{etiqueta}"] = np.percentile(valores, cuantil)

    return resumen

## Ejecución de la simulación para los modelos finalistas

In [12]:
# Per-model store of the metric arrays returned by run_montecarlo, keyed by model label.
# Re-running this cell resets it to an empty dict and drops any results already stored.
resultados = {}

In [13]:
# Monte Carlo run for the simple random forest: 100 random splits.
# The recorded output shows this cell took about 8 hours to complete.
print("\n=== Ejecutando Montecarlo: Random Forest SIMPLE ===")
resultados["RF_simple"] = run_montecarlo(rf_simple, X, y, n_iter=100)


=== Ejecutando Montecarlo: Random Forest SIMPLE ===


Montecarlo: 100%|██████████| 100/100 [8:02:26<00:00, 289.47s/it] 


In [14]:
# Print the summary statistics (mean, std, 2.5/97.5 percentiles) of the RF_simple runs.
print("\nResumen parcial RF_simple:")
print(resumen_metricas("RF_simple", resultados["RF_simple"]))


Resumen parcial RF_simple:
{'Modelo': 'RF_simple', 'RMSE_media': 0.16326241032398225, 'RMSE_std': 0.00891707335557199, 'RMSE_p2.5': 0.1502283799715136, 'RMSE_p97.5': 0.18585961569139975, 'MAE_media': 0.0911523599838265, 'MAE_std': 0.0012231592221087955, 'MAE_p2.5': 0.08872382768635106, 'MAE_p97.5': 0.09351375874514148, 'R2_media': 0.9428079261292099, 'R2_std': 0.00533453773005676, 'R2_p2.5': 0.9301945518755581, 'R2_p97.5': 0.949865790051853}


In [15]:
# Monte Carlo run for the optimized random forest: 100 random splits.
# The recorded output shows this cell took about 32 hours to complete.
print("\n=== Ejecutando Montecarlo: Random Forest OPTIMIZADO ===")
resultados["RF_opt"] = run_montecarlo(rf_best, X, y, n_iter=100)


=== Ejecutando Montecarlo: Random Forest OPTIMIZADO ===


Montecarlo: 100%|██████████| 100/100 [31:55:50<00:00, 1149.51s/it]  


In [16]:
# Print the summary statistics (mean, std, 2.5/97.5 percentiles) of the RF_opt runs.
print("Resultados parciales Random Forest OPTIMIZADO:")
print(resumen_metricas("RF_opt", resultados["RF_opt"]))

Resultados parciales Random Forest OPTIMIZADO:
{'Modelo': 'RF_opt', 'RMSE_media': 0.1664453950395651, 'RMSE_std': 0.00845745724924377, 'RMSE_p2.5': 0.15329074581049987, 'RMSE_p97.5': 0.18776443364636275, 'MAE_media': 0.09413029275261384, 'MAE_std': 0.0012226907402002171, 'MAE_p2.5': 0.09168757201568299, 'MAE_p97.5': 0.0964327291197836, 'R2_media': 0.940576298492341, 'R2_std': 0.004997403372315126, 'R2_p2.5': 0.9279748063443559, 'R2_p97.5': 0.94744323516651}


In [17]:
# Monte Carlo run for the optimized XGBoost model: 100 random splits.
# The recorded output shows this cell took about 1 hour to complete.
print("\n=== Ejecutando Montecarlo: XGBoost OPTIMIZADO ===")
resultados["XGB_opt"] = run_montecarlo(xgb_best, X, y, n_iter=100)


=== Ejecutando Montecarlo: XGBoost OPTIMIZADO ===


Montecarlo: 100%|██████████| 100/100 [1:00:17<00:00, 36.17s/it]


In [18]:
# Print the summary statistics (mean, std, 2.5/97.5 percentiles) of the XGB_opt runs.
print("Resultados parciales XGBoost OPTIMIZADO:")
print(resumen_metricas("XGB_opt", resultados["XGB_opt"]))

Resultados parciales XGBoost OPTIMIZADO:
{'Modelo': 'XGB_opt', 'RMSE_media': 0.197107985274924, 'RMSE_std': 0.008794948293921582, 'RMSE_p2.5': 0.1848651288773758, 'RMSE_p97.5': 0.22055923333305966, 'MAE_media': 0.13738510800365478, 'MAE_std': 0.0013416623379005072, 'MAE_p2.5': 0.13455491576467463, 'MAE_p97.5': 0.1399650871998259, 'R2_media': 0.9166842638629492, 'R2_std': 0.006140299338190928, 'R2_p2.5': 0.9013890597311565, 'R2_p97.5': 0.9258607944671646}


In [21]:
# Monte Carlo run for the stacking model. NOTE: only 20 random splits are used here,
# versus 100 for the other models, so its summary statistics rest on fewer samples.
# The recorded output shows this cell took about 18 hours 40 minutes to complete.
print("\n=== Ejecutando Montecarlo: STACKING RF + XGB ===")
resultados["Stacking"] = run_montecarlo(stacking_model, X, y, n_iter=20)


=== Ejecutando Montecarlo: STACKING RF + XGB ===


Montecarlo: 100%|██████████| 20/20 [18:40:18<00:00, 3360.91s/it]  


In [22]:
# Print the summary statistics (mean, std, 2.5/97.5 percentiles) of the Stacking runs.
print("Resultados parciales Stacking:")
print(resumen_metricas("Stacking", resultados["Stacking"]))

Resultados parciales Stacking:
{'Modelo': 'Stacking', 'RMSE_media': 0.16533809505112892, 'RMSE_std': 0.009906610797342546, 'RMSE_p2.5': 0.15222246764476777, 'RMSE_p97.5': 0.1893005825881217, 'MAE_media': 0.1028972717543833, 'MAE_std': 0.001974869348750815, 'MAE_p2.5': 0.10016439720072337, 'MAE_p97.5': 0.10677028618562599, 'R2_media': 0.9413729326875513, 'R2_std': 0.0062139005727493745, 'R2_p2.5': 0.9272937906349461, 'R2_p97.5': 0.9492572383748434}


## Tabla con intervalos de confianza (percentiles empíricos 2.5 y 97.5)

In [23]:
# One summary row per model, in the order the simulations were run.
tabla_resumen = pd.DataFrame(
    [
        resumen_metricas(nombre, resultados[nombre])
        for nombre in ("RF_simple", "RF_opt", "XGB_opt", "Stacking")
    ]
)

# Persist the summary table as CSV (relative path, no index column).
tabla_resumen.to_csv("Montecarlo_Resultados.csv", index=False)

# Show the table under its banner.
print("\n=== TABLA DE INTERVALOS DE CONFIANZA ===")
print(tabla_resumen)


=== TABLA DE INTERVALOS DE CONFIANZA ===
      Modelo  RMSE_media  RMSE_std  RMSE_p2.5  RMSE_p97.5  MAE_media  \
0  RF_simple    0.163262  0.008917   0.150228    0.185860   0.091152   
1     RF_opt    0.166445  0.008457   0.153291    0.187764   0.094130   
2    XGB_opt    0.197108  0.008795   0.184865    0.220559   0.137385   
3   Stacking    0.165338  0.009907   0.152222    0.189301   0.102897   

    MAE_std  MAE_p2.5  MAE_p97.5  R2_media    R2_std   R2_p2.5  R2_p97.5  
0  0.001223  0.088724   0.093514  0.942808  0.005335  0.930195  0.949866  
1  0.001223  0.091688   0.096433  0.940576  0.004997  0.927975  0.947443  
2  0.001342  0.134555   0.139965  0.916684  0.006140  0.901389  0.925861  
3  0.001975  0.100164   0.106770  0.941373  0.006214  0.927294  0.949257  
