In [2]:
import os
import pandas as pd
import numpy as np
import joblib

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# =====================================================
# 0. CONFIGURATION
# =====================================================
# Name of the label column the classifier is trained to predict.
TARGET = "y_riesgo_alto"

# Project folder on the local machine; the input dataset and the saved
# model are both located relative to it.
DATA_DIR = r"C:\Users\aitor.herran\Desktop\incendios"

# CSV file with the historical dataset.
DATASET_PATH = os.path.join(DATA_DIR, "dataset_ml_prov_daily.csv")

# Destination file for the serialized model (inside a "models" subfolder).
MODEL_PATH = os.path.join(DATA_DIR, "models", "modelo_rf_riesgo_aemet.joblib")

# =====================================================
# 1. LOAD HISTORICAL DATASET
# =====================================================
# Read the CSV with "date" parsed as datetimes and force "provincia" to
# string in the same expression.
df = pd.read_csv(DATASET_PATH, parse_dates=["date"]).astype({"provincia": str})

print("Shape dataset:", df.shape)

# =====================================================
# 2. AEMET FEATURES (NO LAGS, CONSISTENT WITH STREAMLIT)
# =====================================================
# Meteorological predictor columns fed to the model.
feat_cols = [
    "meteo_temp_max",
    "meteo_temp_min",
    "meteo_precip_sum",
    "meteo_wind_max",
    "meteo_humidity_max",
    "meteo_humidity_min",
]

# Defensive check: gather every missing column so a malformed dataset is
# reported in a single error instead of one column per run.
required_cols = feat_cols + [TARGET]
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise ValueError(
        f"❌ Faltan columnas en el dataset: {', '.join(missing_cols)}"
    )

# Keep only rows where all features and the target are present.
df = df.dropna(subset=required_cols)

X = df[feat_cols]
y = df[TARGET]

print("Filas válidas para entrenamiento:", len(df))
print("Distribución clases:")
print(y.value_counts())

# Fail early with an explicit message: the evaluation step reads the
# second column of predict_proba and computes ROC-AUC, and both need two
# classes to be present. An empty frame (0 distinct values) is caught too.
if y.nunique() < 2:
    raise ValueError(
        f"❌ Se necesitan al menos dos clases en {TARGET} "
        f"tras eliminar nulos (filas válidas: {len(df)})"
    )

# =====================================================
# 3. TRAIN AEMET RANDOM FOREST
# =====================================================
# oob_score=True asks scikit-learn to also compute out-of-bag predictions
# after the trees are built, which gives a generalization estimate without
# holding data out of training.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
    oob_score=True,
)

rf.fit(X, y)

# =====================================================
# 4. QUICK EVALUATION (SANITY CHECK)
# =====================================================
# Train-set ROC-AUC: scored on the very rows the forest was fitted on, so
# it only confirms the model learned something; it is optimistic.
y_proba = rf.predict_proba(X)[:, 1]
auc = roc_auc_score(y, y_proba)

print(f"ROC-AUC (train, sanity check): {auc:.3f}")

# Out-of-bag ROC-AUC: each row is scored only by trees whose bootstrap
# sample did not contain it. Rows that were never out-of-bag have no
# prediction (NaN) and are excluded from the score.
# NOTE(review): rows are daily records, so neighbouring days may still be
# correlated; a time-based hold-out would be a stricter check.
oob_proba = rf.oob_decision_function_[:, 1]
oob_mask = np.isfinite(oob_proba)
auc_oob = roc_auc_score(y.to_numpy()[oob_mask], oob_proba[oob_mask])

print(f"ROC-AUC (out-of-bag): {auc_oob:.3f}")

# =====================================================
# 5. SAVE MODEL
# =====================================================
# Make sure the destination folder exists, then serialize the fitted forest.
os.makedirs(os.path.split(MODEL_PATH)[0], exist_ok=True)
joblib.dump(rf, MODEL_PATH)

# Blank line, confirmation message, then the path on its own line.
print("\n✅ Modelo AEMET guardado en:", MODEL_PATH, sep="\n")

Shape dataset: (46498, 42)
Filas válidas para entrenamiento: 46498
Distribución clases:
y_riesgo_alto
0    41763
1     4735
Name: count, dtype: int64
ROC-AUC (train, sanity check): 0.805

✅ Modelo AEMET guardado en:
C:\Users\aitor.herran\Desktop\incendios\models\modelo_rf_riesgo_aemet.joblib
