In [23]:
# ============================================================
# 01_dataset_ml.ipynb
# Construcción del dataset de ML (provincia–día)
# ============================================================

import os
import pandas as pd
import numpy as np

# ---------------------------------------------
# 1. Paths and data loading
# ---------------------------------------------
# The data directory can be overridden with the INCENDIOS_DATA_DIR
# environment variable so this script is not tied to a single user's
# desktop; when the variable is unset the original location is used.
DATA_DIR = os.environ.get(
    "INCENDIOS_DATA_DIR",
    r"C:\Users\aitor.herran\Desktop\incendios",
)
DAILY_VIZ_PATH = os.path.join(DATA_DIR, "prov_daily_viz.csv")

# Load the input table, parsing the "date" column as datetimes.
df = pd.read_csv(DAILY_VIZ_PATH, parse_dates=["date"])
# Cast province names to str and strip surrounding whitespace so that
# later group-bys do not split one province into several keys.
df["provincia"] = df["provincia"].astype(str).str.strip()

# Quick sanity report of what was loaded.
print("Shape prov_daily_viz:", df.shape)
print("Columnas:", df.columns.tolist())
print(df.head())

# Force the key columns to numeric: values that cannot be parsed become
# NaN (errors="coerce") and every NaN is then replaced by 0.
for key_col in ["firms_count", "effis_area_ha"]:
    if key_col not in df.columns:
        continue
    as_number = pd.to_numeric(df[key_col], errors="coerce")
    df[key_col] = as_number.fillna(0)

# ---------------------------------------------
# 2. Target definition: y_riesgo_alto
#    Based on FIRMS: 90th percentile of firms_count
# ---------------------------------------------
# The threshold is the 0.90 quantile of firms_count over every row
# currently in df.
firms_counts = df["firms_count"]
umbral_firms = firms_counts.quantile(0.90)
print("\nUmbral de riesgo (FIRMS P90):", umbral_firms)

# 1 when the row's count reaches the threshold (inclusive), otherwise 0.
reaches_threshold = firms_counts >= umbral_firms
df["y_riesgo_alto"] = reaches_threshold.astype(int)

# Report the class balance, first as raw counts and then as proportions.
target = df["y_riesgo_alto"]
balance_reports = (
    ("\nDistribución de y_riesgo_alto (conteos):", False),
    ("\nDistribución de y_riesgo_alto (proporciones):", True),
)
for heading, as_share in balance_reports:
    print(heading)
    print(target.value_counts(normalize=as_share))

# ---------------------------------------------
# 3. Lag features (previous row per province)
# ---------------------------------------------
# Order rows by province and then by date so that "previous row" within
# a province means "previous recorded date".
df = df.sort_values(["provincia", "date"])

# Columns for which a one-step lag is generated (when present in df).
cols_lag1 = [
    "firms_count",
    "effis_area_ha",
    "meteo_temp_max",
    "meteo_temp_min",
    "meteo_precip_sum",
    "meteo_wind_max",
    "meteo_humidity_max",
    "meteo_humidity_min",
    "meteo_solar_radiation",
]

# NOTE(review): shift(1) takes the previous *row* of the same province,
# not the previous calendar day. If the input has gaps between dates,
# the "_lag1" value can be older than one day - confirm this is intended.
for feature in cols_lag1:
    if feature not in df.columns:
        continue
    previous_row = df.groupby("provincia")[feature].shift(1)
    # The first row of every province has no predecessor and gets NaN.
    df[f"{feature}_lag1"] = previous_row

# ---------------------------------------------
# 4. Rolling means over a 3-row window (weather)
# ---------------------------------------------
# NOTE(review): the window counts rows, not calendar days, and it
# includes the current row. With gaps between dates it covers more than
# three days - confirm this is intended.
cols_roll3 = [
    "meteo_temp_max",
    "meteo_temp_min",
    "meteo_precip_sum",
    "meteo_wind_max",
    "meteo_humidity_min",
]

for weather_col in cols_roll3:
    if weather_col not in df.columns:
        continue
    per_province = df.groupby("provincia")[weather_col]
    # min_periods=1 yields a mean even for the first rows of a province,
    # where fewer than three observations are available.
    windowed_mean = per_province.rolling(window=3, min_periods=1).mean()
    # groupby().rolling() prepends the group key as an extra index level;
    # dropping it restores df's own index so the assignment aligns rows.
    df[f"{weather_col}_roll3_mean"] = windowed_mean.droplevel(0)

# ---------------------------------------------
# 5. Build the ML dataset (drop rows with NaN lags)
# ---------------------------------------------
# Every column whose name ends in "_lag1" is treated as a lag feature.
lag_cols = [name for name in df.columns if name.endswith("_lag1")]

print("\nNúmero de columnas lag1:", len(lag_cols))
print("Ejemplo lag_cols:", lag_cols[:10])

# Keep only rows where all lag features are present; copy so that df_ml
# is an independent frame rather than a view-like slice of df.
rows_with_lags = df.dropna(subset=lag_cols)
df_ml = rows_with_lags.copy()

print("\nFilas originales:", df.shape[0])
print("Filas tras eliminar NaNs de lags:", df_ml.shape[0])

# ---------------------------------------------
# 6. Save the ML dataset
# ---------------------------------------------
# The output CSV is written next to the input, without the index column.
DATASET_ML_PATH = os.path.join(DATA_DIR, "dataset_ml_prov_daily.csv")
df_ml.to_csv(DATASET_ML_PATH, index=False)

# Confirmation message: heading and path on separate lines, then shape.
print("\n✅ Dataset ML guardado en:", DATASET_ML_PATH, sep="\n")
print("Shape final:", df_ml.shape)


Shape prov_daily_viz: (46541, 27)
Columnas: ['provincia', 'date', 'firms_count', 'firms_frp_sum', 'firms_frp_mean', 'firms_brightness_mean', 'firms_brightness_max', 'firms_confidence_mean', 'meteo_temp_max', 'meteo_temp_min', 'meteo_precip_sum', 'meteo_wind_max', 'meteo_humidity_max', 'meteo_humidity_min', 'meteo_solar_radiation', 'effis_area_ha', 'effis_fire_count', 'effis_broadlea_pct', 'effis_conifer_pct', 'effis_mixed_pct', 'effis_scleroph_pct', 'effis_transit_pct', 'effis_othernatlc_pct', 'effis_agriareas_pct', 'effis_artifsurf_pct', 'effis_otherlc_pct', 'effis_percna2k_pct']
  provincia       date  firms_count  firms_frp_sum  firms_frp_mean  \
0     alava 2015-09-19            1           3.69            3.69   
1     alava 2015-10-01            1           7.98            7.98   
2     alava 2015-10-03            1           0.43            0.43   
3     alava 2015-10-10            1           1.67            1.67   
4     alava 2015-10-30            1           0.42            