In [1]:
# ==========================================
# 0. CONFIGURACI√ìN GENERAL
# ==========================================
import os
import pandas as pd
import numpy as np
import geopandas as gpd

pd.set_option("display.max_columns", None)

# Carpeta donde tienes TODO
DATA_DIR = r"C:\Users\aitor.herran\Desktop\incendios"

# Rutas de entrada
FIRMS_RAW_PATH   = os.path.join(DATA_DIR, "firms_spain.csv")
METEO_PATH       = os.path.join(DATA_DIR, "openmeteo_historico.csv")
GADM_PATH        = os.path.join(DATA_DIR, "gadm41_ESP_2.json")
EFFIS_SHP_PATH   = os.path.join(DATA_DIR, "copernicus", "modis.ba.poly.shp")

print("‚úÖ Configuraci√≥n OK")
print("FIRMS:", FIRMS_RAW_PATH)
print("METEO:", METEO_PATH)
print("GADM :", GADM_PATH)
print("EFFIS:", EFFIS_SHP_PATH)


‚úÖ Configuraci√≥n OK
FIRMS: C:\Users\aitor.herran\Desktop\incendios\firms_spain.csv
METEO: C:\Users\aitor.herran\Desktop\incendios\openmeteo_historico.csv
GADM : C:\Users\aitor.herran\Desktop\incendios\gadm41_ESP_2.json
EFFIS: C:\Users\aitor.herran\Desktop\incendios\copernicus\modis.ba.poly.shp


In [2]:
# ==========================================
# 1. FIRMS NUEVO + ASIGNAR PROVINCIA
# ==========================================

# 1.1 Cargar FIRMS nuevo
fires = pd.read_csv(FIRMS_RAW_PATH, low_memory=False)
print("FIRMS cargado:", len(fires), "filas")
print("Columnas:", fires.columns.tolist())

# Asegurar nombres esperados (por si vienen con may√∫sculas raras)
fires = fires.rename(columns={
    "ACQ_DATE": "acq_date",
    "ACQ_TIME": "acq_time",
    "LATITUDE": "latitude",
    "LONGITUDE": "longitude",
    "BRIGHTNESS": "brightness",
})

# Convertir fecha
fires["acq_date"] = pd.to_datetime(fires["acq_date"], errors="coerce")

print("üìÖ Rango de fechas FIRMS:",
      fires["acq_date"].min(), "‚Üí", fires["acq_date"].max())

# 1.2 Normalizar confidence (si viene como letras l/n/h)
if "confidence" in fires.columns:
    # Si es texto tipo l/n/h, lo convertimos; si ya es num√©rico, no tocamos
    if fires["confidence"].dtype == "object":
        conf_map = {"l": 1, "n": 2, "h": 3}
        fires["confidence"] = (
            fires["confidence"]
            .astype(str)
            .str.strip()
            .str.lower()
            .map(conf_map)
        )
        print("‚úÖ 'confidence' convertido a escala 1/2/3 (l/n/h).")
    else:
        print("‚ÑπÔ∏è 'confidence' ya era num√©rico; no se modifica.")
else:
    print("‚ö†Ô∏è No existe columna 'confidence' en FIRMS.")

# 1.3 Asignar provincia con GADM
print("üó∫Ô∏è Cargando provincias GADM‚Ä¶")
provincias = gpd.read_file(GADM_PATH)[["NAME_2", "geometry"]]
provincias = provincias.rename(columns={"NAME_2": "provincia"})

# GeoDataFrame de FIRMS en WGS84
fires_gdf = gpd.GeoDataFrame(
    fires,
    geometry=gpd.points_from_xy(fires["longitude"], fires["latitude"]),
    crs="EPSG:4326"
)

print("üîé Haciendo join espacial punto‚Üíprovincia‚Ä¶ (puede tardar)")
fires_with_prov = gpd.sjoin(fires_gdf, provincias, how="left", predicate="within")
fires_with_prov = fires_with_prov.drop(columns=["index_right"])

# Guardar intermedio con provincia
FIRMS_PROV_PATH = os.path.join(DATA_DIR, "firms_spain_provincia.csv")
fires_with_prov.to_csv(FIRMS_PROV_PATH, index=False)

print("‚úÖ Guardado:", FIRMS_PROV_PATH)
print("   Filas:", len(fires_with_prov))
print("   Provincias √∫nicas:", fires_with_prov["provincia"].nunique())
fires_with_prov.head()


FIRMS cargado: 338943 filas
Columnas: ['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date', 'acq_time', 'satellite', 'instrument', 'confidence', 'bright_t31', 'frp', 'daynight', 'type', 'source_file', 'dataset_type', 'satellite_source']
üìÖ Rango de fechas FIRMS: 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00
‚úÖ 'confidence' convertido a escala 1/2/3 (l/n/h).
üó∫Ô∏è Cargando provincias GADM‚Ä¶
üîé Haciendo join espacial punto‚Üíprovincia‚Ä¶ (puede tardar)
‚úÖ Guardado: C:\Users\aitor.herran\Desktop\incendios\firms_spain_provincia.csv
   Filas: 338944
   Provincias √∫nicas: 52


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,bright_t31,frp,daynight,type,source_file,dataset_type,satellite_source,geometry,provincia
0,43.22312,-2.87678,298.46,0.5,0.49,2018-04-01,133,N20,VIIRS,2,275.84,0.85,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (-2.87678 43.22312),Vizcaya
1,41.47758,-1.48748,303.02,0.41,0.45,2018-04-01,133,N20,VIIRS,2,274.7,0.84,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (-1.48748 41.47758),Zaragoza
2,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),Barcelona
3,41.45441,1.97884,305.53,0.44,0.38,2018-04-01,133,N20,VIIRS,2,282.21,1.56,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.97884 41.45441),Barcelona
4,43.22594,-2.87745,296.51,0.5,0.49,2018-04-01,133,N20,VIIRS,2,276.81,0.7,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (-2.87745 43.22594),Vizcaya


In [3]:
# ==========================================
# 2. MERGE FIRMS + OPEN-METEO POR PROVINCIA Y FECHA
# ==========================================

def normaliza_prov(x: pd.Series) -> pd.Series:
    return (x.astype(str)
              .str.normalize("NFKD")
              .str.encode("ascii", errors="ignore")
              .str.decode("utf-8")
              .str.lower()
              .str.strip())

# 2.1 Cargar FIRMS con provincia
fires = pd.read_csv(FIRMS_PROV_PATH, low_memory=False)
fires["acq_date"] = pd.to_datetime(fires["acq_date"], errors="coerce")
fires["provincia"] = normaliza_prov(fires["provincia"])

print("üî• FIRMS+provincia:", len(fires), "filas")
print("   Rango fechas:", fires["acq_date"].min(), "‚Üí", fires["acq_date"].max())

# 2.2 Cargar Open-Meteo hist√≥rico
meteo = pd.read_csv(METEO_PATH, low_memory=False)
meteo["time"] = pd.to_datetime(meteo["time"], errors="coerce")
meteo["provincia"] = normaliza_prov(meteo["provincia"])

print("üå§Ô∏è Open-Meteo:", len(meteo), "filas")
print("   Rango fechas:", meteo["time"].min(), "‚Üí", meteo["time"].max())

print("Provincias FIRMS:", len(fires["provincia"].unique()))
print("Provincias METEO:", len(meteo["provincia"].unique()))

# 2.3 Merge
merged = pd.merge(
    fires,
    meteo,
    left_on=["provincia", "acq_date"],
    right_on=["provincia", "time"],
    how="inner",
)

MERGED_PATH = os.path.join(DATA_DIR, "merged_fires_openmeteo_provincia.csv")
merged.to_csv(MERGED_PATH, index=False)

print(f"‚úÖ Datos combinados FIRMS+METEO: {len(merged):,} registros")
print("   Rango fechas:", merged["acq_date"].min(), "‚Üí", merged["acq_date"].max())
print("üíæ Guardado:", MERGED_PATH)
merged.head()


üî• FIRMS+provincia: 338944 filas
   Rango fechas: 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00
üå§Ô∏è Open-Meteo: 439640 filas
   Rango fechas: 2015-01-01 00:00:00 ‚Üí 2025-11-06 00:00:00
Provincias FIRMS: 53
Provincias METEO: 50
‚úÖ Datos combinados FIRMS+METEO: 635,484 registros
   Rango fechas: 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00
üíæ Guardado: C:\Users\aitor.herran\Desktop\incendios\merged_fires_openmeteo_provincia.csv


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,bright_t31,frp,daynight,type,source_file,dataset_type,satellite_source,geometry,provincia,time,temperature_2m_max,temperature_2m_min,precipitation_sum,windspeed_10m_max,relative_humidity_2m_max,relative_humidity_2m_min,shortwave_radiation_sum,lat,lon
0,41.47758,-1.48748,303.02,0.41,0.45,2018-04-01,133,N20,VIIRS,2,274.7,0.84,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (-1.48748 41.47758),zaragoza,2018-04-01,19.8,3.9,0.0,18.9,86,27,20.71,41.65,-0.89
1,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.7,6.9,0.0,16.9,90,45,21.52,41.3874,2.1686
2,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.6,6.8,0.0,16.9,90,45,21.52,41.39,2.17
3,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.6,6.8,0.0,16.9,90,45,21.52,41.39,2.17
4,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.6,6.8,0.0,16.9,90,45,21.52,41.39,2.17


In [5]:
# ==========================================
# 3. INTEGRAR EFFIS (COPERNICUS) POR PROVINCIA Y FECHA
# ==========================================
import numpy as np

print("üî• Cargando shapefile EFFIS‚Ä¶")
effis = gpd.read_file(EFFIS_SHP_PATH)

# Fecha y √°rea
effis["fecha"]   = pd.to_datetime(effis["FIREDATE"], errors="coerce")
effis["AREA_HA"] = pd.to_numeric(effis["AREA_HA"], errors="coerce")
effis = effis.dropna(subset=["fecha", "AREA_HA"])

# Filtrar solo Espa√±a
effis_es = effis[effis["COUNTRY"] == "ES"].copy()
print("Registros totales EFFIS:", len(effis))
print("üá™üá∏ EFFIS Espa√±a:", len(effis_es))
print("Rango fechas EFFIS:", effis_es["fecha"].min(), "‚Üí", effis_es["fecha"].max())

# Provincias GADM
provincias = gpd.read_file(GADM_PATH)[["NAME_2", "geometry"]]
provincias = provincias.rename(columns={"NAME_2": "provincia"})

# Proyecci√≥n m√©trica (EPSG:3035) para calcular √°rea intersecci√≥n
effis_es_3035 = effis_es.to_crs(3035)
provincias_3035 = gpd.GeoDataFrame(provincias, geometry=provincias.geometry).to_crs(3035)

print("üîß Haciendo overlay EFFIS x provincias‚Ä¶ (tarda un poco)")
effis_prov = gpd.overlay(effis_es_3035, provincias_3035, how="intersection", keep_geom_type=True)

# √Årea en hect√°reas de la parte del pol√≠gono en cada provincia
effis_prov["area_ha"]   = effis_prov.geometry.area / 10_000
effis_prov["provincia"] = normaliza_prov(effis_prov["provincia"])

# Columnas de vegetaci√≥n (si existen)
veg_cols_all = [
    "BROADLEA","CONIFER","MIXED","SCLEROPH","TRANSIT",
    "OTHERNATLC","AGRIAREAS","ARTIFSURF","OTHERLC","PERCNA2K"
]
veg_cols = [c for c in veg_cols_all if c in effis_prov.columns]
for c in veg_cols:
    effis_prov[c] = pd.to_numeric(effis_prov[c], errors="coerce")

def agg_weighted(df):
    out = {}
    total_area = df["area_ha"].sum()
    out["effis_area_ha"] = total_area
    out["effis_fire_count"] = df["id"].nunique() if "id" in df.columns else df.shape[0]
    for c in veg_cols:
        out[f"effis_{c.lower()}_pct"] = (
            (df[c] * df["area_ha"]).sum() / total_area if total_area > 0 else np.nan
        )
    return pd.Series(out, dtype="float64")

ba_daily = (
    effis_prov.groupby(["provincia", "fecha"], as_index=False)
              .apply(agg_weighted)
              .reset_index()
              .drop(columns=["level_2"], errors="ignore")
)

ba_daily["provincia"] = normaliza_prov(ba_daily["provincia"])

print("‚úÖ Tabla EFFIS agregada provincia‚Äìd√≠a:", len(ba_daily), "filas")
print("   Rango fechas:", ba_daily["fecha"].min(), "‚Üí", ba_daily["fecha"].max())
ba_daily.head()


üî• Cargando shapefile EFFIS‚Ä¶
Registros totales EFFIS: 83915
üá™üá∏ EFFIS Espa√±a: 6528
Rango fechas EFFIS: 2016-04-30 00:00:00 ‚Üí 2025-11-05 13:20:00
üîß Haciendo overlay EFFIS x provincias‚Ä¶ (tarda un poco)
‚úÖ Tabla EFFIS agregada provincia‚Äìd√≠a: 4550 filas
   Rango fechas: 2016-04-30 00:00:00 ‚Üí 2025-11-05 13:20:00


  .apply(agg_weighted)


Unnamed: 0,index,provincia,fecha,effis_area_ha,effis_fire_count,effis_broadlea_pct,effis_conifer_pct,effis_mixed_pct,effis_scleroph_pct,effis_transit_pct,effis_othernatlc_pct,effis_agriareas_pct,effis_artifsurf_pct,effis_otherlc_pct,effis_percna2k_pct
0,0,acoruna,2016-05-01,34.584698,1.0,57.575758,0.0,0.0,0.0,0.0,42.424242,0.0,0.0,0.0,0.0
1,1,acoruna,2016-06-27,22.565679,1.0,0.0,0.0,0.0,0.0,31.818182,68.181818,0.0,0.0,0.0,100.0
2,2,acoruna,2016-08-08,99.278724,1.0,0.0,38.0,0.0,0.0,3.0,30.0,29.0,0.0,0.0,25.319339
3,3,acoruna,2016-08-09,78.496135,1.0,0.0,0.0,22.78481,0.0,5.063291,72.151899,0.0,0.0,0.0,0.0
4,4,acoruna,2016-08-10,2831.644659,4.0,9.566053,3.187794,4.598346,0.0,29.731232,46.38345,6.391982,0.141143,0.0,0.542921


In [6]:
# ==========================================
# 4. MERGE TOTAL FIRMS+METEO+EFFIS
# ==========================================

merged = pd.read_csv(MERGED_PATH, low_memory=False, parse_dates=["acq_date"])
merged["provincia"] = normaliza_prov(merged["provincia"])

merged_full = merged.merge(
    ba_daily.rename(columns={"fecha": "acq_date"}),
    on=["provincia", "acq_date"],
    how="left"
)

# Rellenar nulos l√≥gicos
merged_full["effis_area_ha"]   = pd.to_numeric(merged_full["effis_area_ha"], errors="coerce").fillna(0.0)
merged_full["effis_fire_count"] = pd.to_numeric(merged_full["effis_fire_count"], errors="coerce").fillna(0).astype(int)

for c in [col for col in merged_full.columns if col.startswith("effis_") and col.endswith("_pct")]:
    merged_full[c] = pd.to_numeric(merged_full[c], errors="coerce").fillna(0.0)

MERGED_FULL_PATH = os.path.join(DATA_DIR, "merged_full_fires_openmeteo_effis.csv")
merged_full.to_csv(MERGED_FULL_PATH, index=False)

print("üíæ Guardado:", MERGED_FULL_PATH)
print("   Filas:", len(merged_full), "| Columnas:", len(merged_full.columns))
print("   Rango fechas:", merged_full["acq_date"].min(), "‚Üí", merged_full["acq_date"].max())
merged_full.head()


üíæ Guardado: C:\Users\aitor.herran\Desktop\incendios\merged_full_fires_openmeteo_effis.csv
   Filas: 635484 | Columnas: 42
   Rango fechas: 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00


Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,bright_t31,frp,daynight,type,source_file,dataset_type,satellite_source,geometry,provincia,time,temperature_2m_max,temperature_2m_min,precipitation_sum,windspeed_10m_max,relative_humidity_2m_max,relative_humidity_2m_min,shortwave_radiation_sum,lat,lon,index,effis_area_ha,effis_fire_count,effis_broadlea_pct,effis_conifer_pct,effis_mixed_pct,effis_scleroph_pct,effis_transit_pct,effis_othernatlc_pct,effis_agriareas_pct,effis_artifsurf_pct,effis_otherlc_pct,effis_percna2k_pct
0,41.47758,-1.48748,303.02,0.41,0.45,2018-04-01,133,N20,VIIRS,2,274.7,0.84,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (-1.48748 41.47758),zaragoza,2018-04-01,19.8,3.9,0.0,18.9,86,27,20.71,41.65,-0.89,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.7,6.9,0.0,16.9,90,45,21.52,41.3874,2.1686,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.6,6.8,0.0,16.9,90,45,21.52,41.39,2.17,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.6,6.8,0.0,16.9,90,45,21.52,41.39,2.17,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),POINT (1.66281 41.31344),barcelona,2018-04-01,15.6,6.8,0.0,16.9,90,45,21.52,41.39,2.17,,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# ==========================================
# 5. ENRICHED: RENOMBRAR COLUMNAS Y ORDENAR
# ==========================================

base = pd.read_csv(MERGED_FULL_PATH, low_memory=False, parse_dates=["acq_date"])
base["provincia"] = normaliza_prov(base["provincia"])

rename_map = {
    # FIRMS
    "latitude": "firms_latitude",
    "longitude": "firms_longitude",
    "brightness": "firms_brightness",
    "scan": "firms_scan",
    "track": "firms_track",
    "acq_date": "firms_date",
    "acq_time": "firms_time",
    "satellite": "firms_satellite",
    "instrument": "firms_instrument",
    "confidence": "firms_confidence",
    "version": "firms_version",
    "bright_t31": "firms_bright_t31",
    "frp": "firms_frp",
    "daynight": "firms_daynight",
    "type": "firms_type",
    "geometry": "firms_geometry",

    # METEO
    "time": "meteo_date",
    "temperature_2m_max": "meteo_temp_max",
    "temperature_2m_min": "meteo_temp_min",
    "precipitation_sum": "meteo_precip_sum",
    "windspeed_10m_max": "meteo_wind_max",
    "relative_humidity_2m_max": "meteo_humidity_max",
    "relative_humidity_2m_min": "meteo_humidity_min",
    "shortwave_radiation_sum": "meteo_solar_radiation",
    "lat": "meteo_lat",
    "lon": "meteo_lon"
    # EFFIS ya tiene prefijo effis_
}

enriched = base.rename(columns=rename_map)

# Deduplicar puntos id√©nticos
enriched = enriched.drop_duplicates(subset=[
    "firms_latitude","firms_longitude","firms_date","firms_time","firms_satellite","firms_instrument"
])

cols_core  = ["provincia"]
cols_firms = [c for c in enriched.columns if c.startswith("firms_")]
cols_meteo = [c for c in enriched.columns if c.startswith("meteo_")]
cols_effis = ["effis_area_ha","effis_fire_count"] + [
    c for c in enriched.columns if c.startswith("effis_") and c.endswith("_pct")
]

ordered = cols_core + cols_firms + cols_meteo + cols_effis
ordered += [c for c in enriched.columns if c not in ordered]
enriched = enriched[ordered]

ENRICHED_PATH = os.path.join(DATA_DIR, "fires_openmeteo_effis_enriched.csv")
enriched.to_csv(ENRICHED_PATH, index=False)

print("‚úÖ Guardado enriched:", ENRICHED_PATH)
print("   Filas:", len(enriched), "| Columnas:", len(enriched.columns))
print("   Rango fechas:", enriched["firms_date"].min(), "‚Üí", enriched["firms_date"].max())
enriched.head()


‚úÖ Guardado enriched: C:\Users\aitor.herran\Desktop\incendios\fires_openmeteo_effis_enriched.csv
   Filas: 287584 | Columnas: 42
   Rango fechas: 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00


Unnamed: 0,provincia,firms_latitude,firms_longitude,firms_brightness,firms_scan,firms_track,firms_date,firms_time,firms_satellite,firms_instrument,firms_confidence,firms_bright_t31,firms_frp,firms_daynight,firms_type,firms_geometry,meteo_date,meteo_temp_max,meteo_temp_min,meteo_precip_sum,meteo_wind_max,meteo_humidity_max,meteo_humidity_min,meteo_solar_radiation,meteo_lat,meteo_lon,effis_area_ha,effis_fire_count,effis_broadlea_pct,effis_conifer_pct,effis_mixed_pct,effis_scleroph_pct,effis_transit_pct,effis_othernatlc_pct,effis_agriareas_pct,effis_artifsurf_pct,effis_otherlc_pct,effis_percna2k_pct,source_file,dataset_type,satellite_source,index
0,zaragoza,41.47758,-1.48748,303.02,0.41,0.45,2018-04-01,133,N20,VIIRS,2,274.7,0.84,N,2.0,POINT (-1.48748 41.47758),2018-04-01,19.8,3.9,0.0,18.9,86,27,20.71,41.65,-0.89,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),
1,barcelona,41.31344,1.66281,319.19,0.45,0.39,2018-04-01,133,N20,VIIRS,2,282.27,2.31,N,2.0,POINT (1.66281 41.31344),2018-04-01,15.7,6.9,0.0,16.9,90,45,21.52,41.3874,2.1686,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),
6,barcelona,41.45441,1.97884,305.53,0.44,0.38,2018-04-01,133,N20,VIIRS,2,282.21,1.56,N,2.0,POINT (1.97884 41.45441),2018-04-01,15.7,6.9,0.0,16.9,90,45,21.52,41.3874,2.1686,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),
11,tarragona,40.57595,0.54365,297.79,0.48,0.4,2018-04-01,133,N20,VIIRS,2,281.3,0.54,N,2.0,POINT (0.54365 40.57595),2018-04-01,16.5,8.8,0.0,21.4,85,38,21.18,41.12,1.25,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),
12,madrid,40.24524,-3.47493,322.1,0.49,0.49,2018-04-01,134,N20,VIIRS,2,275.42,3.71,N,2.0,POINT (-3.47493 40.24524),2018-04-01,16.6,3.5,0.0,12.1,86,40,20.35,40.4168,-3.7038,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,fire_archive_J1V-C2_663078.csv,archive,NOAA-20 (J1V),


In [8]:
# ==========================================
# 6. LIMPIEZA + IMPUTACI√ìN PARA ML (dataset clean)
# ==========================================
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,4)

df = pd.read_csv(ENRICHED_PATH, low_memory=False, parse_dates=["firms_date"])
print("Filas enriched:", len(df), "| Columnas:", len(df.columns))

if "index" in df.columns:
    df = df.drop(columns=["index"])

if "provincia" in df.columns:
    df["provincia"] = df["provincia"].astype(str).str.strip().astype("category")

# Quitar filas sin coords o fecha
subset_criticas = [c for c in ["firms_latitude","firms_longitude","firms_date"] if c in df.columns]
df = df.dropna(subset=subset_criticas)
print("Filas tras quitar filas sin coords/fecha:", len(df))

# Columnas num√©ricas
num_cols = df.select_dtypes(include=["float64","int64"]).columns.tolist()
effis_area_cols  = [c for c in num_cols if c.startswith("effis_") and "area"  in c]
effis_count_cols = [c for c in num_cols if c.startswith("effis_") and "count" in c]
effis_pct_cols   = [c for c in num_cols if c.startswith("effis_") and c.endswith("_pct")]

meteo_num_cols = [c for c in num_cols if c not in effis_area_cols + effis_count_cols + effis_pct_cols]

# Imputaci√≥n
for c in meteo_num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
    df[c] = df[c].fillna(df[c].median())

for c in effis_area_cols + effis_count_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

for c in effis_pct_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)
    df[c] = df[c].clip(lower=0, upper=100)

# Tratamiento suave de outliers
if "meteo_precip_sum" in df.columns:
    df["meteo_precip_sum"] = df["meteo_precip_sum"].clip(lower=0)

for col in ["meteo_humidity_max", "meteo_humidity_min"]:
    if col in df.columns:
        df[col] = df[col].clip(lower=0, upper=100)

if "meteo_wind_max" in df.columns:
    df["meteo_wind_max"] = df["meteo_wind_max"].clip(lower=0)

if "firms_brightness" in df.columns:
    df["firms_brightness"] = df["firms_brightness"].clip(lower=200, upper=400)

if "firms_frp" in df.columns:
    df["firms_frp"] = df["firms_frp"].clip(lower=0)

CLEAN_PATH = os.path.join(DATA_DIR, "fires_openmeteo_effis_clean.csv")
df.to_csv(CLEAN_PATH, index=False)

print("‚úÖ Guardado dataset limpio:", CLEAN_PATH)
print("   Filas:", len(df), "| Columnas:", len(df.columns))
print("   Rango fechas:", df["firms_date"].min(), "‚Üí", df["firms_date"].max())


Filas enriched: 287584 | Columnas: 42
Filas tras quitar filas sin coords/fecha: 287584
‚úÖ Guardado dataset limpio: C:\Users\aitor.herran\Desktop\incendios\fires_openmeteo_effis_clean.csv
   Filas: 287584 | Columnas: 41
   Rango fechas: 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00


In [9]:
# ==========================================
# 7. CSV PARA VISUALIZACI√ìN: events_viz.csv + prov_daily_viz.csv
# ==========================================

df = pd.read_csv(CLEAN_PATH, low_memory=False, parse_dates=["firms_date"])

# Asegurar num√©ricos clave
num_cols = [
    "firms_brightness","firms_bright_t31","firms_frp","firms_confidence",
    "meteo_temp_max","meteo_temp_min","meteo_precip_sum","meteo_wind_max",
    "meteo_humidity_max","meteo_humidity_min","meteo_solar_radiation",
    "effis_area_ha","effis_fire_count"
]
num_cols += [c for c in df.columns if c.startswith("effis_") and c.endswith("_pct")]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

for c in ["effis_area_ha","effis_fire_count"]:
    if c in df.columns:
        df[c] = df[c].fillna(0)

for c in [c for c in df.columns if c.startswith("effis_") and c.endswith("_pct")]:
    df[c] = df[c].fillna(0.0)

# 7.1 events_viz.csv
events_cols = [
    "provincia","firms_date","firms_time",
    "firms_latitude","firms_longitude",
    "firms_brightness","firms_bright_t31","firms_frp","firms_confidence",
    "meteo_temp_max","meteo_temp_min","meteo_precip_sum","meteo_wind_max",
    "meteo_humidity_max","meteo_humidity_min","meteo_solar_radiation",
    "effis_area_ha","effis_fire_count"
]
events_cols = [c for c in events_cols if c in df.columns]
events = df[events_cols].drop_duplicates(
    subset=["provincia","firms_date","firms_time","firms_latitude","firms_longitude"]
)

EVENTS_VIZ_PATH = os.path.join(DATA_DIR, "events_viz.csv")
events.to_csv(EVENTS_VIZ_PATH, index=False)
print("‚úÖ Guardado:", EVENTS_VIZ_PATH, "| filas:", len(events), "| columnas:", len(events.columns))

# 7.2 prov_daily_viz.csv
agg_dict = {
    "firms_frp": ["sum","mean"],
    "firms_brightness": ["mean","max"],
    "firms_confidence": "mean",
    "firms_latitude": "count",  # -> firms_count
    "meteo_temp_max": "first",
    "meteo_temp_min": "first",
    "meteo_precip_sum": "first",
    "meteo_wind_max": "first",
    "meteo_humidity_max": "first",
    "meteo_humidity_min": "first",
    "meteo_solar_radiation": "first",
    "effis_area_ha": "sum",
    "effis_fire_count": "sum",
}
effis_pct_cols = [c for c in df.columns if c.startswith("effis_") and c.endswith("_pct")]
for c in effis_pct_cols:
    agg_dict[c] = "first"

daily = (df.groupby(["provincia","firms_date"], as_index=False)
           .agg(agg_dict))

daily.columns = ["_".join([str(s) for s in col if s]).rstrip("_") for col in daily.columns]

daily = daily.rename(columns={
    "firms_date": "date",
    "firms_latitude_count": "firms_count",
    "firms_frp_sum": "firms_frp_sum",
    "firms_frp_mean": "firms_frp_mean",
    "firms_brightness_mean": "firms_brightness_mean",
    "firms_brightness_max": "firms_brightness_max",
    "firms_confidence_mean": "firms_confidence_mean",
    "meteo_temp_max_first": "meteo_temp_max",
    "meteo_temp_min_first": "meteo_temp_min",
    "meteo_precip_sum_first": "meteo_precip_sum",
    "meteo_wind_max_first": "meteo_wind_max",
    "meteo_humidity_max_first": "meteo_humidity_max",
    "meteo_humidity_min_first": "meteo_humidity_min",
    "meteo_solar_radiation_first": "meteo_solar_radiation",
    "effis_area_ha_sum": "effis_area_ha",
    "effis_fire_count_sum": "effis_fire_count",
    **{f"{c}_first": c for c in effis_pct_cols}
})

ord_cols = [
    "provincia","date",
    "firms_count","firms_frp_sum","firms_frp_mean",
    "firms_brightness_mean","firms_brightness_max","firms_confidence_mean",
    "meteo_temp_max","meteo_temp_min","meteo_precip_sum","meteo_wind_max",
    "meteo_humidity_max","meteo_humidity_min","meteo_solar_radiation",
    "effis_area_ha","effis_fire_count"
] + effis_pct_cols
daily = daily[[c for c in ord_cols if c in daily.columns]].sort_values(["provincia","date"])

DAILY_VIZ_PATH = os.path.join(DATA_DIR, "prov_daily_viz.csv")
daily.to_csv(DAILY_VIZ_PATH, index=False)
print("‚úÖ Guardado:", DAILY_VIZ_PATH, "| filas:", len(daily), "| columnas:", len(daily.columns))


‚úÖ Guardado: C:\Users\aitor.herran\Desktop\incendios\events_viz.csv | filas: 287584 | columnas: 18
‚úÖ Guardado: C:\Users\aitor.herran\Desktop\incendios\prov_daily_viz.csv | filas: 46541 | columnas: 27


In [10]:
# ==========================================
# 8. RESUMEN FINAL DE TODOS LOS DATASETS
# ==========================================

def resumen_csv(path, date_col_candidates):
    df = pd.read_csv(path, low_memory=False)
    date_col = None
    for c in date_col_candidates:
        if c in df.columns:
            date_col = c
            break

    print("\nüìÅ", os.path.basename(path))
    print("   Filas    :", len(df))
    print("   Columnas :", len(df.columns))
    if date_col:
        d = pd.to_datetime(df[date_col], errors="coerce")
        print(f"   Fecha ({date_col}):", d.min(), "‚Üí", d.max())
    print("   Ejemplo columnas:", df.columns.tolist()[:10])

print("============== RESUMEN FINAL ==============")
resumen_csv(FIRMS_PROV_PATH,        ["acq_date"])
resumen_csv(MERGED_PATH,            ["acq_date"])
resumen_csv(MERGED_FULL_PATH,       ["acq_date"])
resumen_csv(ENRICHED_PATH,          ["firms_date"])
resumen_csv(CLEAN_PATH,             ["firms_date"])
resumen_csv(EVENTS_VIZ_PATH,        ["firms_date"])
resumen_csv(DAILY_VIZ_PATH,         ["date"])
print("‚úÖ Pipeline completo.")



üìÅ firms_spain_provincia.csv
   Filas    : 338944
   Columnas : 19
   Fecha (acq_date): 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00
   Ejemplo columnas: ['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date', 'acq_time', 'satellite', 'instrument', 'confidence']

üìÅ merged_fires_openmeteo_provincia.csv
   Filas    : 635484
   Columnas : 29
   Fecha (acq_date): 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00
   Ejemplo columnas: ['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date', 'acq_time', 'satellite', 'instrument', 'confidence']

üìÅ merged_full_fires_openmeteo_effis.csv
   Filas    : 635484
   Columnas : 42
   Fecha (acq_date): 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00:00
   Ejemplo columnas: ['latitude', 'longitude', 'brightness', 'scan', 'track', 'acq_date', 'acq_time', 'satellite', 'instrument', 'confidence']

üìÅ fires_openmeteo_effis_enriched.csv
   Filas    : 287584
   Columnas : 42
   Fecha (firms_date): 2015-09-19 00:00:00 ‚Üí 2025-09-18 00:00: