In [5]:
import json
import logging
import os
import re
import time
import unicodedata
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional
from urllib.parse import urlencode

import requests
from notebookutils import mssparkutils
from pyspark.sql import SparkSession

# Plain console logging at INFO level
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("REData_API_Explorer")
    .getOrCreate()
)

StatementMeta(, c47b6a96-a025-468f-b332-170ec8b48017, 7, Finished, Available, Finished)

In [6]:
class REDataAPIExplorer:
    """Explore the REData API (apidatos.ree.es) and keep the non-empty responses.

    For each (category, widget, time_trunc, region) combination the explorer
    builds a URL, requests it, stores responses whose ``included`` field is
    non-empty as JSON under ``<base_lakehouse_path>/data`` and records the
    outcome of every request in ``<base_lakehouse_path>/logs``.
    """

    def __init__(self, base_lakehouse_path: str):
        """Set up the HTTP session, the query window and the exploration config.

        Args:
            base_lakehouse_path: Root path under which ``data/`` and ``logs/``
                are written.
        """
        self.base_url = "https://apidatos.ree.es"
        self.base_lakehouse_path = base_lakehouse_path
        self.session = requests.Session()
        self.headers = {"Accept": "application/json", "Content-Type": "application/json"}

        # Query window: the 365 days ending at construction time (naive local time).
        self.end_date = datetime.now()
        self.start_date = self.end_date - timedelta(days=365)

        # Reduced per the briefing: key categories only (category -> widgets)
        self.api_config = {
            "balance": ["balance-electrico"],
            "demanda": ["evolucion"],
            "generacion": [
                "estructura-generacion",
                "demanda-maxima-diaria",
                "evolucion-renovable-no-renovable",
                "estructura-renovables",
                "estructura-generacion-emisiones-asociadas",
                "maxima-renovable"
            ],
            "intercambios": ["francia-frontera", "portugal-frontera", "marruecos-frontera", "andorra-frontera"],
            "mercados": ["componentes-precio", "precios-mercados-tiempo-real"],
        }

        # Region name -> value sent as geo_ids
        self.ccaa_ids = {
            "Andalucía": 4, "Aragón": 5, "Cantabria": 6, "Castilla-La Mancha": 7,
            "Castilla y León": 8, "Cataluña": 9, "País Vasco": 10, "Principado de Asturias": 11,
            "Comunidad de Ceuta": 8744, "Comunidad de Melilla": 8745,
            "Comunidad de Madrid": 13, "Comunidad de Navarra": 14, "Comunidad Valenciana": 15,
            "Extremadura": 16, "Galicia": 17, "Islas Baleares": 8743, "Islas Canarias": 8742,
            "La Rioja": 20, "Región de Murcia": 21, "Península": 8741,
        }
        self.geo_limits = ["peninsular", "canarias", "baleares", "ceuta", "melilla", "ccaa"]

        # Both daily and monthly aggregation are explored
        self.time_truncs = ["day", "month"]

        # Maps every non-"ccaa" geo_limit to its key in ccaa_ids
        self.geo_map = {
            "peninsular": "Península",
            "canarias": "Islas Canarias",
            "baleares": "Islas Baleares",
            "ceuta": "Comunidad de Ceuta",
            "melilla": "Comunidad de Melilla",
        }

        # Log files
        self.logs_success = f"{self.base_lakehouse_path}/logs/success.log"
        self.logs_error = f"{self.base_lakehouse_path}/logs/error.log"

    def _slugify(self, text: str) -> str:
        """Return ``text`` lowercased, accent-stripped and hyphen-separated.

        Spaces become hyphens; any character other than ``a-z``, ``0-9`` and
        ``-`` is dropped (e.g. ``"Región de Murcia"`` -> ``"region-de-murcia"``).
        """
        t = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
        return re.sub(r"[^a-z0-9\-]", "", t.lower().replace(" ", "-"))

    def build_api_url(self, lang, category, widget, time_trunc, geo_limit, ccaa_name=None):
        """Build the request URL for one widget, aggregation level and region.

        Args:
            lang: Language segment of the path (e.g. ``"es"``).
            category: API category segment of the path.
            widget: Widget segment of the path.
            time_trunc: Value of the ``time_trunc`` query parameter.
            geo_limit: Value of the ``geo_limit`` query parameter; either
                ``"ccaa"`` or a key of ``self.geo_map``.
            ccaa_name: Key of ``self.ccaa_ids``; required when ``geo_limit``
                is ``"ccaa"`` and ignored otherwise.

        Returns:
            The full URL including the query string.

        Raises:
            ValueError: If ``geo_limit`` is ``"ccaa"`` without a ``ccaa_name``,
                or if the region cannot be resolved to a geo id.
        """
        if geo_limit == "ccaa":
            if not ccaa_name:
                raise ValueError("geo_limit='ccaa' requires a ccaa_name")
            region_name = ccaa_name
        else:
            if geo_limit not in self.geo_map:
                raise ValueError(f"Unknown geo_limit: {geo_limit!r}")
            region_name = self.geo_map[geo_limit]

        if region_name not in self.ccaa_ids:
            raise ValueError(f"Unknown region name: {region_name!r}")

        params = {
            "start_date": self.start_date.strftime("%Y-%m-%dT%H:%M"),
            "end_date": self.end_date.strftime("%Y-%m-%dT%H:%M"),
            "time_trunc": time_trunc,
            "geo_trunc": "electric_system",
            "geo_limit": geo_limit,
            "geo_ids": str(self.ccaa_ids[region_name]),
        }

        base = f"{self.base_url}/{lang}/datos/{category}/{widget}"
        # safe=":" keeps the time separator literal, so the URL is unchanged for
        # the usual values while any unexpected character is still escaped.
        return f"{base}?{urlencode(params, safe=':')}"

    def make_api_request(self, url: str):
        """GET ``url`` and return ``(success, payload)``.

        ``success`` is True only for an HTTP 200 response whose body decodes
        to a JSON object. Every other outcome (exception, other status code,
        undecodable or non-object body) is logged as a warning and reported
        as ``(False, {})`` so that a long exploration run is never aborted.
        """
        try:
            response = self.session.get(url, headers=self.headers, timeout=20)
            if response.status_code != 200:
                logger.warning("HTTP %s for %s", response.status_code, url)
                return False, {}
            payload = response.json()
        except Exception as exc:
            logger.warning("Request failed for %s: %s", url, exc)
            return False, {}

        # Callers use payload.get(...), so anything but a JSON object is a failure.
        if not isinstance(payload, dict):
            logger.warning("Unexpected JSON payload type %s for %s", type(payload).__name__, url)
            return False, {}
        return True, payload

    def save_to_bronze(self, data, category, widget, region, timestamp, time_trunc):
        """Write ``data`` as pretty-printed JSON under the Bronze data folder.

        The file goes to
        ``<base>/data/<category>/<widget>/<time_trunc>/brz-<region>-<category>-<widget>-<time_trunc>-<timestamp>.json``
        and replaces any existing file with the same name.
        """
        # NOTE(review): timestamp is embedded verbatim, so ISO timestamps put ':'
        # in the file name — confirm downstream readers accept such paths.
        fname = f"brz-{region}-{category}-{widget}-{time_trunc}-{timestamp}.json"
        path = f"{self.base_lakehouse_path}/data/{category}/{widget}/{time_trunc}/{fname}"
        mssparkutils.fs.put(path, json.dumps(data, indent=2, ensure_ascii=False), overwrite=True)
        logger.info(f"✅ Guardado Bronze: {path}")

    def log(self, path, msg: str):
        """Append ``msg`` as one line to the log file at ``path``.

        The line is appended in place, so existing entries are never re-read
        or rewritten and the log can grow without losing older content.
        """
        line = msg + "\n"
        try:
            # Third argument: create the file if it does not exist yet.
            mssparkutils.fs.append(path, line, True)
        except Exception as exc:
            logger.warning("Append to %s failed (%s); trying to create the file", path, exc)
            # overwrite=False: this can only create a missing file, it never
            # replaces an existing log. If the file exists the error propagates.
            mssparkutils.fs.put(path, line, overwrite=False)

    def explore_single(self, lang, category, widget, time_trunc, geo_limit, ccaa_name=None):
        """Request one combination, persist it if it has data and log the outcome.

        A response counts as having data when the request succeeded and its
        ``included`` field is non-empty. Such responses are saved to Bronze and
        logged to the success log; everything else goes to the error log.

        Returns:
            A dict with keys ``url``, ``success`` and ``has_data``.
        """
        ts = datetime.utcnow().isoformat(timespec="microseconds") + "Z"
        url = self.build_api_url(lang, category, widget, time_trunc, geo_limit, ccaa_name)
        success, data = self.make_api_request(url)
        has_data = success and bool(data.get("included"))

        # File-name region: slug of the community name for "ccaa", else the geo_limit itself.
        region = self._slugify(ccaa_name) if geo_limit == "ccaa" and ccaa_name else geo_limit

        if has_data:
            self.save_to_bronze(data, category, widget, region, ts, time_trunc)
            self.log(self.logs_success, f"{ts} OK {url}")
        else:
            self.log(self.logs_error, f"{ts} FAIL {url}")
        return {"url": url, "success": success, "has_data": has_data}

    def explore_all(self, max_combinations=20):
        """Run ``explore_single`` over the configured combinations, in order.

        Combinations are generated as category -> widget -> time_trunc ->
        geo_limit; the ``"ccaa"`` geo_limit expands to every entry of
        ``self.ccaa_ids`` except ``"Península"``.

        Args:
            max_combinations: Maximum number of combinations to run, taken from
                the start of the generated list. ``None`` runs all of them.

        Returns:
            The list of result dicts returned by ``explore_single``.
        """
        ccaa_names = [name for name in self.ccaa_ids if name != "Península"]

        combos = []
        for lang in ["es"]:
            for cat, widgets in self.api_config.items():
                for w in widgets:
                    for t in self.time_truncs:
                        for g in self.geo_limits:
                            if g == "ccaa":
                                combos.extend((lang, cat, w, t, g, name) for name in ccaa_names)
                            else:
                                combos.append((lang, cat, w, t, g, None))
        combos = combos[:max_combinations]
        logger.info(f"Ejecutando {len(combos)} combinaciones...")

        results = []
        for c in combos:
            results.append(self.explore_single(*c))
            time.sleep(0.2)  # short pause between consecutive requests
        return results

    def analyze_results(self, results):
        """Summarise a list of ``explore_single`` results.

        Returns:
            A dict with ``total`` (number of results), ``ok`` (successful
            requests), ``with_data`` (results that had data) and ``failures``
            (``total - ok``).
        """
        total = len(results)
        ok = sum(r["success"] for r in results)
        with_data = sum(r["has_data"] for r in results)
        return {"total": total, "ok": ok, "with_data": with_data, "failures": total - ok}


StatementMeta(, c47b6a96-a025-468f-b332-170ec8b48017, 8, Finished, Available, Finished)

In [7]:
# Root path under which the explorer writes its data/ and logs/ folders
LAKEHOUSE_PATH = "Files/bronze/redata"
explorer = REDataAPIExplorer(base_lakehouse_path=LAKEHOUSE_PATH)

print("✅ Explorador inicializado")
print("Categorías:", list(explorer.api_config))

StatementMeta(, c47b6a96-a025-468f-b332-170ec8b48017, 9, Finished, Available, Finished)

✅ Explorador inicializado
Categorías: ['balance', 'demanda', 'generacion', 'intercambios', 'mercados']


In [8]:
# Build (without requesting) one sample URL per kind of region
url_samples = [
    ("🔎 URL Península:", ("peninsular",)),
    ("\n🔎 URL Canarias:", ("canarias",)),
    ("\n🔎 URL Cataluña (CCAA):", ("ccaa", "Cataluña")),
]

for heading, geo_args in url_samples:
    print(heading)
    print(explorer.build_api_url("es", "demanda", "evolucion", "day", *geo_args))

StatementMeta(, c47b6a96-a025-468f-b332-170ec8b48017, 10, Finished, Available, Finished)

🔎 URL Península:
https://apidatos.ree.es/es/datos/demanda/evolucion?start_date=2024-09-23T13:19&end_date=2025-09-23T13:19&time_trunc=day&geo_trunc=electric_system&geo_limit=peninsular&geo_ids=8741

🔎 URL Canarias:
https://apidatos.ree.es/es/datos/demanda/evolucion?start_date=2024-09-23T13:19&end_date=2025-09-23T13:19&time_trunc=day&geo_trunc=electric_system&geo_limit=canarias&geo_ids=8742

🔎 URL Cataluña (CCAA):
https://apidatos.ree.es/es/datos/demanda/evolucion?start_date=2024-09-23T13:19&end_date=2025-09-23T13:19&time_trunc=day&geo_trunc=electric_system&geo_limit=ccaa&geo_ids=9


In [9]:
# max_combinations=None removes the cap, so every generated combination is
# requested (one HTTP call each, with a 0.2 s pause between calls).
results = explorer.explore_all(max_combinations=None)  
print("✅ Exploración completada")

StatementMeta(, c47b6a96-a025-468f-b332-170ec8b48017, 11, Finished, Available, Finished)

INFO:__main__:Ejecutando 672 combinaciones...
INFO:__main__:✅ Guardado Bronze: Files/bronze/redata/data/balance/balance-electrico/day/brz-peninsular-balance-balance-electrico-day-2025-09-23T13:19:19.101206Z.json
INFO:__main__:✅ Guardado Bronze: Files/bronze/redata/data/balance/balance-electrico/day/brz-canarias-balance-balance-electrico-day-2025-09-23T13:19:38.318840Z.json
INFO:__main__:✅ Guardado Bronze: Files/bronze/redata/data/balance/balance-electrico/day/brz-baleares-balance-balance-electrico-day-2025-09-23T13:19:51.543863Z.json
INFO:__main__:✅ Guardado Bronze: Files/bronze/redata/data/balance/balance-electrico/day/brz-ceuta-balance-balance-electrico-day-2025-09-23T13:19:59.261604Z.json
INFO:__main__:✅ Guardado Bronze: Files/bronze/redata/data/balance/balance-electrico/day/brz-melilla-balance-balance-electrico-day-2025-09-23T13:20:13.143786Z.json
INFO:__main__:✅ Guardado Bronze: Files/bronze/redata/data/balance/balance-electrico/day/brz-andalucia-balance-balance-electrico-day-2025

✅ Exploración completada


In [10]:
# Aggregate the per-request outcomes into overall counts
summary = explorer.analyze_results(results)
print("📊 Resumen de la ejecución:", summary, sep="\n")

StatementMeta(, c47b6a96-a025-468f-b332-170ec8b48017, 12, Finished, Available, Finished)

📊 Resumen de la ejecución:
{'total': 672, 'ok': 199, 'with_data': 199, 'failures': 473}
