In [None]:
import json
import logging
import re
import time
import unicodedata
from datetime import datetime, timedelta, timezone
from typing import Dict, Optional

import requests
from notebookutils import mssparkutils
from pyspark.sql import SparkSession

In [None]:
# Notebook parameters.
# Each *_param takes the value of the same-named global without the "_param"
# suffix (e.g. `start_date`) when that global exists in the notebook namespace;
# otherwise it keeps the default given as the second argument below.
start_date_param = globals().get("start_date", "")
end_date_param = globals().get("end_date", "")
max_combinations_param = globals().get("max_combinations", "")
lakehouse_path_param = globals().get("lakehouse_path", "Files/bronze/REDATA")
schedule_type_param = globals().get("schedule_type", "full")

import sys

# Debug: echo the effective value of every parameter
print("Parámetros recibidos:")
print("\n".join(
    f"  {name}: '{value}'"
    for name, value in (
        ("start_date", start_date_param),
        ("end_date", end_date_param),
        ("max_combinations", max_combinations_param),
        ("lakehouse_path", lakehouse_path_param),
        ("schedule_type", schedule_type_param),
    )
))

In [None]:
# Plain console logging: records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Spark session: getOrCreate() returns the existing session when there is one,
# otherwise it builds a new session with this application name.
spark = (
    SparkSession.builder
    .appName("REData_API_Explorer")
    .getOrCreate()
)

In [None]:
# Load the external configuration
def load_config():
    """Read the ingestion configuration from the lakehouse, with built-in defaults as fallback.

    Returns:
        dict: The parsed JSON content of ``Files/config/redata_config.json``.
        If the file cannot be read or parsed, a default dictionary with the
        keys ``ccaa_ids`` (region name -> numeric id), ``geo_map`` (geo_limit
        -> region name) and ``api_config`` (category -> list of widgets) is
        returned instead and the reason is logged as a warning.
    """
    config_path = "Files/config/redata_config.json"
    try:
        # head() is asked for at most 100000 bytes; a larger file would arrive
        # truncated and fail JSON parsing, which ends up in the fallback below.
        content = mssparkutils.fs.head(config_path, 100000)
        return json.loads(content)
    except Exception as exc:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still
        # propagate; the cause is logged so a broken config can be diagnosed.
        logger.warning("No se pudo cargar config, usando valores por defecto: %s", exc)
        return {
            "ccaa_ids": {"Península": 8741, "Islas Canarias": 8742},
            "geo_map": {"peninsular": "Península", "canarias": "Islas Canarias"},
            "api_config": {"balance": ["balance-electrico"]}
        }

In [None]:
class REDataAPIExplorer:
    """Requests REData API widgets for a set of regions and stores the raw JSON.

    Responses that contain data are written under
    ``<base_lakehouse_path>/data/<category>/<widget>/<time_trunc>/`` and every
    attempt is recorded in ``<base_lakehouse_path>/logs/success.log`` or
    ``<base_lakehouse_path>/logs/error.log``.

    Attributes:
        base_url: Root URL of the API.
        base_lakehouse_path: Root folder for data and log files.
        session: ``requests.Session`` reused for every request.
        headers: HTTP headers sent with every request.
        ccaa_ids: Mapping of region name -> numeric geo id (from config).
        geo_map: Mapping of geo_limit -> region name (from config).
        api_config: Mapping of category -> list of widgets (from config).
        start_date, end_date: ``datetime`` bounds of the requested range.
        geo_limits: Every geo_limit value this explorer knows about.
        logs_success, logs_error: Paths of the two log files.
    """

    # Seconds to wait for an HTTP response before the request is abandoned.
    REQUEST_TIMEOUT_SECONDS = 20
    # Pause, in seconds, after each request issued by explore_all.
    REQUEST_DELAY_SECONDS = 0.2
    # geo_limit values that are requested as-is, without a per-region expansion.
    SYSTEM_GEO_LIMITS = ("peninsular", "canarias", "baleares", "ceuta", "melilla")
    # schedule_type values understood by explore_all.
    KNOWN_SCHEDULES = ("daily", "monthly", "full")

    def __init__(self, base_lakehouse_path: str, start_date: str = None, end_date: str = None):
        """Create the explorer, load the configuration and fix the date range.

        Args:
            base_lakehouse_path: Root folder for data and log files.
            start_date: Optional ISO-8601 start of the range (a trailing ``Z``
                is accepted).
            end_date: Optional ISO-8601 end of the range (a trailing ``Z`` is
                accepted).

        When a bound is omitted it is derived: the end defaults to "now" and
        the start defaults to 365 days before the end.

        Raises:
            KeyError: If the loaded configuration lacks ``ccaa_ids``,
                ``geo_map`` or ``api_config``.
            ValueError: If a supplied date is not valid ISO-8601.
        """
        self.base_url = "https://apidatos.ree.es"
        self.base_lakehouse_path = base_lakehouse_path
        self.session = requests.Session()
        self.headers = {"Accept": "application/json", "Content-Type": "application/json"}

        # Load configuration
        config = load_config()
        self.ccaa_ids = config["ccaa_ids"]
        self.geo_map = config["geo_map"]
        self.api_config = config["api_config"]

        # Dates
        self.start_date, self.end_date = self._resolve_date_range(start_date, end_date)

        self.geo_limits = [*self.SYSTEM_GEO_LIMITS, "ccaa"]
        self.logs_success = f"{self.base_lakehouse_path}/logs/success.log"
        self.logs_error = f"{self.base_lakehouse_path}/logs/error.log"

    @staticmethod
    def _resolve_date_range(start_date, end_date):
        """Turn the optional ISO strings into a ``(start, end)`` datetime pair.

        A missing end becomes the current time (in the start's timezone when
        the start carries one); a missing start becomes the end minus 365 days.
        Previously a lone bound was silently discarded.
        """
        def parse(value):
            # fromisoformat() on older Python versions rejects a literal "Z".
            return datetime.fromisoformat(value.replace('Z', '+00:00'))

        start = parse(start_date) if start_date else None
        end = parse(end_date) if end_date else None
        if end is None:
            # datetime.now(None) is the naive local time, matching the old default.
            end = datetime.now(start.tzinfo if start is not None else None)
        if start is None:
            start = end - timedelta(days=365)
        return start, end

    def _slugify(self, text: str) -> str:
        """Return ``text`` as a lowercase ASCII slug (spaces become hyphens, accents are dropped)."""
        t = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
        return re.sub(r"[^a-z0-9\-]", "", t.lower().replace(" ", "-"))

    def _clean_timestamp(self, timestamp: str) -> str:
        """Lowercase ``timestamp`` and replace ``:`` and ``.`` with ``-`` for use in file names."""
        return timestamp.lower().replace(":", "-").replace(".", "-")

    def _resolve_geo(self, geo_limit: str, ccaa_name: Optional[str] = None) -> tuple:
        """Return ``(geo_id, geo_name)`` for a geo_limit / region-name pair.

        For ``geo_limit == "ccaa"`` with a ``ccaa_name`` the name is used
        directly; otherwise the name comes from ``geo_map[geo_limit]``.

        Raises:
            KeyError: If ``geo_limit`` is missing from ``geo_map`` or the
                resulting name is missing from ``ccaa_ids``.
        """
        if geo_limit == "ccaa" and ccaa_name:
            geo_name = ccaa_name
        else:
            geo_name = self.geo_map[geo_limit]
        return self.ccaa_ids[geo_name], geo_name

    def build_api_url(self, lang, category, widget, time_trunc, geo_limit, ccaa_name=None):
        """Build the request URL for one widget, time truncation and region.

        Raises:
            KeyError: If the region cannot be resolved (see ``_resolve_geo``).
        """
        start_str = self.start_date.strftime("%Y-%m-%dT%H:%M")
        end_str = self.end_date.strftime("%Y-%m-%dT%H:%M")
        base = f"{self.base_url}/{lang}/datos/{category}/{widget}"

        geo_id, _ = self._resolve_geo(geo_limit, ccaa_name)

        params = {
            "start_date": start_str,
            "end_date": end_str,
            "time_trunc": time_trunc,
            "geo_trunc": "electric_system",
            "geo_limit": geo_limit,
            "geo_ids": str(geo_id)
        }

        query = "&".join(f"{k}={v}" for k, v in params.items())
        return f"{base}?{query}"

    def make_api_request(self, url: str):
        """GET ``url`` and return ``(success, payload)``; never raises.

        ``success`` is True only for an HTTP 200 response whose body parses to
        a JSON object (dict). Every failure returns ``(False, {})`` and logs
        the reason as a warning.
        """
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT_SECONDS
            )
            if response.status_code != 200:
                logger.warning("HTTP %s en %s", response.status_code, url)
                return False, {}
            payload = response.json()
        except Exception as exc:
            logger.warning("Petición fallida %s: %s", url, exc)
            return False, {}
        if not isinstance(payload, dict):
            # Callers use payload.get(...); anything but a dict would crash them.
            logger.warning("Respuesta JSON inesperada (%s) en %s", type(payload).__name__, url)
            return False, {}
        return True, payload

    def save_to_bronze(self, data, category, widget, region, timestamp, time_trunc,
                       geo_id, geo_name, geo_limit, ccaa_name):
        """Write the API response, wrapped with request metadata, as one JSON file.

        The file is stored at
        ``<base>/data/<category>/<widget>/<time_trunc>/brz-<region>-...-<timestamp>.json``
        and overwrites any file already at that path.
        """
        # Attach the request metadata to the stored JSON
        enriched_data = {
            "request_metadata": {
                "geo_id": geo_id,
                "geo_name": geo_name,
                "geo_limit": geo_limit,
                "ccaa_name": ccaa_name,
                "category": category,
                "widget": widget,
                "time_trunc": time_trunc,
                "ingestion_timestamp": timestamp,
                "start_date": self.start_date.isoformat(),
                "end_date": self.end_date.isoformat()
            },
            "api_response": data
        }

        clean_ts = self._clean_timestamp(timestamp)
        fname = f"brz-{region}-{category}-{widget}-{time_trunc}-{clean_ts}.json"
        path = f"{self.base_lakehouse_path}/data/{category}/{widget}/{time_trunc}/{fname}"
        mssparkutils.fs.put(path, json.dumps(enriched_data, indent=2, ensure_ascii=False), overwrite=True)
        logger.info(f"✅ Guardado: {path}")

    def log(self, path, msg: str):
        """Append ``msg`` plus a newline to the log file at ``path``.

        The file is created when it does not exist. The earlier implementation
        re-read the file with a 1,000,000-byte cap and rewrote it, which
        dropped previously logged lines once the file outgrew that cap.

        NOTE(review): the platform documentation recommends a short pause
        between repeated appends to the same file; explore_all spaces its
        calls with a request plus a sleep — confirm that is sufficient.
        """
        # Third argument: create the file if it does not exist yet.
        mssparkutils.fs.append(path, msg + "\n", True)

    def explore_single(self, lang, category, widget, time_trunc, geo_limit, ccaa_name=None):
        """Request one combination, store the result if it has data, and log the attempt.

        Returns:
            dict: ``{"url", "success", "has_data"}``. ``url`` is None when the
            region could not be resolved from the configuration; that case is
            written to the error log instead of aborting the whole run.
        """
        # UTC wall-clock time with microseconds and a literal "Z" suffix.
        ts = datetime.now(timezone.utc).replace(tzinfo=None).isoformat(timespec="microseconds") + "Z"

        try:
            geo_id, geo_name = self._resolve_geo(geo_limit, ccaa_name)
        except KeyError as exc:
            self.log(
                self.logs_error,
                f"{ts} FAIL geo sin configurar ({exc}): geo_limit={geo_limit} ccaa_name={ccaa_name}",
            )
            return {"url": None, "success": False, "has_data": False}

        url = self.build_api_url(lang, category, widget, time_trunc, geo_limit, ccaa_name)
        success, data = self.make_api_request(url)
        # A successful response only counts as data when "included" is non-empty.
        has_data = success and bool(data.get("included"))

        region = self._slugify(ccaa_name) if geo_limit == "ccaa" and ccaa_name else geo_limit

        if has_data:
            self.save_to_bronze(data, category, widget, region, ts, time_trunc,
                                geo_id, geo_name, geo_limit, ccaa_name)
            self.log(self.logs_success, f"{ts} OK {url}")
        else:
            self.log(self.logs_error, f"{ts} FAIL {url}")

        return {"url": url, "success": success, "has_data": has_data}

    def _expand_geo_limits(self, category, widget, time_trunc, geo_limits):
        """Return one combination per geo_limit, expanding "ccaa" into every region except "Península"."""
        combos = []
        for geo_limit in geo_limits:
            if geo_limit == "ccaa":
                combos.extend(
                    ("es", category, widget, time_trunc, geo_limit, ccaa_name)
                    for ccaa_name in self.ccaa_ids.keys()
                    if ccaa_name != "Península"
                )
            else:
                combos.append(("es", category, widget, time_trunc, geo_limit, None))
        return combos

    def _build_combinations(self, schedule_type="full"):
        """List the ``explore_single`` argument tuples for a schedule type.

        * ``daily``   -> ``day`` truncation for the system geo_limits.
        * ``monthly`` -> ``month`` truncation for every geo_limit, with "ccaa"
          expanded per region.
        * ``full``    -> the daily set followed by the monthly set, per widget.

        The value is matched case-insensitively; an unrecognised value is
        logged and treated as ``full`` (it used to fall through to a
        day-and-month product over every region).
        """
        schedule = str(schedule_type or "full").strip().lower()
        if schedule not in self.KNOWN_SCHEDULES:
            logger.warning("schedule_type desconocido '%s', usando 'full'", schedule_type)
            schedule = "full"

        daily_plan = ("day", list(self.SYSTEM_GEO_LIMITS))
        monthly_plan = ("month", self.geo_limits)
        if schedule == "daily":
            plans = [daily_plan]
        elif schedule == "monthly":
            plans = [monthly_plan]
        else:
            plans = [daily_plan, monthly_plan]

        combos = []
        for category, widgets in self.api_config.items():
            for widget in widgets:
                for time_trunc, geo_limits in plans:
                    combos.extend(self._expand_geo_limits(category, widget, time_trunc, geo_limits))
        return combos

    def explore_all(self, max_combinations=None, schedule_type="full"):
        """Run ``explore_single`` for every combination of the chosen schedule.

        Args:
            max_combinations: When truthy, only the first N combinations run.
            schedule_type: ``"daily"``, ``"monthly"`` or ``"full"``.

        Returns:
            list: One result dict per executed combination, in execution order.
        """
        combos = self._build_combinations(schedule_type)
        if max_combinations:
            combos = combos[:max_combinations]

        logger.info(f"Ejecutando {len(combos)} combinaciones")
        results = []
        for combo in combos:
            results.append(self.explore_single(*combo))
            time.sleep(self.REQUEST_DELAY_SECONDS)
        return results

    def analyze_results(self, results):
        """Summarise result dicts into counts: total, ok, with_data and failures (total - ok)."""
        total = len(results)
        ok = sum(r["success"] for r in results)
        with_data = sum(r["has_data"] for r in results)
        return {"total": total, "ok": ok, "with_data": with_data, "failures": total - ok}

In [None]:
def parse_max_combinations(raw):
    """Convert the ``max_combinations`` parameter into an int, or None when unusable.

    Accepts an int or a string of decimal digits (surrounding whitespace is
    ignored). Anything else — empty string, None, negative or non-numeric
    text — yields None, meaning "no limit".
    """
    # str() first: the value may already be an int, which has no .strip().
    text = str(raw).strip()
    if not text.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        # Some characters (e.g. "²") satisfy isdigit() but are not valid int literals.
        return None


def main():
    """Run the ingestion with the notebook parameters and return the summary dict.

    Reads ``max_combinations_param``, ``lakehouse_path_param``,
    ``start_date_param``, ``end_date_param`` and ``schedule_type_param`` from
    the notebook namespace, runs ``REDataAPIExplorer.explore_all`` and prints
    and returns the result of ``analyze_results``.
    """
    max_comb = parse_max_combinations(max_combinations_param)

    explorer = REDataAPIExplorer(
        base_lakehouse_path=lakehouse_path_param,
        start_date=start_date_param or None,
        end_date=end_date_param or None
    )

    print("🚀 Iniciando ingesta REData")
    print(f"📅 Rango: {explorer.start_date} - {explorer.end_date}")
    print(f"📢 Max combinaciones: {max_comb or 'Todas'}")
    print(f"⚙️ Schedule: {schedule_type_param}")

    results = explorer.explore_all(max_combinations=max_comb, schedule_type=schedule_type_param)
    summary = explorer.analyze_results(results)
    print("📊 Resumen:", summary)
    return summary

if __name__ == "__main__":
    main()