In [None]:
import requests, json, time, logging, unicodedata, re
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
from typing import Dict, Optional
from notebookutils import mssparkutils

In [None]:
# Defaults used when the pipeline does not inject a value.
start_date_param = ""
end_date_param = ""
max_combinations_param = ""
lakehouse_path_param = "Files/bronze/REDATA"

# Pick up pipeline-injected variables when they exist. This code runs at the
# top level of the cell, where the local and global namespaces are the same,
# so a single globals() lookup is sufficient.
if "start_date" in globals():
    start_date_param = globals()["start_date"]
if "end_date" in globals():
    end_date_param = globals()["end_date"]
if "max_combinations" in globals():
    max_combinations_param = globals()["max_combinations"]
if "lakehouse_path" in globals():
    lakehouse_path_param = globals()["lakehouse_path"]

# Echo the effective parameters so they are visible in the run output.
print(
    "Parámetros recibidos:\n"
    f"  start_date: '{start_date_param}'\n"
    f"  end_date: '{end_date_param}'\n"
    f"  max_combinations: '{max_combinations_param}'\n"
    f"  lakehouse_path: '{lakehouse_path_param}'"
)

In [None]:
# ============================================
# Configuración y load_config
# ============================================

# Root logging at INFO level; `logger` is the module-level logger used below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Obtain the Spark session for this notebook (created if none exists yet).
spark = (
    SparkSession.builder
    .appName("REData_API_Explorer")
    .getOrCreate()
)


def load_config():
    """Load the external REData configuration, falling back to built-in defaults.

    Reads ``Files/config/redata_config.json`` with ``mssparkutils.fs.head``
    (requesting up to 100000 — presumably bytes; confirm against the
    mssparkutils docs) and parses it as JSON.

    Fallback rules:
      * If the file cannot be read or parsed, or the top-level JSON value is
        not an object, the complete default configuration is returned.
      * If the file is valid but a required section (``"ccaa_ids"`` or
        ``"api_config"``) is missing or is not an object, only that section
        is taken from the defaults.

    Returns:
        dict: configuration that always contains the keys ``"ccaa_ids"``
        (region name -> numeric id) and ``"api_config"``
        (category -> list of widget names).
    """
    config_path = "Files/config/redata_config.json"
    defaults = {
        "ccaa_ids": {
            "Península": 8741,
            "Islas Canarias": 8742,
            "Islas Baleares": 8743,
            "Ceuta": 8744,
            "Melilla": 8745,
            "Andalucía": 4,
            "Aragón": 5,
            "Cantabria": 6,
            "Castilla-La Mancha": 7,
            "Castilla y León": 8,
            "Cataluña": 9,
            "País Vasco": 10,
            "Principado de Asturias": 11,
            "Comunidad de Madrid": 13,
            "Comunidad de Navarra": 14,
            "Comunidad Valenciana": 15,
            "Extremadura": 16,
            "Galicia": 17,
            "La Rioja": 20,
            "Región de Murcia": 21
        },
        "api_config": {
            "balance": ["balance-electrico"],
            "demanda": ["evolucion"],
            "generacion": [
                "estructura-generacion",
                "estructura-generacion-emisiones-asociadas",
                "estructura-renovables",
                "evolucion-renovable-no-renovable"
            ]
        }
    }

    try:
        content = mssparkutils.fs.head(config_path, 100000)
        config_data = json.loads(content)
        # Callers index the result by key, so anything but a JSON object is unusable.
        if not isinstance(config_data, dict):
            raise ValueError("el JSON de configuración debe ser un objeto")
    except Exception as e:
        # Deliberate best-effort: any read/parse problem degrades to the defaults.
        logger.warning(f"⚠️ No se pudo cargar config: {e}")
        logger.warning("Usando valores por defecto")
        return defaults

    # A partially filled file must not break callers that read both sections.
    for key, default_value in defaults.items():
        if not isinstance(config_data.get(key), dict):
            logger.warning(
                f"⚠️ Clave '{key}' ausente o inválida en redata_config.json; usando valor por defecto"
            )
            config_data[key] = default_value

    print("✅ Configuración cargada desde redata_config.json")
    return config_data

In [None]:
# ============================================
# Clase REDataAPIExplorer
# ============================================

class REDataAPIExplorer:
    """Client that pulls REData widgets from ``https://apidatos.ree.es`` into the lakehouse.

    For every (category, widget, region) combination taken from the loaded
    configuration it requests monthly data, stores non-empty responses as JSON
    under ``<base_lakehouse_path>/data/...`` and records every attempt in
    ``<base_lakehouse_path>/logs/success.log`` or ``.../logs/error.log``.
    """

    def __init__(self, base_lakehouse_path: str, start_date: str = None, end_date: str = None):
        """Set up the HTTP session, load the configuration and resolve the date range.

        Args:
            base_lakehouse_path: Root path under which data and logs are written.
            start_date: ISO-8601 start of the range. When empty/None it defaults
                to 365 days before the resolved end date.
            end_date: ISO-8601 end of the range. When empty/None it defaults to
                ``datetime.now()``.

        Raises:
            ValueError: if a supplied date is not a valid ISO-8601 string.
        """
        self.base_url = "https://apidatos.ree.es"
        self.base_lakehouse_path = base_lakehouse_path
        self.session = requests.Session()
        self.headers = {"Accept": "application/json", "Content-Type": "application/json"}

        # Regions and category -> widgets mapping come from the external config.
        config = load_config()
        self.ccaa_ids = config["ccaa_ids"]
        self.api_config = config["api_config"]

        # Each bound is resolved on its own so that a single supplied date is
        # honoured instead of being silently discarded.
        self.end_date = self._parse_iso(end_date) if end_date else datetime.now()
        if start_date:
            self.start_date = self._parse_iso(start_date)
        else:
            self.start_date = self.end_date - timedelta(days=365)

        self.logs_success = f"{self.base_lakehouse_path}/logs/success.log"
        self.logs_error = f"{self.base_lakehouse_path}/logs/error.log"

    @staticmethod
    def _parse_iso(value: str) -> datetime:
        """Parse an ISO-8601 string, accepting a trailing ``Z`` as UTC."""
        return datetime.fromisoformat(value.replace('Z', '+00:00'))

    def _slugify(self, text: str) -> str:
        """Return a lowercase ASCII slug: accents stripped, spaces turned into hyphens."""
        t = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode("ascii")
        return re.sub(r"[^a-z0-9\-]", "", t.lower().replace(" ", "-"))

    def _clean_timestamp(self, timestamp: str) -> str:
        """Make a timestamp safe for a file name (lowercase, ``:`` and ``.`` become ``-``)."""
        return timestamp.lower().replace(":", "-").replace(".", "-")

    def build_api_url(self, lang, category, widget, time_trunc, geo_id, geo_name):
        """Build the request URL for one widget and region.

        The query always uses ``geo_trunc=electric_system`` and ``geo_limit=ccaa``.
        ``geo_name`` is accepted for signature symmetry with the other methods
        but is not part of the URL.
        """
        start_str = self.start_date.strftime("%Y-%m-%dT%H:%M")
        end_str = self.end_date.strftime("%Y-%m-%dT%H:%M")
        base = f"{self.base_url}/{lang}/datos/{category}/{widget}"

        params = {
            "start_date": start_str,
            "end_date": end_str,
            "time_trunc": time_trunc,
            "geo_trunc": "electric_system",
            "geo_limit": "ccaa",
            "geo_ids": str(geo_id)
        }

        query = "&".join(f"{k}={v}" for k, v in params.items())
        return f"{base}?{query}"

    def make_api_request(self, url: str):
        """GET ``url`` and return ``(success, payload)``.

        Returns ``(True, <parsed JSON>)`` on HTTP 200 and ``(False, {})`` on any
        other status or on any exception (network error, invalid JSON, ...).
        Failures are logged as warnings so the cause is not lost.
        """
        try:
            r = self.session.get(url, headers=self.headers, timeout=20)
            if r.status_code == 200:
                return True, r.json()
            logger.warning(f"⚠️ HTTP {r.status_code} en {url}")
            return False, {}
        except Exception as exc:
            # Best-effort by design: a failed request must not abort the whole run.
            logger.warning(f"⚠️ Error en petición {url}: {exc}")
            return False, {}

    def save_to_bronze(self, data, category, widget, region, timestamp, time_trunc,
                       geo_id, geo_name):
        """Write the API response, wrapped with request metadata, as a JSON file.

        The file goes to
        ``<base>/data/<category>/<widget>/<time_trunc>/brz-<region>-...-<timestamp>.json``
        and overwrites any existing file with the same name.
        """
        enriched_data = {
            "request_metadata": {
                "geo_id": geo_id,
                "geo_name": geo_name,
                "geo_limit": "ccaa",
                "category": category,
                "widget": widget,
                "time_trunc": time_trunc,
                "ingestion_timestamp": timestamp,
                "start_date": self.start_date.isoformat(),
                "end_date": self.end_date.isoformat()
            },
            "api_response": data
        }

        clean_ts = self._clean_timestamp(timestamp)
        fname = f"brz-{region}-{category}-{widget}-{time_trunc}-{clean_ts}.json"
        path = f"{self.base_lakehouse_path}/data/{category}/{widget}/{time_trunc}/{fname}"
        mssparkutils.fs.put(path, json.dumps(enriched_data, indent=2, ensure_ascii=False), overwrite=True)
        logger.info(f"✅ Guardado: {path}")

    def log(self, path, msg: str):
        """Append ``msg`` plus a newline to the log file at ``path``.

        First tries to create the file without overwriting. If that fails
        (typically because the file already exists) the current content is
        read back and the file is rewritten with the new line appended.
        """
        try:
            mssparkutils.fs.put(path, msg + "\n", overwrite=False)
        except Exception:
            try:
                # NOTE(review): only the first 1000000 units (presumably bytes)
                # are read back, so a larger log would lose its tail here.
                old = mssparkutils.fs.head(path, 1000000)
            except Exception:
                old = ""
            mssparkutils.fs.put(path, old + msg + "\n", overwrite=True)

    def explore_single(self, lang, category, widget, time_trunc, geo_id, geo_name):
        """Request one endpoint, persist it when it has data, and log the outcome.

        A response counts as having data when the request succeeded and the
        payload is a JSON object with a non-empty ``"included"`` entry.

        Returns:
            dict with keys ``"url"``, ``"success"`` and ``"has_data"``.
        """
        ts = datetime.utcnow().isoformat(timespec="microseconds") + "Z"
        url = self.build_api_url(lang, category, widget, time_trunc, geo_id, geo_name)
        success, data = self.make_api_request(url)
        # Guard against a non-object JSON payload, which has no .get().
        has_data = success and isinstance(data, dict) and bool(data.get("included"))

        region = self._slugify(geo_name)

        if has_data:
            self.save_to_bronze(data, category, widget, region, ts, time_trunc, geo_id, geo_name)
            self.log(self.logs_success, f"{ts} OK {url}")
        else:
            self.log(self.logs_error, f"{ts} FAIL {url}")

        return {"url": url, "success": success, "has_data": has_data}

    def explore_all(self, max_combinations=None):
        """Run ``explore_single`` for every configured category/widget/region.

        Only monthly data (``time_trunc="month"``) in Spanish (``"es"``) is
        requested. A truthy ``max_combinations`` limits the run to the first N
        combinations; ``None`` or ``0`` runs all of them.

        Returns:
            list of the per-request result dicts from ``explore_single``.
        """
        combos = [
            ("es", cat, widget, "month", geo_id, geo_name)
            for cat, widgets in self.api_config.items()
            for widget in widgets
            for geo_name, geo_id in self.ccaa_ids.items()
        ]

        if max_combinations:
            combos = combos[:max_combinations]

        logger.info(f"🚀 Ejecutando {len(combos)} combinaciones (SOLO MENSUALES + CCAA)")
        results = []
        for c in combos:
            results.append(self.explore_single(*c))
            # Short pause between consecutive requests.
            time.sleep(0.2)
        return results

    def analyze_results(self, results):
        """Summarise results as counts: total, ok (HTTP success), with_data and failures."""
        total = len(results)
        ok = sum(r["success"] for r in results)
        with_data = sum(r["has_data"] for r in results)
        return {"total": total, "ok": ok, "with_data": with_data, "failures": total - ok}


In [None]:
def main():
    """Run the REData ingestion with the notebook/pipeline parameters.

    Reads the module-level ``*_param`` values, builds a ``REDataAPIExplorer``,
    executes every combination (optionally capped by ``max_combinations``)
    and prints a summary.

    Returns:
        dict: the summary produced by ``REDataAPIExplorer.analyze_results``.
    """
    # The pipeline may inject max_combinations as an int or as a string, so
    # normalise through str() before validating. Anything that is not a plain
    # non-negative integer means "no limit".
    raw_max = "" if max_combinations_param is None else str(max_combinations_param).strip()
    max_comb = int(raw_max) if raw_max.isdecimal() else None

    explorer = REDataAPIExplorer(
        base_lakehouse_path=lakehouse_path_param,
        start_date=start_date_param or None,
        end_date=end_date_param or None
    )

    print(f"🚀 Iniciando ingesta REData")
    print(f"📅 Rango: {explorer.start_date} - {explorer.end_date}")
    print(f"🔢 Max combinaciones: {max_comb or 'Todas'}")
    print(f"✅ SOLO MENSUALES + geo_limit=ccaa")
    print(f"❌ Endpoints excluidos: generacion/maxima-renovable, mercados/componentes-precio")

    results = explorer.explore_all(max_combinations=max_comb)
    summary = explorer.analyze_results(results)
    print("📊 Resumen:", summary)
    return summary

# Entry point: run the ingestion only when this code executes as the main
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()