In [None]:
import os, re, unicodedata
from datetime import datetime
from pyspark.sql import SparkSession, functions as F
from notebookutils import mssparkutils

In [None]:
# ---------- Spark ----------
# Reuse the Spark session that is already active in the notebook, or create
# one if none exists yet.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Select the target Lakehouse (bronze): make lh_bronze the current database so
# that unqualified table names used afterwards resolve against it.
spark.sql("USE lh_bronze")

In [None]:
# 📂 Bronze folders (where the JSON files already live)
# Logical path: the form passed to mssparkutils.fs.ls when listing the folders.
bronze_base_logical = "Files/bronze/ESIOS/data/balance/balance-electrico"
# Physical path: the same folder under the /lakehouse/default prefix.
# NOTE(review): presumably the local mount of the default lakehouse; it is not
# referenced by any cell visible here - confirm it is still needed.
bronze_base_physical = "/lakehouse/default/" + bronze_base_logical

In [None]:
# 📂 Output (Tables/)
# NOTE(review): tables_prefix is not referenced by any cell visible here (the
# load writes through saveAsTable) - confirm whether it is still needed.
tables_prefix = "Tables"

In [None]:
# --------------------------
# Función utilitaria mejorada
# --------------------------
def slugify(name: str) -> str:
    """Turn a label into a lowercase ASCII fragment usable in a table name.

    The text is decomposed with NFKD so accented letters split into a base
    letter plus combining marks, the non-ASCII marks are dropped, the result
    is lowercased, and every character outside ``[a-z0-9_]`` is replaced by
    ``"_"``. Runs of replaced characters are not collapsed: each one yields
    its own underscore.

    Args:
        name: Label to convert. A non-string value (for example a numeric
            indicator id used as a fallback label) is converted with
            ``str()`` first instead of failing inside ``unicodedata``.

    Returns:
        The slugified text; an empty input gives an empty string.

    Raises:
        TypeError: If ``name`` is ``None``.
    """
    if name is None:
        # str(None) would silently produce the slug "none"; keep failing loudly.
        raise TypeError("slugify() requires a value, got None")
    if not isinstance(name, str):
        name = str(name)

    # Split accented characters into base letter + combining mark ...
    decomposed = unicodedata.normalize("NFKD", name)
    # ... then drop everything that is not plain ASCII (removes the marks).
    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8")
    # Replace every remaining invalid character with "_".
    return re.sub(r'[^a-z0-9_]', '_', ascii_only.lower())

In [None]:
# --------------------------
# Processing
# --------------------------
# Walks <bronze base>/<granularity>/<year>/*.json, flattens indicator.values
# from every file and appends the rows to one Delta table per
# (indicator label, granularity).
#
# NOTE(review): the write uses mode("append"), so running this cell again adds
# the same rows a second time. Confirm whether the tables are cleared upstream
# or whether the load has to become idempotent.
total_rows = 0

# List granularities (day, month)
granularities = [g.name for g in mssparkutils.fs.ls(bronze_base_logical) if g.isDir]

for trunc in granularities:
    years_dirs = mssparkutils.fs.ls(f"{bronze_base_logical}/{trunc}")
    for year_dir in years_dirs:
        # Only numeric year folders are valid here; a stray file or any other
        # folder would otherwise abort the whole run at int(year).
        year = year_dir.name.rstrip("/")
        try:
            year_value = int(year)
        except ValueError:
            year_value = None
        if not year_dir.isDir or year_value is None:
            print(f"⚠️ Entrada ignorada (no es una carpeta de año): {year_dir.path}")
            continue

        for json_file in mssparkutils.fs.ls(year_dir.path):
            if not json_file.path.endswith(".json"):
                continue

            print(f"\n🔎 Procesando {json_file.path}")

            # Read the JSON document (one document may span several lines)
            df_raw = spark.read.option("multiline", True).json(json_file.path)

            # Inspect the inferred schema of the indicator struct once.
            indicator_schema = df_raw.select("indicator.*").schema
            indicator_fields = indicator_schema.fieldNames()
            has_short_name = "short_name" in indicator_fields

            # Fetch id (and short_name when present) in a single Spark job.
            meta_cols = ["indicator.id"]
            if has_short_name:
                meta_cols.append("indicator.short_name")
            meta_row = df_raw.select(*meta_cols).first()

            indicator_id = meta_row[0]
            short_name = meta_row[1] if has_short_name else None
            # Fall back to the id when short_name is missing or null. The id is
            # converted to text so the label is always a string, both for the
            # indicator_type column and for slugify().
            indicator_type = short_name if short_name is not None else str(indicator_id)

            # An empty "values": [] is inferred as array<string>, and a missing
            # key has no field at all; in both cases explode + "val.*" would
            # raise before any emptiness check could run. Check the schema first.
            values_is_struct_array = False
            if "values" in indicator_fields:
                values_type = indicator_schema["values"].dataType
                values_is_struct_array = (
                    values_type.typeName() == "array"
                    and values_type.elementType.typeName() == "struct"
                )
            if not values_is_struct_array:
                print("⚠️ JSON vacío (sin values), skipping")
                continue

            values = df_raw.selectExpr("explode(indicator.values) as val") \
                           .select("val.*")

            # Count once up front: the number doubles as the emptiness check and
            # as the reported row count, so the file is not read again after the
            # write.
            count = values.count()
            if count == 0:
                print("⚠️ JSON vacío (sin values), skipping")
                continue

            # Enrich with metadata and convert the datetime text
            df = (values
                  .withColumn("indicator_id", F.lit(indicator_id))
                  .withColumn("indicator_type", F.lit(indicator_type))
                  .withColumn("year", F.lit(year_value))
                  .withColumn("time_trunc", F.lit(trunc))
                  # Expose datetime as a timestamp column named "fecha"
                  .withColumn("fecha", F.to_timestamp(F.col("datetime"), "yyyy-MM-dd'T'HH:mm:ss.SSSXXX"))
                  .drop("datetime")
                 )

            # Table name is derived from the indicator label and the granularity
            indicator_slug = slugify(indicator_type)
            table_name = f"brz_esios_balance_{indicator_slug}_{trunc}"

            # Append to a managed Delta table in the session's current database
            df.write.format("delta").mode("append").saveAsTable(table_name)

            total_rows += count
            print(f"✅ Guardado en tabla {table_name} ({count} registros)")

print(f"\nProceso terminado - {datetime.utcnow().isoformat()}Z")
print(f"Total registros procesados: {total_rows}")