In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from notebookutils import mssparkutils
import json

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 35, Finished, Available, Finished)

In [34]:
# Parámetros
bronze_path_param = "Files/bronze/REDATA/data"
category_param = ""
widget_param = ""

if 'bronze_path' in locals() or 'bronze_path' in globals():
    bronze_path_param = bronze_path if 'bronze_path' in locals() else globals().get('bronze_path', bronze_path_param)
if 'category' in locals() or 'category' in globals():
    category_param = category if 'category' in locals() else globals().get('category', "")
if 'widget' in locals() or 'widget' in globals():
    widget_param = widget if 'widget' in locals() else globals().get('widget', "")

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 36, Finished, Available, Finished)

In [35]:
spark = SparkSession.builder.appName("REData_JSON_to_Delta").getOrCreate()

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 37, Finished, Available, Finished)

In [36]:
def list_categories():
    try:
        files = mssparkutils.fs.ls(bronze_path_param)
        return [f.name for f in files if f.isDir]
    except:
        return []

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 38, Finished, Available, Finished)

In [37]:
def list_widgets(category):
    try:
        path = f"{bronze_path_param}/{category}"
        files = mssparkutils.fs.ls(path)
        return [f.name for f in files if f.isDir]
    except:
        return []

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 39, Finished, Available, Finished)

In [38]:
def has_nested_content(df):
    try:
        sample = df.select("api_response.included").first()
        if not sample or not sample["included"] or len(sample["included"]) == 0:
            return False
        first_item = sample["included"][0]
        if hasattr(first_item, 'asDict'):
            first_dict = first_item.asDict()
            return "content" in first_dict.get("attributes", {})
        return "content" in first_item.get("attributes", {})
    except:
        return False

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 40, Finished, Available, Finished)

In [39]:
def process_widget(category, widget):
    print(f"\n📦 {category}/{widget}")
    
    json_paths = []
    for time_trunc in ["day", "month"]:
        path = f"{bronze_path_param}/{category}/{widget}/{time_trunc}"
        try:
            files = mssparkutils.fs.ls(path)
            json_paths.extend([f"{path}/{f.name}" for f in files if f.name.endswith('.json')])
        except:
            continue
    
    if not json_paths:
        print(f"⚠️ Sin archivos")
        return None
    
    print(f"📄 {len(json_paths)} archivos")
    df_raw = spark.read.option("multiline", "true").json(json_paths)
    
    # Metadata
    df_meta = df_raw.select(
        col("request_metadata.geo_id").alias("geo_id"),
        col("request_metadata.geo_name").alias("geo_name"),
        col("request_metadata.geo_limit").alias("geo_limit"),
        col("request_metadata.ccaa_name").alias("ccaa_name"),
        col("request_metadata.time_trunc").alias("time_trunc"),
        col("request_metadata.ingestion_timestamp").alias("ingestion_timestamp"),
        col("api_response.included").alias("included")
    )
    
    df_included = df_meta.select("*", explode("included").alias("series")).drop("included")
    has_content = has_nested_content(df_raw)
    
    if has_content:
        # balance-electrico: included → content → values
        df_step1 = df_included.select(
            "geo_id", "geo_name", "geo_limit", "ccaa_name", "time_trunc", "ingestion_timestamp",
            col("series.type").alias("series_type"),
            col("series.attributes.title").alias("series_title"),
            col("series.attributes.content").alias("content")
        )
        
        df_step2 = df_step1.select("*", explode("content").alias("metric")).drop("content")
        
        df_step3 = df_step2.select(
            "geo_id", "geo_name", "geo_limit", "ccaa_name", "time_trunc", "ingestion_timestamp",
            "series_type", "series_title",
            col("metric.type").alias("metric_type"),
            col("metric.attributes.title").alias("metric_title"),
            col("metric.attributes.composite").alias("is_composite"),
            col("metric.attributes.values").alias("values")
        )
        
        df_step4 = df_step3.select("*", explode("values").alias("val")).drop("values")
        
        df_final = df_step4.select(
            "geo_id", "geo_name", "geo_limit", "ccaa_name", "time_trunc",
            "series_type", "series_title", "metric_type", "metric_title", "is_composite",
            to_timestamp(col("val.datetime")).alias("datetime"),
            col("val.value").cast("double").alias("value"),
            col("val.percentage").cast("double").alias("percentage"),
            col("ingestion_timestamp").cast("timestamp").alias("ingestion_timestamp")
        )
    else:
        # Resto: included → values directo
        df_step1 = df_included.select(
            "geo_id", "geo_name", "geo_limit", "ccaa_name", "time_trunc", "ingestion_timestamp",
            col("series.type").alias("series_type"),
            col("series.attributes.title").alias("series_title"),
            col("series.attributes.type").alias("metric_type"),
            col("series.attributes.composite").alias("is_composite"),
            col("series.attributes.values").alias("values")
        )
        
        df_step2 = df_step1.select("*", explode("values").alias("val")).drop("values")
        
        df_final = df_step2.select(
            "geo_id", "geo_name", "geo_limit", "ccaa_name", "time_trunc",
            "series_type", "series_title", "metric_type",
            col("series_title").alias("metric_title"),
            "is_composite",
            to_timestamp(col("val.datetime")).alias("datetime"),
            col("val.value").cast("double").alias("value"),
            col("val.percentage").cast("double").alias("percentage"),
            col("ingestion_timestamp").cast("timestamp").alias("ingestion_timestamp")
        )
    
    # Limpiar y deduplicar
    df_clean = df_final.filter(col("datetime").isNotNull() & col("value").isNotNull())
    df_dedup = df_clean.dropDuplicates(["geo_id", "time_trunc", "series_type", "datetime"])
    
    # Guardar
    table_name = f"brz_redata_{category}_{widget}".replace("-", "_")
    df_dedup.write.format("delta").mode("overwrite") \
        .option("mergeSchema", "true").option("overwriteSchema", "true") \
        .saveAsTable(table_name)
    
    count = df_dedup.count()
    print(f"✅ {table_name} ({count} reg)")
    return table_name

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 41, Finished, Available, Finished)

In [40]:
def main():
    print("🚀 JSON → Delta")
    print(f"📂 {bronze_path_param}")
    
    if category_param and widget_param:
        categories_to_process = [(category_param, [widget_param])]
    elif category_param:
        widgets = list_widgets(category_param)
        categories_to_process = [(category_param, widgets)]
    else:
        categories = list_categories()
        categories_to_process = [(cat, list_widgets(cat)) for cat in categories]
    
    tables_created = []
    for category, widgets in categories_to_process:
        for widget in widgets:
            try:
                table = process_widget(category, widget)
                if table:
                    tables_created.append(table)
            except Exception as e:
                print(f"❌ {category}/{widget}: {str(e)}")
    
    print(f"\n📊 {len(tables_created)} tablas")
    return {"tables_created": len(tables_created), "tables": tables_created}

if __name__ == "__main__":
    main()

StatementMeta(, 432c2168-66c1-419f-862d-708e5d7fcc09, 42, Finished, Available, Finished)

🚀 JSON → Delta
📂 Files/bronze/REDATA/data

📦 balance/balance-electrico
📄 49 archivos
✅ brz_redata_balance_balance_electrico (6636 reg)

📦 demanda/evolucion
📄 34 archivos
✅ brz_redata_demanda_evolucion (2035 reg)

📦 generacion/estructura-generacion
📄 34 archivos
✅ brz_redata_generacion_estructura_generacion (15466 reg)

📦 generacion/estructura-generacion-emisiones-asociadas
📄 34 archivos
✅ brz_redata_generacion_estructura_generacion_emisiones_asociadas (13415 reg)

📦 generacion/estructura-renovables
📄 33 archivos
✅ brz_redata_generacion_estructura_renovables (8148 reg)

📦 generacion/evolucion-renovable-no-renovable
📄 34 archivos
✅ brz_redata_generacion_evolucion_renovable_no_renovable (3730 reg)

📦 generacion/maxima-renovable
📄 5 archivos
✅ brz_redata_generacion_maxima_renovable (52 reg)

📦 mercados/componentes-precio
📄 1 archivos
✅ brz_redata_mercados_componentes_precio (72 reg)

📊 8 tablas
