# OMIE Delta Transformation (Bronze to Silver)

Transforms OMIE Bronze Delta tables into clean Silver layer tables.

**Input:** Delta tables `brz_OMIE_2023`, `brz_OMIE_2024`, `brz_OMIE_2025`  
**Output:** Silver tables `slv_OMIE_2023`, `slv_OMIE_2024`, `slv_OMIE_2025`

Follows the same pattern used by the REDATA team for nested JSON processing.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from notebookutils import mssparkutils
import json

# Configurable parameters: table-name prefixes for the Bronze and Silver layers.
bronze_table_prefix = "brz_OMIE"
silver_table_prefix = "slv_OMIE"

# Years to process. A ``years`` variable that already exists in the notebook
# namespace overrides the default list.
years_param = globals().get("years", [2023, 2024, 2025])

# Reuse the active Spark session when there is one, otherwise create it.
spark = SparkSession.builder.appName("OMIE_Bronze_to_Silver").getOrCreate()

print("🚀 OMIE Delta Transformation Started")
print(f"✅ Spark session: {spark.version}")
print(f"🎯 Años a procesar: {years_param}")

In [None]:
def list_bronze_tables():
    """Return the names of the tables whose name starts with the Bronze prefix.

    Runs ``SHOW TABLES`` through the module-level ``spark`` session and keeps
    the entries whose ``tableName`` starts with ``bronze_table_prefix``.

    Returns:
        A list of table names. Any failure is reported on stdout and results
        in an empty list instead of an exception.
    """
    try:
        table_names = (
            entry.tableName for entry in spark.sql("SHOW TABLES").collect()
        )
        return [
            name for name in table_names
            if name.startswith(bronze_table_prefix)
        ]
    except Exception as err:
        print(f"⚠️  Error listando tablas: {err}")
        return []

def validate_bronze_table(table_name):
    """Check that a Bronze table can be read and contains at least one row.

    Args:
        table_name: Name of the table to load with ``spark.table``.

    Returns:
        A ``(is_valid, row_count)`` tuple. ``is_valid`` is True only when the
        table holds one or more rows. If counting fails, the error is printed
        and ``(False, 0)`` is returned.
    """
    try:
        row_total = spark.table(table_name).count()
    except Exception as err:
        print(f"❌ Error validando {table_name}: {err}")
        return False, 0
    return row_total > 0, row_total

# Discover the Bronze tables that are available in the catalog.
available_bronze_tables = list_bronze_tables()
print(f"📋 Tablas Bronze encontradas: {available_bronze_tables}")

# Keep only the requested years whose Bronze table exists and has rows.
# Each entry is (table_name, year, record_count).
# NOTE: the loop variable is deliberately not called ``year``; that name is
# bound to pyspark.sql.functions.year by the star import and a module-level
# loop variable would overwrite it.
tables_to_process = []
for requested_year in years_param:
    table_name = f"{bronze_table_prefix}_{requested_year}"
    if table_name not in available_bronze_tables:
        print(f"❌ {table_name}: No encontrada")
        continue

    is_valid, record_count = validate_bronze_table(table_name)
    if is_valid:
        tables_to_process.append((table_name, requested_year, record_count))
        print(f"✅ {table_name}: {record_count:,} registros")
    else:
        print(f"⚠️  {table_name}: Sin datos válidos")

if not tables_to_process:
    print("❌ No hay tablas Bronze válidas para procesar")
    # Use whichever notebook utility object the runtime provides: ``dbutils``
    # when it exists, otherwise the ``mssparkutils`` imported by this notebook.
    # The lookup goes through globals() so a missing name cannot raise, and the
    # exit call is not wrapped in a broad ``except`` so an exception raised by
    # exit() itself is not swallowed.
    notebook_utils = globals().get("dbutils") or globals().get("mssparkutils")
    if notebook_utils is not None:
        notebook_utils.notebook.exit("Sin tablas Bronze")
    else:
        print("Terminando ejecución local")

print(f"\n🎯 Tablas a transformar: {len(tables_to_process)}")

In [None]:
def extract_omie_price_data(raw_content):
    """Extract ``(hour, price)`` pairs from raw comma-separated content.

    Placeholder parser: the real OMIE file layout still has to be confirmed,
    so this only understands lines of the form ``<hour>,<price>[,...]``.
    Lines that do not match (headers, blank lines, malformed rows) are
    skipped silently.

    Parsing rules:
      * Lines are split with ``str.splitlines`` so ``\\r\\n`` endings work.
      * Fields are stripped of surrounding whitespace before validation.
      * The hour must consist of digits only.
      * The price is an optionally negative decimal number using ``.`` as the
        decimal separator. A decimal comma cannot be supported while ``,`` is
        the field delimiter.

    Args:
        raw_content: Text content to parse, or ``None``.

    Returns:
        A list of ``(hour, price)`` tuples (``int``, ``float``) in input
        order, or ``None`` when ``raw_content`` is not a string or no line
        yields a valid pair.
    """
    if not isinstance(raw_content, str):
        return None

    price_data = []
    for line in raw_content.splitlines():
        fields = [field.strip() for field in line.split(",")]
        if len(fields) < 2:
            continue

        hour_text, price_text = fields[0], fields[1]
        if not hour_text.isdigit():
            continue

        # Validate the magnitude separately so a leading minus sign is allowed
        # while exponents, "nan"/"inf" and multiple dots are still rejected.
        unsigned = price_text[1:] if price_text.startswith("-") else price_text
        if not unsigned.replace(".", "", 1).isdigit():
            continue

        try:
            price_data.append((int(hour_text), float(price_text)))
        except ValueError:
            # str.isdigit() accepts some characters (e.g. superscripts) that
            # int()/float() cannot convert; treat those rows as malformed.
            continue

    return price_data or None

def transform_bronze_to_silver(bronze_table_name, year):
    """Transform one OMIE Bronze table into its Silver DataFrame.

    Pipeline:
      1. Drop rows whose ``source_file`` is null or ``"unknown"``.
      2. Derive ``price_date`` (``extraction_date_parsed``, falling back to
         ``extraction_date`` parsed as ``yyyyMMdd``) and calendar attributes.
      3. Add placeholder price columns (constant values until the real OMIE
         payload is parsed) and a price category.
      4. Classify the demand period from ``is_weekend`` and ``price_hour``.
      5. Build ``price_datetime`` from ``price_date`` and ``price_hour``.
      6. Add data-quality and processing-metadata columns.
      7. Select the Silver columns and drop duplicates on
         ``(source_file, price_date)``.

    Args:
        bronze_table_name: Name of the Bronze table, read via ``spark.table``.
        year: Year the table belongs to. It is not used by the transformation
            (the ``year`` column is derived from ``price_date``); it is kept
            so existing callers keep working.

    Returns:
        A ``(df_silver, row_count)`` tuple with the deduplicated Silver
        DataFrame and its number of rows.
    """
    # The ``year`` parameter shadows pyspark.sql.functions.year inside this
    # function, so every Spark SQL function is reached through an explicit
    # alias instead of the star-imported bare names.
    from pyspark.sql import functions as F

    print(f"\n🔄 Transformando {bronze_table_name} → Silver")

    # Load the Bronze data and report its size and schema.
    df_bronze = spark.table(bronze_table_name)
    print(f"   📥 Registros Bronze: {df_bronze.count():,}")
    print("   📋 Esquema Bronze:")
    df_bronze.printSchema()

    print("   🔧 Aplicando transformaciones Silver...")

    # 1. Basic cleaning: keep only rows with a usable source file.
    df_clean = df_bronze.filter(
        F.col("source_file").isNotNull() & (F.col("source_file") != "unknown")
    )

    # 2. Price date and calendar attributes.
    price_date = F.col("price_date")
    df_with_dates = (
        df_clean
        .withColumn(
            "price_date",
            F.coalesce(
                F.col("extraction_date_parsed"),
                F.to_date(F.col("extraction_date"), "yyyyMMdd"),
            ),
        )
        .withColumn("year", F.year(price_date))
        .withColumn("month", F.month(price_date))
        .withColumn("day", F.dayofmonth(price_date))
        .withColumn("quarter", F.quarter(price_date))
        .withColumn("day_of_week", F.dayofweek(price_date))
        # dayofweek: Sunday=1, Saturday=7
        .withColumn("is_weekend", F.col("day_of_week").isin([1, 7]))
        .withColumn(
            "season",
            F.when(F.col("month").isin([12, 1, 2]), "Winter")
            .when(F.col("month").isin([3, 4, 5]), "Spring")
            .when(F.col("month").isin([6, 7, 8]), "Summer")
            .otherwise("Autumn"),
        )
    )

    # 3. Placeholder price data.
    # NOTE: these constants must be replaced by values extracted from the raw
    # content once the real OMIE format is handled.
    df_with_prices = (
        df_with_dates
        .withColumn("price_hour", F.lit(12))
        .withColumn("marginal_price_eur_mwh", F.lit(50.0))
        .withColumn("energy_volume_mwh", F.lit(1000.0))
        .withColumn(
            "price_category",
            F.when(F.col("marginal_price_eur_mwh") > 80, "High")
            .when(F.col("marginal_price_eur_mwh") > 40, "Medium")
            .otherwise("Low"),
        )
    )

    # 4. Demand-period classification.
    # The period is derived from the row's own ``price_hour``. Using the
    # current wall-clock hour would make the result depend on when the job
    # happens to run. Branch order matters: hours 8 and 22 match "Peak" first.
    is_weekday = F.col("is_weekend") == False  # noqa: E712 (Spark column comparison)
    price_hour = F.col("price_hour")
    df_with_periods = df_with_prices.withColumn(
        "demand_period",
        F.when(is_weekday & price_hour.between(8, 22), "Peak")
        .when(
            is_weekday
            & (price_hour.between(6, 8) | price_hour.between(22, 24)),
            "Shoulder",
        )
        .otherwise("Off-Peak"),
    )

    # 5. Combined timestamp built from the price date and the price hour.
    df_with_timestamp = df_with_periods.withColumn(
        "price_datetime",
        F.to_timestamp(
            F.concat(
                F.date_format(price_date, "yyyy-MM-dd"),
                F.lit(" "),
                F.format_string("%02d:00:00", price_hour),
            ),
            "yyyy-MM-dd HH:mm:ss",
        ),
    )

    # 6. Data-quality indicators.
    df_with_quality = (
        df_with_timestamp
        .withColumn(
            "data_completeness_score",
            F.when(price_date.isNotNull(), 1.0)
            .when(F.col("extraction_date").isNotNull(), 0.8)
            .otherwise(0.3),
        )
        # Placeholder until anomaly detection is implemented.
        .withColumn("is_anomaly", F.lit(False))
        .withColumn("confidence_level", F.lit("High"))
    )

    # 7. Processing metadata.
    df_final = (
        df_with_quality
        .withColumn("silver_processed_at", F.current_timestamp())
        .withColumn("silver_processing_version", F.lit("1.0"))
    )

    # 8. Final Silver column set, in output order.
    silver_columns = [
        # Time dimensions
        "price_date", "price_hour", "price_datetime",
        "year", "month", "day", "quarter", "day_of_week", "is_weekend", "season",
        # Price metrics
        "marginal_price_eur_mwh", "energy_volume_mwh", "price_category",
        # Business classifications
        "demand_period",
        # Data quality
        "data_completeness_score", "is_anomaly", "confidence_level",
        # Lineage and metadata
        "source_file", "source_url", "extraction_year", "extraction_date",
        "ingested_at", "silver_processed_at", "silver_processing_version",
    ]
    df_silver = df_final.select(*silver_columns)

    # Deduplicate by source file and date.
    # NOTE(review): once real hourly prices are extracted this key would
    # collapse all hours of a day into one row - confirm whether
    # ``price_hour`` should join the key at that point.
    df_deduped = df_silver.dropDuplicates(["source_file", "price_date"])

    final_count = df_deduped.count()
    print(f"   📤 Registros Silver: {final_count:,}")

    return df_deduped, final_count

# Status message: the transformation functions of this cell are now defined.
print("✅ Funciones de transformación definidas")

In [None]:
# -------------------------------
# Process each Bronze table into Silver
# -------------------------------

# One summary dict per Silver table that was written successfully.
silver_tables_created = []
total_silver_records = 0

# NOTE: the loop variable is deliberately not called ``year``; that name is
# bound to pyspark.sql.functions.year by the star import and a module-level
# loop variable would overwrite it.
for bronze_table, table_year, bronze_count in tables_to_process:
    print(f"\n📦 Procesando {bronze_table} (año {table_year})")

    try:
        # Apply the transformations.
        df_silver, silver_count = transform_bronze_to_silver(bronze_table, table_year)

        silver_table_name = f"{silver_table_prefix}_{table_year}"
        print(f"   💾 Guardando tabla Silver: {silver_table_name}")

        # Save as a Delta table partitioned by year and month, replacing any
        # previous content and schema.
        (
            df_silver.write.format("delta")
            .mode("overwrite")
            .option("mergeSchema", "true")
            .option("overwriteSchema", "true")
            .partitionBy("year", "month")
            .saveAsTable(silver_table_name)
        )

        # Verify the written table by counting its rows.
        final_verification = spark.table(silver_table_name).count()
        print(f"   ✅ {silver_table_name} creada ({final_verification:,} registros)")

        # Show a small sample of the Silver data.
        print("   🔍 Muestra Silver:")
        spark.table(silver_table_name).select(
            "price_date", "price_hour", "marginal_price_eur_mwh",
            "season", "demand_period", "data_completeness_score",
        ).show(3, truncate=False)

        silver_tables_created.append({
            "bronze_table": bronze_table,
            "silver_table": silver_table_name,
            "year": table_year,
            "bronze_records": bronze_count,
            "silver_records": final_verification,
        })
        total_silver_records += final_verification

    except Exception as e:
        # Best effort: report the failure and carry on with the next table.
        print(f"   ❌ Error procesando {bronze_table}: {str(e)}")
        import traceback
        traceback.print_exc()
        continue

print("\n🎉 Transformación Bronze→Silver completada!")
print(f"📊 Tablas Silver creadas: {len(silver_tables_created)}")
print(f"📈 Total registros Silver: {total_silver_records:,}")

In [None]:
# -------------------------------
# Final validation and statistics
# -------------------------------

# Explicit alias for the Spark SQL functions so the aggregations below do not
# depend on star-imported names such as ``min``/``max`` that shadow builtins.
from pyspark.sql import functions as F

if silver_tables_created:
    print("📋 Resumen de transformación OMIE Bronze→Silver:")

    for table_info in silver_tables_created:
        bronze_table = table_info["bronze_table"]
        silver_table = table_info["silver_table"]
        # Not named ``year`` so the star-imported pyspark ``year`` function is
        # not overwritten at module scope.
        table_year = table_info["year"]
        bronze_records = table_info["bronze_records"]
        silver_records = table_info["silver_records"]

        transformation_ratio = (silver_records / bronze_records * 100) if bronze_records > 0 else 0

        print(f"\n📅 Año {table_year}:")
        print(f"   🔧 {bronze_table} → {silver_table}")
        print(f"   📊 {bronze_records:,} → {silver_records:,} registros ({transformation_ratio:.1f}%)")

        # Silver quality statistics (best effort: failures are only reported).
        try:
            silver_df = spark.table(silver_table)

            quality_stats = silver_df.select(
                F.avg("data_completeness_score").alias("avg_completeness"),
                F.countDistinct("source_file").alias("unique_files"),
                F.min("price_date").alias("min_date"),
                F.max("price_date").alias("max_date"),
                F.avg("marginal_price_eur_mwh").alias("avg_price"),
            ).collect()[0]

            print(f"   🎯 Completeness promedio: {quality_stats['avg_completeness']:.2f}")
            print(f"   📁 Archivos únicos: {quality_stats['unique_files']}")
            print(f"   📅 Rango: {quality_stats['min_date']} a {quality_stats['max_date']}")
            print(f"   💰 Precio promedio: {quality_stats['avg_price']:.2f} EUR/MWh")

            # Distribution by demand period. The fields are read by key:
            # attribute access ``row.count`` would return the tuple method
            # ``count`` of the Row instead of the ``count`` column.
            demand_dist = silver_df.groupBy("demand_period").count().collect()
            demand_counts = {row["demand_period"]: row["count"] for row in demand_dist}
            print(f"   ⚡ Distribución demanda: {demand_counts}")

        except Exception as e:
            print(f"   ⚠️  Error en estadísticas: {e}")

    # Create a unified Silver view when more than one table was produced.
    if len(silver_tables_created) > 1:
        print("\n🔄 Creando vista Silver unificada...")

        try:
            union_query = " UNION ALL ".join([
                f"SELECT * FROM {t['silver_table']}" for t in silver_tables_created
            ])

            unified_view_name = "slv_OMIE_all_years"
            spark.sql(f"CREATE OR REPLACE VIEW {unified_view_name} AS {union_query}")

            # Consolidated statistics over all years.
            unified_stats = spark.sql(f"""
                SELECT 
                    COUNT(*) as total_records,
                    MIN(year) as min_year,
                    MAX(year) as max_year,
                    AVG(data_completeness_score) as avg_quality,
                    AVG(marginal_price_eur_mwh) as avg_price
                FROM {unified_view_name}
            """)

            print(f"   📊 Vista unificada: {unified_view_name}")
            unified_stats.show()

        except Exception as e:
            print(f"   ⚠️  Error creando vista unificada: {e}")

    # Executive summary
    print("\n📊 Resumen Ejecutivo:")
    print(f"   🏗️  Tablas Silver: {len(silver_tables_created)}")
    print(f"   📅 Años procesados: {sorted([t['year'] for t in silver_tables_created])}")
    print(f"   📈 Total registros: {total_silver_records:,}")
    print("   🎯 Transformación: Bronze → Silver completada")

    result_summary = {
        "tables_created": len(silver_tables_created),
        "years_processed": [t['year'] for t in silver_tables_created],
        "total_records": total_silver_records
    }

else:
    print("❌ No se crearon tablas Silver")
    result_summary = {"error": "No tables created"}

print("\n🏁 Finalizando transformación Delta...")

# Pipeline output: hand the JSON summary to whichever notebook utility object
# the runtime provides (``dbutils`` when it exists, otherwise the
# ``mssparkutils`` imported by this notebook). The lookup goes through
# globals() so a missing name cannot raise, and the exit call is not wrapped
# in a broad ``except`` so an exception raised by exit() is not swallowed.
notebook_utils = globals().get("dbutils") or globals().get("mssparkutils")
if notebook_utils is not None:
    notebook_utils.notebook.exit(json.dumps(result_summary))
else:
    print(f"✅ Ejecución local completada: {result_summary}")