In [0]:
# Módulos necesarios
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [0]:
%sql
-- Preview the first rows of the Bronze table before transforming it.
SELECT *
FROM workspace.default.bronze_2023_2024
LIMIT 5;

In [0]:
# -----------------------------
# 1) Load Bronze
# -----------------------------
df_bronze = spark.table("workspace.default.bronze_2023_2024")

# -----------------------------
# 2) Convert numeric columns from STRING to DOUBLE
# -----------------------------
# Every column whose name ends in _min or _max is treated as numeric.
numeric_cols = [c for c in df_bronze.columns if c.endswith(('_min', '_max'))]


def _to_double_or_zero(name):
    """Build the cast expression for one column.

    Empty strings are mapped to NULL before the cast; any NULL left after
    casting to double is then replaced by 0.
    """
    non_empty = F.when(F.col(name) == "", None).otherwise(F.col(name))
    return F.coalesce(non_empty.cast('double'), F.lit(0))


# Apply every cast in a single projection. Calling withColumn once per column
# in a loop stacks one projection per call, which the Spark documentation warns
# can produce very large plans. withColumns keeps column order and names intact.
# NOTE(review): DataFrame.withColumns needs Spark 3.3+ — confirm the runtime.
df_typed = df_bronze
if numeric_cols:
    df_typed = df_bronze.withColumns(
        {c: _to_double_or_zero(c) for c in numeric_cols}
    )
# -----------------------------
# 3) Add aggregated columns based on name prefixes
# -----------------------------
# For each family, every column named <family>_*_min is summed into
# <family>_min, and every <family>_*_max into <family>_max.
AGG_FAMILIES = ["spps"]

df_aggregated = df_typed
typed_columns = df_typed.columns

for family in AGG_FAMILIES:
    prefix = f"{family}_"
    family_cols = [name for name in typed_columns if name.startswith(prefix)]
    min_cols = [name for name in family_cols if name.endswith("_min")]
    max_cols = [name for name in family_cols if name.endswith("_max")]

    # Process min before max so the new columns are appended in that order.
    for target, members in ((f"{family}_min", min_cols), (f"{family}_max", max_cols)):
        if not members:
            # No matching columns: the aggregated column is not created.
            continue
        df_aggregated = df_aggregated.withColumn(
            target, sum(F.col(name) for name in members)
        )

# -----------------------------
# 4) Build the list of columns to keep
# -----------------------------
# Identifier columns come first, followed by a (min, max) pair for each prefix:
# the hard-coded prefixes, then every aggregated family, then the grand total.
_id_cols = ['consecutivo', 'claves', 'descripcion']
_range_prefixes = [
    'imss',
    'insabi',
    'issste',
    'ccinshae',
    'sedena',
    'semar',
    'guardia_nacional',
    'oadprs',
    *AGG_FAMILIES,
    'total_general',
]

base_cols = _id_cols + [
    f"{prefix}_{bound}"
    for prefix in _range_prefixes
    for bound in ('min', 'max')
]

# -----------------------------
# 5) Select the final columns and clean up `descripcion`
# -----------------------------
# Show the available columns so a failing select below is easy to diagnose.
print(list(df_aggregated.columns))
df_pre_silver = df_aggregated.select(*base_cols)

# Cleaning order matters: strip quote characters, collapse every whitespace
# run (spaces, tabs, newlines) into one space, and only then trim. F.trim
# removes space characters only, so trimming before the collapse would leave
# a stray leading/trailing space wherever the text started or ended with a
# tab or newline.
descripcion_clean = F.trim(
    F.regexp_replace(
        F.regexp_replace(F.col('descripcion'), '["\']', ''),
        r'\s+',
        ' ',
    )
)

df_silver = (
    df_pre_silver
    .withColumn('descripcion', descripcion_clean)
    # Rename Bronze column names to the names used in the Silver table.
    .withColumnRenamed('claves', 'clave')
    .withColumnRenamed('insabi_min', 'imss_bienestar_min')
    .withColumnRenamed('insabi_max', 'imss_bienestar_max')
    .withColumnRenamed('spps_min', 'salud_spps_min')
    .withColumnRenamed('spps_max', 'salud_spps_max')
    .withColumnRenamed('total_general_min', 'totales_min')
    .withColumnRenamed('total_general_max', 'totales_max')
)

df_silver.limit(10).display()

In [0]:
# Persist the Silver DataFrame as a Delta table, overwriting existing contents.
table = "workspace.default.silver_2023_2024"

silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable(table)

print(f"✅ Tabla e ingestión completada: {table}")