In [0]:
# Módulos necesarios
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import col

In [0]:
# =============================
# Step 1: read the Bronze layer
# =============================
# Source table for every transformation that follows in this cell.
df_bronze = spark.read.table("workspace.default.bronze_market_data")

def remove_suffixex(company_col):
    """Strip corporate-form suffixes from a company-name column.

    Removes "S.A. DE C.V.", "S.A.P.I DE C.V." and spelling variants in which
    the dots and spaces are optional, then drops a trailing comma and any
    surrounding whitespace. Matching is case-sensitive.

    Args:
        company_col: Column expression holding the company name.

    Returns:
        Column expression with the cleaned company name.
    """
    # The leading \b stops the pattern from starting in the middle of a word;
    # without it a name such as "MESA DE C.V." would be truncated to "ME".
    suffix_pattern = r"\bS\.?\s*A\.?\s*(P\.?\s*I\.?)?\s*DE\s*C\.?\s*V\.?"
    without_suffix = F.regexp_replace(company_col, suffix_pattern, "")
    # Removing the suffix from "ACME, S.A. DE C.V." leaves a dangling comma.
    without_comma = F.regexp_replace(without_suffix, r",\s*$", "")
    return F.trim(without_comma)

# -----------------------------
# 2) Bronze -> Silver transformations
# -----------------------------

# Build the composite key "clave" by joining GPO, GEN, ESP and DIF with '.'.
# NOTE(review): concat_ws skips NULL inputs, so a row missing one component
# gets a shorter key - confirm these four columns are never NULL.
df_bronze = df_bronze.withColumn(
    "clave",
    F.concat_ws(".", F.col("GPO"), F.col("GEN"), F.col("ESP"), F.col("DIF"))
)

# Drop the key components (now folded into "clave") together with VAR.
df_bronze = df_bronze.drop("GPO", "GEN", "ESP", "DIF", "VAR")

# Strip all whitespace from the contract number before it is used as a
# deduplication key below.
df_bronze = df_bronze.withColumn(
    "NO_CONTRATO",
    F.regexp_replace(F.col("NO_CONTRATO"), r"[\s\t]+", "").cast("string")
)

# Keep one row per (NO_CONTRATO, clave): the row with the greatest FILE_DATE,
# ties broken by the greatest SOLICITADO. The window defines its own ordering,
# so the frame is not sorted globally beforehand.
# NOTE(review): FILE_DATE is compared here in its raw, pre-to_date form;
# presumably an ISO-like timestamp string, so the order is chronological -
# TODO confirm. Rows tying on both columns are picked arbitrarily.
w = Window.partitionBy("NO_CONTRATO", "clave").orderBy(F.col("FILE_DATE").desc(), F.col("SOLICITADO").desc())
df_bronze = df_bronze.withColumn("rn", F.row_number().over(w)).filter(F.col("rn") == 1).drop("rn")

# Convert the date columns to DATE in a single pass. to_date without a format
# casts the value, which also covers timestamp-like strings in FILE_DATE
# (the time-of-day part is discarded).
dates_columns = ["FECHA_INICIO", "FECHA_TERMINACION", "FILE_DATE"]
for date_col in dates_columns:
    df_bronze = df_bronze.withColumn(date_col, F.to_date(F.col(date_col)))

# Clean the company names (drop the corporate-form suffix).
df_bronze = df_bronze.withColumn("RAZON_SOCIAL", remove_suffixex(F.col("RAZON_SOCIAL")))

# Convert the numeric columns to double.
num_columns = ["CANT_MAX", "PRECIO_NETO"]
for num_col in num_columns:
    df_bronze = df_bronze.withColumn(num_col, F.col(num_col).cast("double"))

# Lower-case (and trim) every column header for the Silver layer.
df_silver = df_bronze.toDF(*[c.lower().strip() for c in df_bronze.columns])

df_silver.display()



In [0]:
# Persist the Silver frame as a Delta table, replacing any existing data.
table = "workspace.default.silver_market_data"
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    # Let the write merge the frame's schema into the existing table schema.
    .option("mergeSchema", "true")
    .saveAsTable(table)
)

# Confirmation message once the write has returned.
print(f"✅ Tabla e ingestión completada: {table}")