In [0]:
from pyspark.sql import functions as F

# Database (schema) that holds both the bronze input tables read by this
# script and the silver output tables it writes.
TARGET_DB = "bakehouse_jobs"

# ========== CUSTOMERS ==========

# Raw customer records as stored in the bronze layer.
bronze_customers = spark.table(f"{TARGET_DB}.bronze_customers")

# Build the silver frame one column at a time. The name columns are
# title-cased before full_name is derived, so full_name is composed from the
# cleaned values rather than the raw ones.
silver_customers = bronze_customers.withColumn(
    "customer_id", F.col("customerID").cast("long")
)
silver_customers = silver_customers.withColumn(
    "first_name", F.initcap("first_name")
)
silver_customers = silver_customers.withColumn(
    "last_name", F.initcap("last_name")
)
silver_customers = silver_customers.withColumn(
    "full_name", F.concat_ws(" ", F.col("first_name"), F.col("last_name"))
)
# Lower-case the e-mail address and strip every whitespace run from the phone.
silver_customers = silver_customers.withColumn(
    "email", F.lower(F.col("email_address"))
)
silver_customers = silver_customers.withColumn(
    "phone", F.regexp_replace(F.col("phone_number"), r"\s+", "")
)
# Example of filtering out invalid records: keep only rows that have both a
# customer_id and an email.
silver_customers = silver_customers.where(
    F.col("customer_id").isNotNull()
).where(
    F.col("email").isNotNull()
)

# Replace the contents of the silver customers Delta table with this result.
(
    silver_customers.write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{TARGET_DB}.silver_customers")
)


# ========== FRANCHISES ==========

# Raw franchise records as stored in the bronze layer.
bronze_franchises = spark.table(f"{TARGET_DB}.bronze_franchises")

# Add snake_case, explicitly typed columns: a long id, a copy of the name,
# a string version of the zipcode, and coordinates cast to double in place.
silver_franchises = bronze_franchises.withColumn(
    "franchise_id", F.col("franchiseID").cast("long")
)
silver_franchises = silver_franchises.withColumn("franchise_name", F.col("name"))
silver_franchises = silver_franchises.withColumn(
    "zipcode_str", F.col("zipcode").cast("string")
)
silver_franchises = silver_franchises.withColumn(
    "longitude", F.col("longitude").cast("double")
)
silver_franchises = silver_franchises.withColumn(
    "latitude", F.col("latitude").cast("double")
)

# Replace the contents of the silver franchises Delta table with this result.
(
    silver_franchises.write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{TARGET_DB}.silver_franchises")
)


# ========== SUPPLIERS ==========

# Raw supplier records as stored in the bronze layer.
bronze_suppliers = spark.table(f"{TARGET_DB}.bronze_suppliers")

# Add a long supplier_id, a copy of the name, and a boolean approved_flag.
# approved_flag is True only when `approved` upper-cases to "Y"; every other
# value, including null, falls through to False.
silver_suppliers = bronze_suppliers.withColumn(
    "supplier_id", F.col("supplierID").cast("long")
)
silver_suppliers = silver_suppliers.withColumn("supplier_name", F.col("name"))
silver_suppliers = silver_suppliers.withColumn(
    "approved_flag",
    F.when(F.upper("approved") == F.lit("Y"), True).otherwise(False),
)

# Replace the contents of the silver suppliers Delta table with this result.
(
    silver_suppliers.write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{TARGET_DB}.silver_suppliers")
)


# ========== SALES TRANSACTIONS ==========

# Raw sales transactions as stored in the bronze layer.
bronze_tx = spark.table(f"{TARGET_DB}.bronze_sales_transactions")

# Typed, snake_case identifier columns.
silver_tx = bronze_tx.withColumn(
    "transaction_id", F.col("transactionID").cast("long")
)
silver_tx = silver_tx.withColumn("customer_id", F.col("customerID").cast("long"))
silver_tx = silver_tx.withColumn("franchise_id", F.col("franchiseID").cast("long"))

# Split the dateTime column into a date and an "HH:mm:ss" time string.
silver_tx = silver_tx.withColumn("date", F.to_date(F.col("dateTime")))
silver_tx = silver_tx.withColumn(
    "time", F.date_format(F.col("dateTime"), "HH:mm:ss")
)

# SHA-256 digest of the card number rendered as a string.
# NOTE(review): the original cardNumber column is not dropped here, so it is
# still written to the silver table next to card_hash — confirm this is intended.
silver_tx = silver_tx.withColumn(
    "card_hash", F.sha2(F.col("cardNumber").cast("string"), 256)
)

# Keep only rows with a positive quantity and a non-negative unit price.
silver_tx = silver_tx.where((F.col("quantity") > 0) & (F.col("unitPrice") >= 0))

# Replace the contents of the silver sales transactions Delta table.
(
    silver_tx.write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{TARGET_DB}.silver_sales_transactions")
)


# ========== REVIEWS CHUNKED ==========

# Raw chunked review text as stored in the bronze layer.
bronze_reviews = spark.table(f"{TARGET_DB}.bronze_reviews_chunked")

# Add a long franchise_id and a date-typed copy of review_date.
silver_reviews = bronze_reviews.withColumn(
    "franchise_id", F.col("franchiseID").cast("long")
)
silver_reviews = silver_reviews.withColumn(
    "review_date_date", F.to_date(F.col("review_date"))
)

# Normalise the text in place: collapse each whitespace run into one space,
# then trim the ends.
silver_reviews = silver_reviews.withColumn(
    "chunked_text",
    F.trim(F.regexp_replace(F.col("chunked_text"), r"\s+", " ")),
)

# Discard chunks that are null or empty after normalisation.
silver_reviews = silver_reviews.where(
    F.col("chunked_text").isNotNull()
).where(
    F.col("chunked_text") != ""
)

# Replace the contents of the silver reviews Delta table with this result.
(
    silver_reviews.write.format("delta")
    .mode("overwrite")
    .saveAsTable(f"{TARGET_DB}.silver_reviews_chunked")
)