In [0]:
#----------------------------------------------------------------------
# Storage locations of the data layers (Volumes paths)
#----------------------------------------------------------------------
# Both layers live under the same parent folder and differ only in the
# final path segment; each path keeps its trailing slash so table names
# can be appended directly.
bronze_path, silver_path = (
    f"/Volumes/workspace/default/{layer}/" for layer in ("bronze", "silver")
)

#----------------------------------------------------------------------
# Import required functions and libraries
#----------------------------------------------------------------------
from pyspark.sql.functions import udf, current_timestamp, col, broadcast, when
from pyspark.sql.types import StringType
from unidecode import unidecode

def _strip_accents(text):
    """Return `text` transliterated with unidecode; a None input stays None."""
    # Compare against None explicitly: a plain truthiness test would also
    # turn empty strings into NULLs and silently change the data.
    return unidecode(text) if text is not None else None


# Spark UDF that removes accents from string columns (e.g., for cleaning text fields)
remove_accents_udf = udf(_strip_accents, StringType())


In [0]:
# --------------------------------------------------------------------------------------------
# Product category translations: Bronze -> Silver, with a processing timestamp
# --------------------------------------------------------------------------------------------

# Load the Bronze Delta table and add a processed_at column (current_timestamp)
# for audit lineage.
df_product_category = (
    spark.read.format("delta")
    .load(f"{bronze_path}product_category")
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_product_category.display()

# Overwrite the Silver Delta table with the stamped data
df_product_category.write.save(
    f"{silver_path}product_category",
    format="delta",
    mode="overwrite",
)
    

In [0]:
# ----------------------------------------------------------------------------------------
# Clean and Transform Products Dataset → Promote to Silver
# ----------------------------------------------------------------------------------------

# 1. Read Bronze Delta table
df_products = spark.read.format("delta").load(bronze_path + "products")

# 2. Fill nulls with defaults for known columns
df_products = df_products.fillna({
    "product_category_name": "N/A",
    "product_name_length": 0,
    "product_description_length": 0,
    "product_photos_qty": 0
})

# 3. Left join with the category translation DataFrame (df_product_category, built in
#    the product category cell); the translation side carries a broadcast hint.
df_joined = df_products.join(
    broadcast(df_product_category),
    on="product_category_name",
    how="left",
)

# 4. Resolve the English category name BEFORE the original column is dropped.
#    Rows whose category has no match in the translation table come out of the left
#    join with a NULL English name, so the manual replacements below have to be tested
#    against the original column; testing only the English column can never match
#    those rows and they would all collapse to "N/A".
#    (Presumably these two categories are absent from the translation table - confirm.)
#    The English column is checked as well so a translation row that passes the
#    source name through unchanged is still normalized.
original_name = col("product_category_name")
english_name = col("product_category_name_english")

category_english = (
    when((original_name == "pc_gamer") | (english_name == "pc_gamer"), "gaming_pc")
    .when(
        (original_name == "portateis_cozinha_e_preparadores_de_alimentos")
        | (english_name == "portateis_cozinha_e_preparadores_de_alimentos"),
        "portable_kitchen_and_food_preparators",
    )
    .when(english_name.isNull(), "N/A")
    .otherwise(english_name)
)

# 5. Clean up columns:
#    - Drop the translation table's created_at
#    - Replace the English name with its normalized value
#    - Drop the original (non-English) category
#    - Rename the English name to the unified column name
df_final = (
    df_joined
    .drop(df_product_category["created_at"])
    .withColumn("product_category_name_english", category_english)
    .drop("product_category_name")
    .withColumnRenamed("product_category_name_english", "product_category_name")
)

df_products = df_final

# 6. Add processed timestamp for audit trail
#    (this replaces the processed_at column carried over from the translation table)
df_products = df_products.withColumn("processed_at", current_timestamp())

# 7. Write cleaned data to Silver Delta table
df_products.write \
    .format("delta") \
    .mode("overwrite") \
    .save(silver_path + "products")

# Optional display
# df_products.display()

In [0]:
#--------------------------------------------------------------------------------------------
# Customers: Bronze -> Silver, with a processing timestamp
#--------------------------------------------------------------------------------------------
# Load the Bronze Delta table and add a processed_at column (current_timestamp)
# for the audit trail.
df_customer = (
    spark.read.format("delta")
    .load(f"{bronze_path}customers")
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_customer.display()

# Overwrite the Silver Delta table
df_customer.write.save(
    f"{silver_path}customers",
    format="delta",
    mode="overwrite",
)

In [0]:
#--------------------------------------------------------------------------------------------
# Sellers: Bronze -> Silver, with a processing timestamp
#--------------------------------------------------------------------------------------------

# Load the Bronze Delta table and add a processed_at column (current_timestamp)
# for the audit trail.
df_seller = (
    spark.read.format("delta")
    .load(f"{bronze_path}sellers")
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_seller.display()

# Overwrite the Silver Delta table
df_seller.write.save(
    f"{silver_path}sellers",
    format="delta",
    mode="overwrite",
)
    

In [0]:
#--------------------------------------------------------------------------------------------
# Orders: Bronze -> Silver, with a processing timestamp
#--------------------------------------------------------------------------------------------
# Load the Bronze Delta table and add a processed_at column (current_timestamp)
# for the audit trail.
df_orders = (
    spark.read.format("delta")
    .load(f"{bronze_path}orders")
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_orders.display()

# Overwrite the Silver Delta table
df_orders.write.save(
    f"{silver_path}orders",
    format="delta",
    mode="overwrite",
)
    

In [0]:
# --------------------------------------------------------------------------------------------
# Order Reviews: strip accents from the text fields, then promote to Silver
# --------------------------------------------------------------------------------------------
# Read the Bronze Delta table, pass both review text columns through
# remove_accents_udf (defined in the setup cell), and add a processed_at
# column (current_timestamp) for audit tracking.
df_orders_reviews = (
    spark.read.format("delta")
    .load(f"{bronze_path}orders_reviews")
    .withColumn("review_comment_title", remove_accents_udf(col("review_comment_title")))
    .withColumn("review_comment_message", remove_accents_udf(col("review_comment_message")))
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_orders_reviews.display()

# Overwrite the Silver Delta table; consider 'merge' or 'append' for production workloads
df_orders_reviews.write.save(
    f"{silver_path}orders_reviews",
    format="delta",
    mode="overwrite",
)


In [0]:
#--------------------------------------------------------------------------------------------
# Order items: Bronze -> Silver, with a processing timestamp
#--------------------------------------------------------------------------------------------
# Load the Bronze Delta table and add a processed_at column (current_timestamp)
# for the audit trail.
df_orders_items = (
    spark.read.format("delta")
    .load(f"{bronze_path}orders_items")
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_orders_items.display()

# Overwrite the Silver Delta table
df_orders_items.write.save(
    f"{silver_path}orders_items",
    format="delta",
    mode="overwrite",
)
    

In [0]:
#--------------------------------------------------------------------------------------------
# Order payments: Bronze -> Silver, with a processing timestamp
#--------------------------------------------------------------------------------------------
# Load the Bronze Delta table and add a processed_at column (current_timestamp)
# for the audit trail.
df_order_payments = (
    spark.read.format("delta")
    .load(f"{bronze_path}order_payments")
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_order_payments.display()

# Overwrite the Silver Delta table
df_order_payments.write.save(
    f"{silver_path}order_payments",
    format="delta",
    mode="overwrite",
)
    

In [0]:
#--------------------------------------------------------------------------------------------
# Geolocation: strip accents from city names, then promote Bronze -> Silver
#--------------------------------------------------------------------------------------------
# Read the Bronze Delta table, pass geolocation_city through remove_accents_udf
# (defined in the setup cell), and add a processed_at column (current_timestamp)
# for the audit trail.
df_geolocation = (
    spark.read.format("delta")
    .load(f"{bronze_path}geolocation")
    .withColumn("geolocation_city", remove_accents_udf(col("geolocation_city")))
    .withColumn("processed_at", current_timestamp())
)

# Optional display
# df_geolocation.select("geolocation_city").distinct().display()

# Overwrite the Silver Delta table
df_geolocation.write.save(
    f"{silver_path}geolocation",
    format="delta",
    mode="overwrite",
)