In [0]:
bronze_path = "/Volumes/workspace/default/bronze/"
silver_path = "/Volumes/workspace/default/silver/"

from pyspark.sql.functions import *
from pyspark.sql.types import *

# Remove Accents from the Products dataset, geolocation_city
# install unidecode
%pip install unidecode==1.3.6

# Define the UDF
remove_accents_udf = udf(lambda s: unidecode(s) if s else None, StringType())

from unidecode import unidecode
from pyspark.sql.functions import udf, current_timestamp
from pyspark.sql.types import StringType

#dbutils.library.restartPython()

In [0]:
# --------------------------------------------
# Product category: bronze -> silver
# --------------------------------------------
df_product_category = (
    spark.read
    .format("delta")
    .load(bronze_path + "product_category")
)

# Optional display of the rows as read from bronze
df_product_category.display()

# Add the processed_at column (current_timestamp()). The products cell reuses
# this DataFrame, including this column, for its join.
df_product_category = df_product_category.withColumn(
    "processed_at", current_timestamp()
)

# Overwrite the silver copy in Delta format
(
    df_product_category.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "product_category")
)

In [0]:
# ----------------------------------------------------
# Clean and transform the Products dataset (it contains null values) and
# replace the source category name with its English translation.
# Requires `df_product_category` from the product-category cell.
# ----------------------------------------------------
from pyspark.sql.functions import broadcast, coalesce, col, current_timestamp, when

df_products = spark.read.format("delta").load(bronze_path + "products")

# Handle nulls: missing categories become "N/A", missing numeric attributes 0.
# NOTE(review): the raw Olist CSV spells two of these columns "..._lenght";
# confirm the bronze column names match the keys used here.
df_products = df_products.fillna({
    "product_category_name": "N/A",
    "product_name_length": 0,
    "product_description_length": 0,
    "product_photos_qty": 0
})

# Left join so every product is kept even when its category has no
# translation; the translation table is given a broadcast hint.
df_joined = df_products.join(
    broadcast(df_product_category),
    on="product_category_name",
    how="left",
)

# Resolve the final category name while the source key is still available.
# Previously the two manual mappings below were tested against the English
# column after the source key had been dropped. A row whose join found no
# translation has NULL there, so the mappings could never apply to it and the
# product was labelled "N/A". Comparing against coalesce(english, source)
# keeps the old behaviour whenever a translation exists and additionally
# covers rows where the join found none.
category_pt = col("product_category_name")
category_en = col("product_category_name_english")
category_key = coalesce(category_en, category_pt)

resolved_category = (
    when(category_key == "pc_gamer", "gaming_pc")
    .when(
        category_key == "portateis_cozinha_e_preparadores_de_alimentos",
        "portable_kitchen_and_food_preparators",
    )
    .when(category_en.isNull(), "N/A")
    .otherwise(category_en)
)

# Same column order as before: drop the category table's created_at, write
# the resolved name into the English column in place, drop the source key,
# then rename the English column to product_category_name.
df_final = (
    df_joined
    .drop(df_product_category["created_at"])
    .withColumn("product_category_name_english", resolved_category)
    .drop("product_category_name")
    .withColumnRenamed("product_category_name_english", "product_category_name")
)

# The join carried over the category table's processed_at; withColumn
# replaces it with this cell's own timestamp.
df_products = df_final.withColumn("processed_at", current_timestamp())

# Write to Silver
(
    df_products.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "products")
)

#df_products.display()

In [0]:

# --------------------------------------------
# Customers: bronze -> silver
# --------------------------------------------
df_customer = (
    spark.read
    .format("delta")
    .load(bronze_path + "customers")
)

# Optional display
#df_customer.display()

# Add the processed_at column (current_timestamp())
df_customer = df_customer.withColumn("processed_at", current_timestamp())

# Overwrite the silver copy in Delta format
(
    df_customer.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "customers")
)

In [0]:
# --------------------------------------------
# Sellers: bronze -> silver
# --------------------------------------------
df_seller = (
    spark.read
    .format("delta")
    .load(bronze_path + "sellers")
)

# Optional display
#df_seller.display()

# Add the processed_at column (current_timestamp())
df_seller = df_seller.withColumn("processed_at", current_timestamp())

# Overwrite the silver copy in Delta format
(
    df_seller.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "sellers")
)

In [0]:
# --------------------------------------------
# Orders: bronze -> silver
# --------------------------------------------
df_orders = (
    spark.read
    .format("delta")
    .load(bronze_path + "orders")
)

# Optional display of the rows as read from bronze
df_orders.display()

# Add the processed_at column (current_timestamp())
df_orders = df_orders.withColumn("processed_at", current_timestamp())

# Overwrite the silver copy in Delta format
(
    df_orders.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "orders")
)

In [0]:
# --------------------------------------------
# Order reviews: bronze -> silver
# --------------------------------------------
df_orders_reviews = (
    spark.read
    .format("delta")
    .load(bronze_path + "orders_reviews")
)

# Run both free-text review columns through the accent-removal UDF,
# title first and then message, overwriting each column in place.
for text_column in ("review_comment_title", "review_comment_message"):
    df_orders_reviews = df_orders_reviews.withColumn(
        text_column, remove_accents_udf(col(text_column))
    )

# Optional display of the cleaned rows
df_orders_reviews.display()

# Add the processed_at column (current_timestamp())
df_orders_reviews = df_orders_reviews.withColumn(
    "processed_at", current_timestamp()
)

# Overwrite the silver copy in Delta format
(
    df_orders_reviews.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "orders_reviews")
)

In [0]:
# --------------------------------------------
# Order items: bronze -> silver
# --------------------------------------------
df_orders_items = (
    spark.read
    .format("delta")
    .load(bronze_path + "orders_items")
)

# Optional display
#df_orders_items.display()

# Add the processed_at column (current_timestamp())
df_orders_items = df_orders_items.withColumn(
    "processed_at", current_timestamp()
)

# Overwrite the silver copy in Delta format
(
    df_orders_items.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "orders_items")
)

In [0]:
# --------------------------------------------
# Order payments: bronze -> silver
# --------------------------------------------
df_order_payments = (
    spark.read
    .format("delta")
    .load(bronze_path + "order_payments")
)

# Optional display
#df_order_payments.display()

# Add the processed_at column (current_timestamp())
df_order_payments = df_order_payments.withColumn(
    "processed_at", current_timestamp()
)

# Overwrite the silver copy in Delta format
(
    df_order_payments.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "order_payments")
)

In [0]:
# --------------------------------------------
# Geolocation: bronze -> silver
# --------------------------------------------
df_geolocation = (
    spark.read
    .format("delta")
    .load(bronze_path + "geolocation")
)

# Run geolocation_city through the accent-removal UDF, overwriting it in place
city_without_accents = remove_accents_udf(col("geolocation_city"))
df_geolocation = df_geolocation.withColumn("geolocation_city", city_without_accents)

# Add the processed_at column (current_timestamp())
df_geolocation = df_geolocation.withColumn("processed_at", current_timestamp())

# Optional display
#df_geolocation.select("geolocation_city").distinct().display()

# Overwrite the silver copy in Delta format
(
    df_geolocation.write
    .format("delta")
    .mode("overwrite")
    .save(silver_path + "geolocation")
)