In [0]:
# Import required libraries
from pyspark.sql.functions import *

In [0]:
# Load the products table (Delta format) from the bronze layer
bronze_products_path = "abfss://olist-data@retailds.dfs.core.windows.net/bronze/products"
df_products = spark.read.load(bronze_products_path, format="delta")

In [0]:
# Print the schema of the products DataFrame loaded from the bronze layer,
# so the column names and types can be inspected before any cleaning is applied
df_products.printSchema()

In [0]:
# Render the products DataFrame in the notebook for visual inspection.
# NOTE(review): .display() is presumably provided by the notebook runtime
# rather than core PySpark — confirm before running this elsewhere.
df_products.display()

In [0]:
# Row count of the products DataFrame as loaded, i.e. before the null and
# duplicate product_id rows are dropped in the cells below
df_products.count()

In [0]:
# Null audit: build one aggregate per column that counts the rows where that
# column is null, then show all the counts side by side in a single row
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df_products.columns
]
df_products.select(null_count_exprs).display()

In [0]:
# Discard rows that have no product_id; rows with nulls in other columns are kept
df_products = df_products.na.drop(subset=["product_id"])

In [0]:
# Keep a single row per product_id.
# NOTE(review): no ordering is specified, so which of several duplicate rows
# survives is not controlled here — confirm that is acceptable.
dedup_keys = ["product_id"]
df_products = df_products.dropDuplicates(subset=dedup_keys)

In [0]:
# Number of unique product_id values remaining in the DataFrame
distinct_product_ids = df_products.select("product_id").distinct()
distinct_product_ids.count()

In [0]:
# Normalize product_category_name and fill in missing categories.
# The value is trimmed and lower-cased first. A category that is null, or that
# is empty once the surrounding spaces are trimmed, is replaced with "unknown"
# so that blank strings do not end up as a separate "" category.
category_normalized = lower(trim(col("product_category_name")))
df_products = df_products.withColumn(
    "product_category_name",
    when(
        # lower/trim of a null stays null, so this also covers the null case
        category_normalized.isNull() | (category_normalized == ""),
        "unknown",
    ).otherwise(category_normalized),
)

In [0]:
# Cast the numeric product attributes to int, one column at a time and in the
# order listed. Each column is replaced in place, so column order is unchanged.
# NOTE(review): the "lenght" spelling presumably matches the bronze schema;
# it must stay as-is unless the columns are renamed upstream.
int_column_names = [
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]
for int_column_name in int_column_names:
    df_products = df_products.withColumn(
        int_column_name, col(int_column_name).cast("int")
    )


In [0]:
# Persist the cleaned products DataFrame to the silver layer in Delta format.
# Overwrite mode together with overwriteSchema replaces both the existing data
# and the existing schema at the target path.
(
    df_products.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://olist-data@retailds.dfs.core.windows.net/silver/products")
)