In [0]:
# Import required libraries
from pyspark.sql.functions import *

In [0]:
# Load the sellers Delta table from the bronze layer into a DataFrame
df_seller = (
    spark.read
    .format("delta")
    .load("abfss://olist-data@retailds.dfs.core.windows.net/bronze/sellers")
)

In [0]:
# Print the DataFrame's schema as a tree (column names, data types, nullability)
# so the structure of the loaded data can be checked before cleaning.
df_seller.printSchema()

In [0]:
# Render the DataFrame in the notebook for visual inspection.
# NOTE(review): `.display()` is not part of open-source PySpark; presumably this
# is the Databricks notebook helper — confirm the target runtime provides it.
df_seller.display()

In [0]:
# Count the rows in the DataFrame as loaded, before any of the cleaning steps
# below; `count()` is an action, so this runs a Spark job over the data.
df_seller.count()

In [0]:
# Null audit: one aggregate per column giving the number of NULL values in it.
# `when(...)` without `otherwise()` yields NULL for rows that do not match, and
# `count()` skips NULLs, so each aggregate counts only the rows where the column
# itself is NULL. All columns are evaluated in a single select.
null_count_exprs = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df_seller.columns
]
df_seller.select(null_count_exprs).display()

In [0]:
# Discard rows that have no seller_id; NULLs in any other column are left as-is
df_seller = df_seller.na.drop(subset=["seller_id"])

In [0]:
# Keep exactly one row per seller_id.
# NOTE(review): no ordering is applied first, so presumably the row that
# survives among duplicates is arbitrary — confirm that is acceptable.
df_seller = df_seller.drop_duplicates(subset=["seller_id"])

In [0]:
# Number of distinct seller_id values in the DataFrame at this point
(
    df_seller
    .select("seller_id")
    .distinct()
    .count()
)

In [0]:
# Standardise the location columns: strip surrounding whitespace, then
# lower-case seller_city and upper-case seller_state.
df_seller = (
    df_seller
    .withColumn("seller_city", lower(trim(col("seller_city"))))
    .withColumn("seller_state", upper(trim(col("seller_state"))))
)

In [0]:
# Persist the cleaned sellers DataFrame to the silver layer in Delta format.
# Overwrite mode replaces whatever is already stored at the target path, and
# overwriteSchema lets the stored schema be replaced along with the data.
(
    df_seller.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save("abfss://olist-data@retailds.dfs.core.windows.net/silver/sellers")
)