# Bronze to Silver: Data Cleansing and Transformation

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pyspark.sql.functions as F

# Catalog that prefixes every table name read or written in this notebook.
catalog_name = "ecommerce"

In [0]:
# Load the bronze order-items table and render it for a first look.
bronze_table = f"{catalog_name}.bronze.brz_order_items"
df = spark.read.table(bronze_table)
display(df)

In [0]:
# Check for duplicates on the composite key (order_id, item_seq).
# Grouping on order_id alone would list every order that has more than one
# row, so item_seq must be part of the grouping key to surface true duplicates.

df.groupBy("order_id", "item_seq").count().filter(F.col("count") > 1).show()

In [0]:
# Keep a single row per (order_id, item_seq) combination.
dedup_keys = ['order_id', 'item_seq']
df = df.dropDuplicates(dedup_keys)

In [0]:
# Inspect the distinct raw values of quantity before cleaning.
df.select("quantity").distinct().show()

# Replace the spelled-out value "Two" with 2, leave every other value as is,
# then cast the resulting column to integer.
is_spelled_two = F.col("quantity") == "Two"
quantity_fixed = F.when(is_spelled_two, 2).otherwise(F.col("quantity"))
df = df.withColumn("quantity", quantity_fixed.cast("int"))

# Inspect the distinct values again after the conversion.
df.select("quantity").distinct().show()

In [0]:
# Transformation: strip '$' and any other non-numeric characters from
# unit_price, keeping only digits, the decimal point and a minus sign, then
# cast to double.
# The character class is the same one used for tax_amount so both money
# columns are cleaned consistently; removing only '$' would leave values such
# as "1,299.00" or "USD 5" unparseable by the cast.

df = df.withColumn(
    "unit_price",
    F.regexp_replace("unit_price", r"[^0-9.\-]", "").cast("double")
)

# Transformation: drop the '%' sign from discount_pct and store it as double.

discount_without_sign = F.regexp_replace("discount_pct", "[%]", "")
df = df.withColumn("discount_pct", discount_without_sign.cast("double"))

# Optional preview of the two cleaned numeric columns:
# df.select('unit_price', 'discount_pct').show(5)

## Transformation: coupon code processing (trim whitespace, then lower-case)
coupon_trimmed = F.trim(F.col("coupon_code"))
df = df.withColumn("coupon_code", F.lower(coupon_trimmed))
# Optional preview:
# df.select('coupon_code').show(5)

## Transformation: channel processing
# Rename the short codes "web" and "app" to display labels; any other value
# is passed through unchanged.
channel_raw = F.col("channel")
channel_label = (
    F.when(channel_raw == "web", "Website")
    .when(channel_raw == "app", "Mobile")
    .otherwise(channel_raw)
)
df = df.withColumn("channel", channel_label)

df.select("channel").show(5)

In [0]:
# Transformation: datatype conversions
# 1) Convert dt (string → date) using the yyyy-MM-dd layout
dt_as_date = F.to_date(F.col("dt"), "yyyy-MM-dd")
df = df.withColumn("dt", dt_as_date)

# 2) Convert order_ts (string → timestamp)
# Two layouts are tried in order (with and without seconds); coalesce keeps
# the first one that yields a non-null timestamp.
# In Spark datetime patterns 'MM' is month-of-year and 'mm' is minute-of-hour,
# so the date part must be written 'yyyy-MM-dd' in both patterns.
df = df.withColumn(
    "order_ts",
    F.coalesce(
        F.to_timestamp("order_ts", "yyyy-MM-dd HH:mm:ss"),
        F.to_timestamp("order_ts", "yyyy-MM-dd HH:mm")
    )
)

# 3) Convert item_seq (string → integer)
item_seq_as_int = F.col("item_seq").cast("int")
df = df.withColumn("item_seq", item_seq_as_int)

# 4) Convert tax_amount (string → double)
# Everything except digits, '.' and '-' is removed before the cast.
tax_digits_only = F.regexp_replace("tax_amount", r"[^0-9.\-]", "")
df = df.withColumn("tax_amount", tax_digits_only.cast("double"))

# 5) Transformation: stamp each row with the processing timestamp
processing_ts = F.current_timestamp()
df = df.withColumn("processed_time", processing_ts)

# Preview the converted columns (up to 100 rows).
preview_columns = ["item_seq", "dt", "order_ts", "tax_amount", "processed_time"]
df.select(*preview_columns).show(100)

In [0]:
# Print the column names and data types of the transformed DataFrame.
df.printSchema()

In [0]:
# Persist the cleaned DataFrame as the silver Delta table, replacing any
# existing contents; the mergeSchema option is passed to the writer.
silver_table = f"{catalog_name}.silver.slv_order_items"
silver_writer = (
    df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
)
silver_writer.saveAsTable(silver_table)