# Silver Layer
This notebook cleanses and standardizes dimensional data from the Bronze layer, addressing formatting issues, trimming whitespace, and handling data anomalies.

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, DateType, TimestampType, FloatType

catalog_name = 'ecommerce'

## Brands

In [0]:
df_bronze = spark.table(f"{catalog_name}.bronze.brz_brands")
df_bronze.show(10)

+----------+-----------+-------------+--------------------+--------------------+
|brand_code| brand_name|category_code|        _source_file|         ingested_at|
+----------+-----------+-------------+--------------------+--------------------+
|      ACME|   AcmeTech|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      NOVW|  NovaWave |           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      ZNTH|     Zenith|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      BYTM|    ByteMax|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      ECOT|    EcoTone|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      SKYL|    SkyLink|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|     VOLT@|   VoltEdge|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      PHTX|   Photonix|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      URTL| UrbanTrail|          APP|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      COTC| CottonClub|    

In [0]:
df_silver = df_bronze.withColumn("brand_name", F.trim(F.col("brand_name")))

df_silver.show(10)

+----------+----------+-------------+--------------------+--------------------+
|brand_code|brand_name|category_code|        _source_file|         ingested_at|
+----------+----------+-------------+--------------------+--------------------+
|      ACME|  AcmeTech|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      NOVW|  NovaWave|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      ZNTH|    Zenith|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      BYTM|   ByteMax|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      ECOT|   EcoTone|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      SKYL|   SkyLink|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|     VOLT@|  VoltEdge|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      PHTX|  Photonix|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      URTL|UrbanTrail|          APP|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      COTC|CottonClub|          APP|dbf

In [0]:
# Brand code should have only alphanumeric characters
df_silver = df_silver.withColumn("brand_code", F.regexp_replace(F.col("brand_code"), r'[^A-Za-z0-9]', ''))
df_silver.show(10)

+----------+----------+-------------+--------------------+--------------------+
|brand_code|brand_name|category_code|        _source_file|         ingested_at|
+----------+----------+-------------+--------------------+--------------------+
|      ACME|  AcmeTech|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      NOVW|  NovaWave|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      ZNTH|    Zenith|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      BYTM|   ByteMax|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      ECOT|   EcoTone|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      SKYL|   SkyLink|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      VOLT|  VoltEdge|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      PHTX|  Photonix|           CE|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      URTL|UrbanTrail|          APP|dbfs:/Volumes/eco...|2026-01-19 12:26:...|
|      COTC|CottonClub|          APP|dbf

In [0]:
df_silver.select("category_code").distinct().show()

+-------------+
|category_code|
+-------------+
|           CE|
|          APP|
|          HNK|
|          BPC|
|        BOOKS|
|          BKS|
|      GROCERY|
|         GRCY|
|          TOY|
|         TOYS|
|          SPT|
+-------------+



In [0]:
# Anomalies dictionary
anomalies = {
    "GROCERY": "GRCY",
    "BOOKS": "BKS",
    "TOYS": "TOY"
}

df_silver = df_silver.replace(anomalies, subset="category_code")

df_silver.select("category_code").distinct().show()

+-------------+
|category_code|
+-------------+
|           CE|
|          APP|
|          HNK|
|          BPC|
|          BKS|
|         GRCY|
|          TOY|
|          SPT|
+-------------+



In [0]:
df_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable(f"{catalog_name}.silver.slv_brands")

## Category

In [0]:
df_bronze = spark.table(f"{catalog_name}.bronze.brz_category")
df_bronze.show()

+-------------+--------------------+--------------------+--------------------+
|category_code|       category_name|        _ingested_at|        _source_file|
+-------------+--------------------+--------------------+--------------------+
|           ce|         Electronics|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          app|             Apparel|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          hnk|      Home & Kitchen|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          bpc|Beauty & Personal...|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          bks|               Books|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|         grcy|             Grocery|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          toy|        Toys & Games|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          spt|   Sports & Outdoors|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|          app|             Apparel|2026-01-19 12:29:...|dbfs:/Volumes/eco...|
|         grcy|             Grocery|2026-01-19 12:29

In [0]:
df_bronze.groupBy("category_code").count().filter(F.col("count") > 1).show()

+-------------+-----+
|category_code|count|
+-------------+-----+
|          app|    2|
|         grcy|    2|
+-------------+-----+



In [0]:
df_silver = df_bronze.dropDuplicates(["category_code"])
display(df_silver)

category_code,category_name,_ingested_at,_source_file
ce,Electronics,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
app,Apparel,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
hnk,Home & Kitchen,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
bpc,Beauty & Personal Care,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
bks,Books,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
grcy,Grocery,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
toy,Toys & Games,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
spt,Sports & Outdoors,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv


In [0]:
df_silver = df_silver.withColumn("category_code", F.upper(F.col("category_code")))
display(df_silver)

category_code,category_name,_ingested_at,_source_file
CE,Electronics,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
APP,Apparel,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
HNK,Home & Kitchen,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
BPC,Beauty & Personal Care,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
BKS,Books,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
GRCY,Grocery,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
TOY,Toys & Games,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv
SPT,Sports & Outdoors,2026-01-19T12:29:11.906Z,dbfs:/Volumes/ecommerce/source_data/raw/category/category.csv


In [0]:
df_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable(f"{catalog_name}.silver.slv_category")

## Products

In [0]:
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_products")

row_count, column_count = df_bronze.count(), len(df_bronze.columns)
print(f"row_count: {row_count}, column_count {column_count}")

row_count: 50000, column_count 14


In [0]:
display(df_brozne.limit(10))

product_id,sku,category_code,brand_code,color,size,material,weight_grams,length_cm,width_cm,height_cm,rating_count,file_name,ingest_timestamp
2000000000015,STCR-HNK-00001,hnk,stcr,White,One-Size,Coton,305g,222,17.1,6.3,0,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000022,HMNS-HNK-00002,hnk,hmns,Silver,One-Size,Steel,682g,182,12.3,3.7,1,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000039,NOVW-CE-00003,ce,novw,Purple,One-Size,Wood,243g,182,13.9,4.2,0,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000046,URTL-APP-00004,app,urtl,Silver,S,Ruber,225g,176,4.6,5.8,50,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000053,GGRN-GRC-00005,grcy,ggrn,Silver,One-Size,Ruber,455g,272,15.8,7.4,-4,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000060,SLKE-BPC-00006,bpc,slke,Purple,One-Size,Plastic,232g,280,13.8,6.1,0,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000077,VOLT-CE-00007,ce,volt,Blue,One-Size,Plastic,507g,272,12.1,6.4,5,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000084,CBLT-APP-00008,app,cblt,Blue,XS,Polyester,261g,277,8.5,7.0,0,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000091,ARFT-SPT-00009,spt,arft,Blue,XL,Plastic,59g,125,19.0,7.9,11,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z
2000000000107,MOSA-APP-0000A,app,mosa,White,L,Polyester,238g,107,17.7,10.3,6,dbfs:/Volumes/ecommerce/source_data/raw/products/products.csv,2026-01-19T12:34:38.090Z


In [0]:
# weight_grams column normalization
df_bronze.select("weight_grams").show(5, truncate=False)

+------------+
|weight_grams|
+------------+
|305g        |
|682g        |
|243g        |
|225g        |
|455g        |
+------------+
only showing top 5 rows


In [0]:
df_silver = df_bronze.withColumn("weight_grams", F.regexp_replace(F.col("weight_grams"), "g", "").cast(IntegerType()))
df_silver.select("weight_grams").show(5, truncate=False)

+------------+
|weight_grams|
+------------+
|305         |
|682         |
|243         |
|225         |
|455         |
+------------+
only showing top 5 rows


In [0]:
df_silver.printSchema()

root
 |-- product_id: string (nullable = true)
 |-- sku: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand_code: string (nullable = true)
 |-- color: string (nullable = true)
 |-- size: string (nullable = true)
 |-- material: string (nullable = true)
 |-- weight_grams: integer (nullable = true)
 |-- length_cm: string (nullable = true)
 |-- width_cm: float (nullable = true)
 |-- height_cm: float (nullable = true)
 |-- rating_count: integer (nullable = true)
 |-- file_name: string (nullable = true)
 |-- ingest_timestamp: timestamp (nullable = true)



In [0]:
df_silver.select("length_cm").show(5)

+---------+
|length_cm|
+---------+
|     22,2|
|     18,2|
|     18,2|
|     17,6|
|     27,2|
+---------+
only showing top 5 rows


In [0]:
df_silver = df_silver.withColumn("length_cm", F.regexp_replace(F.col("length_cm"), ",", ".").cast(FloatType()))

df_silver.select("length_cm").show(5)

+---------+
|length_cm|
+---------+
|     22.2|
|     18.2|
|     18.2|
|     17.6|
|     27.2|
+---------+
only showing top 5 rows


In [0]:
# category_code and brand_code to upper case
df_silver.select("category_code", "brand_code").show(5)

+-------------+----------+
|category_code|brand_code|
+-------------+----------+
|          hnk|      stcr|
|          hnk|      hmns|
|           ce|      novw|
|          app|      urtl|
|         grcy|      ggrn|
+-------------+----------+
only showing top 5 rows


In [0]:
df_silver = df_silver.withColumn("category_code", F.upper(F.col("category_code")))
df_silver = df_silver.withColumn("brand_code", F.upper(F.col("brand_code")))
df_silver.select("category_code", "brand_code").show(5)

+-------------+----------+
|category_code|brand_code|
+-------------+----------+
|          HNK|      STCR|
|          HNK|      HMNS|
|           CE|      NOVW|
|          APP|      URTL|
|         GRCY|      GGRN|
+-------------+----------+
only showing top 5 rows


In [0]:
df_silver.select("material").distinct().show()

+---------+
| material|
+---------+
|    Coton|
|    Steel|
|     Wood|
|    Ruber|
|  Plastic|
|Polyester|
|    Glass|
|  Alumium|
|    Paper|
|  Leather|
+---------+



In [0]:
# Fixing spelling mistakes
df_silver = df_silver.withColumn("material",
                                F.when(F.col("material") == "Coton", "cotton")
                                .when(F.col("material") == "Alumium", "Aluminium")
                                .when(F.col("material") == "Ruber", "Rubber")
                                .otherwise(F.col("material")))
df_silver.select("material").distinct().show()

+---------+
| material|
+---------+
|   cotton|
|    Steel|
|     Wood|
|   Rubber|
|  Plastic|
|Polyester|
|    Glass|
|Aluminium|
|    Paper|
|  Leather|
+---------+



In [0]:
df_silver.filter(F.col("rating_count") < 0).select("rating_count").show(5)

+------------+
|rating_count|
+------------+
|          -4|
|          -2|
|          -2|
|          -1|
|         -14|
+------------+
only showing top 5 rows


In [0]:
# rating_count should be a positive number
df_silver = df_silver.withColumn("rating_count", F.when(F.col("rating_count").isNotNull(), F.abs(F.col("rating_count")))
                .otherwise(F.lit(0)))
df_silver.filter(F.col("rating_count") < 0).select("rating_count").show(5)

+------------+
|rating_count|
+------------+
+------------+



In [0]:
# Checking the final cleaned data
df_silver.select("weight_grams", "length_cm", "category_code", "brand_code", "material", "rating_count").show(10, truncate=False)

+------------+---------+-------------+----------+---------+------------+
|weight_grams|length_cm|category_code|brand_code|material |rating_count|
+------------+---------+-------------+----------+---------+------------+
|305         |22.2     |HNK          |STCR      |cotton   |0           |
|682         |18.2     |HNK          |HMNS      |Steel    |1           |
|243         |18.2     |CE           |NOVW      |Wood     |0           |
|225         |17.6     |APP          |URTL      |Rubber   |50          |
|455         |27.2     |GRCY         |GGRN      |Rubber   |4           |
|232         |28.0     |BPC          |SLKE      |Plastic  |0           |
|507         |27.2     |CE           |VOLT      |Plastic  |5           |
|261         |27.7     |APP          |CBLT      |Polyester|0           |
|59          |12.5     |SPT          |ARFT      |Plastic  |11          |
|238         |10.7     |APP          |MOSA      |Polyester|6           |
+------------+---------+-------------+----------+--

In [0]:
df_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable(f"{catalog_name}.silver.slv_products")

## Customers

In [0]:
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_customers")
display(df_bronze.limit(10))

customer_id,phone,country_code,country,state,file_name,ingest_timestamp
CUST000000000001,917280033536.0,IN,India,MH,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000002,619489725433.0,AU,Australia,VIC,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000003,919390066524.0,IN,India,TN,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000004,917073741793.0,IN,India,TN,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000005,618478772532.0,AU,Australia,WA,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000006,916441718520.0,IN,India,GJ,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000007,,IN,India,MH,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000008,446806361276.0,GB,United Kingdom,ENG,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000009,18191801729.0,US,United States,MA,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z
CUST000000000010,,IN,India,RJ,dbfs:/Volumes/ecommerce/source_data/raw/customers/customers.csv,2026-01-19T12:36:56.777Z


In [0]:
row_count, column_count = df_bronze.count(), len(df_bronze.columns)
print(f"row_count {row_count}, column_count {column_count}")

row_count 300000, column_count 7


In [0]:
# Handling null values in customer_id
null_count = df_bronze.filter(F.col("customer_id").isNull()).count()
print(null_count)

300


In [0]:
df_bronze.filter(F.col("customer_id").isNull()).show(5)

+-----------+--------------+------------+--------------+-----+--------------------+--------------------+
|customer_id|         phone|country_code|       country|state|           file_name|    ingest_timestamp|
+-----------+--------------+------------+--------------+-----+--------------------+--------------------+
|       NULL|918187043562.0|          IN|         India|   DL|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|       NULL|917517243052.0|          IN|         India|   DL|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|       NULL|          NULL|          IN|         India|   GJ|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|       NULL|447220214605.0|          GB|United Kingdom|  WLS|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|       NULL|916996290632.0|          IN|         India|   UP|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
+-----------+--------------+------------+--------------+-----+--------------------+--------------------+
only showing top 5 rows


In [0]:
# Dropping rows where cutomer_id is null
df_silver = df_bronze.dropna(subset=["customer_id"])

print(f"Row count after dropping null values: {df_silver.count()}")

Row count after dropping null values: 299700


In [0]:
# Hanlding null values in phone
null_count = df_silver.filter(F.col("phone").isNull()).count()
print(f"Number of nulls in phone: {null_count}")

Number of nulls in phone: 29964


In [0]:
df_silver.filter(F.col("phone").isNull()).show(5)

+----------------+-----+------------+-------------+-----+--------------------+--------------------+
|     customer_id|phone|country_code|      country|state|           file_name|    ingest_timestamp|
+----------------+-----+------------+-------------+-----+--------------------+--------------------+
|CUST000000000007| NULL|          IN|        India|   MH|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|CUST000000000010| NULL|          IN|        India|   RJ|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|CUST000000000026| NULL|          IN|        India|   WB|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|CUST000000000032| NULL|          US|United States|   NJ|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
|CUST000000000070| NULL|          IN|        India|   TS|dbfs:/Volumes/eco...|2026-01-19 12:36:...|
+----------------+-----+------------+-------------+-----+--------------------+--------------------+
only showing top 5 rows


In [0]:
df_silver = df_silver.fillna("Not Avaliable", subset=["phone"])
df_silver.filter(F.col("phone").isNull()).show()

+-----------+-----+------------+-------+-----+---------+----------------+
|customer_id|phone|country_code|country|state|file_name|ingest_timestamp|
+-----------+-----+------------+-------+-----+---------+----------------+
+-----------+-----+------------+-------+-----+---------+----------------+



In [0]:
df_silver.write.format("delta") \
    .mode("overwrite") \
    .option("mergeSchema", "true") \
    .saveAsTable(f"{catalog_name}.silver.slv_customers")

## Calendar/Date

In [0]:
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_calendar")

display(df_bronze.limit(10))

date,year,day_name,quarter,week_of_year,_ingested_at,_source_file
01-08-2025,2025,friday,3,-31,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
02-08-2025,2025,SATURDAY,3,-31,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
03-08-2025,2025,SUNDAY,3,-31,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
04-08-2025,2025,MONDAY,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
05-08-2025,2025,TUESDAY,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
06-08-2025,2025,WEDNESDAY,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
07-08-2025,2025,thursday,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
08-08-2025,2025,friday,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
09-08-2025,2025,SATURDAY,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv
10-08-2025,2025,SUNDAY,3,-32,2026-01-19T12:40:29.359Z,dbfs:/Volumes/ecommerce/source_data/raw/date/date.csv


In [0]:
row_count, column_count = df_bronze.count(), len(df_bronze.columns)
print(f"row_count {row_count}, column_count {column_count}")

row_count 95, column_count 7


In [0]:
df_bronze.printSchema()

root
 |-- date: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- day_name: string (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- _ingested_at: timestamp (nullable = true)
 |-- _source_file: string (nullable = true)



In [0]:
from pyspark.sql.functions import to_date

df_silver = df_bronze.withColumn("date", to_date(df_bronze["date"], "dd-MM-yyyy"))

In [0]:
print(df_silver.printSchema())

root
 |-- date: date (nullable = true)
 |-- year: integer (nullable = true)
 |-- day_name: string (nullable = true)
 |-- quarter: integer (nullable = true)
 |-- week_of_year: integer (nullable = true)
 |-- _ingested_at: timestamp (nullable = true)
 |-- _source_file: string (nullable = true)

None


In [0]:
df_silver.show(5)

+----------+----+--------+-------+------------+--------------------+--------------------+
|      date|year|day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+--------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|  friday|      3|         -31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-02|2025|SATURDAY|      3|         -31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|  SUNDAY|      3|         -31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-04|2025|  MONDAY|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-05|2025| TUESDAY|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
+----------+----+--------+-------+------------+--------------------+--------------------+
only showing top 5 rows


In [0]:
duplicates = df_silver.groupBy('date').count().filter("count > 1")

print(f"Found {duplicates.count()} duplicate dates")
display(duplicates)

Found 3 duplicate dates


date,count
2025-08-29,2
2025-09-25,2
2025-10-13,2


In [0]:
df_silver = df_silver.dropDuplicates(['date'])

print("Rows after removing duplicates: ", df_silver.count())

Rows after removing duplicates:  92


In [0]:
# Capitalize first letter of each word in day_name
df_silver = df_silver.withColumn("day_name", F.initcap(F.col("day_name")))

df_silver.show(10)

+----------+----+---------+-------+------------+--------------------+--------------------+
|      date|year| day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+---------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|   Friday|      3|         -31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-02|2025| Saturday|      3|         -31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|   Sunday|      3|         -31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-04|2025|   Monday|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-05|2025|  Tuesday|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-06|2025|Wednesday|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-07|2025| Thursday|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-08|2025|   Friday|      3|         -32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|

In [0]:
# Convert negative week_of_year to positive
df_silver = df_silver.withColumn("week_of_year", F.abs(F.col("week_of_year")))

df_silver.show(10)

+----------+----+---------+-------+------------+--------------------+--------------------+
|      date|year| day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+---------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|   Friday|      3|          31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-02|2025| Saturday|      3|          31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|   Sunday|      3|          31|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-04|2025|   Monday|      3|          32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-05|2025|  Tuesday|      3|          32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-06|2025|Wednesday|      3|          32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-07|2025| Thursday|      3|          32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-08|2025|   Friday|      3|          32|2026-01-19 12:40:...|dbfs:/Volumes/eco...|

In [0]:
# Concatenating week_of_year with year and also quarter with year

df_silver = df_silver.withColumn("quarter", F.concat_ws("", F.concat(F.lit("Q"), F.col("quarter"), F.lit("-"), F.col("year"))))

df_silver = df_silver.withColumn("week_of_year", F.concat_ws("-", F.concat(F.lit("Week"), F.lit("-"), F.col("week_of_year"), F.lit("-"), F.col("year"))))

df_silver.show(10)

+----------+----+---------+-------+------------+--------------------+--------------------+
|      date|year| day_name|quarter|week_of_year|        _ingested_at|        _source_file|
+----------+----+---------+-------+------------+--------------------+--------------------+
|2025-08-01|2025|   Friday|Q3-2025|Week-31-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-02|2025| Saturday|Q3-2025|Week-31-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-03|2025|   Sunday|Q3-2025|Week-31-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-04|2025|   Monday|Q3-2025|Week-32-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-05|2025|  Tuesday|Q3-2025|Week-32-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-06|2025|Wednesday|Q3-2025|Week-32-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-07|2025| Thursday|Q3-2025|Week-32-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|
|2025-08-08|2025|   Friday|Q3-2025|Week-32-2025|2026-01-19 12:40:...|dbfs:/Volumes/eco...|

In [0]:
# Rename a column
df_silver = df_silver.withColumnRenamed("week_of_the_year", "week")

In [0]:
df_silver.write.format("delta") \
    .mode("overwrite") \
    .option("megreSchema", "truw") \
    .saveAsTable(f"{catalog_name}.silver.slv_calendar")