COLUMN TRANSFORMATION

In [0]:
# List the files under the bronze mount. The absolute "/mnt/bronze" form
# matches the absolute paths used by the read cells in this notebook instead
# of relying on how a relative path gets resolved.
display(dbutils.fs.ls("/mnt/bronze"))

path,name,size,modificationTime
dbfs:/mnt/bronze/Customers.parquet,Customers.parquet,1510,1750232356000
dbfs:/mnt/bronze/OrderItems.parquet,OrderItems.parquet,1631,1750232306000
dbfs:/mnt/bronze/Orders.parquet,Orders.parquet,1072,1750232307000
dbfs:/mnt/bronze/Products.parquet,Products.parquet,1742,1750232310000


In [0]:
# Load the bronze Customers Parquet file into a DataFrame and render it.
df_customers = spark.read.format("parquet").load("/mnt/bronze/Customers.parquet")
df_customers.display()

CustomerID,FirstName,LastName,Email,CreatedDate
1,John,Doe,john.doe@example.com,2025-06-03T11:16:50.867Z
2,Jane,Smith,jane.smith@example.com,2025-06-03T11:16:50.867Z
3,Michael,Brown,michael.brown@example.com,2025-06-03T11:16:50.867Z
4,Emily,Davis,emily.davis@example.com,2025-06-03T11:16:50.867Z
5,David,Wilson,david.wilson@example.com,2025-06-03T11:16:50.867Z
6,Laura,Taylor,laura.taylor@example.com,2025-06-03T11:16:50.867Z
7,James,Anderson,james.anderson@example.com,2025-06-03T11:16:50.867Z
8,Linda,Thomas,linda.thomas@example.com,2025-06-03T11:16:50.867Z
9,Robert,Jackson,robert.jackson@example.com,2025-06-03T11:16:50.867Z
10,Susan,White,susan.white@example.com,2025-06-03T11:16:50.867Z


In [0]:
# Load the bronze OrderItems Parquet file into a DataFrame and render it.
df_orderitems = spark.read.format("parquet").load("/mnt/bronze/OrderItems.parquet")
display(df_orderitems)

OrderItemID,OrderID,ProductID,Quantity,UnitPrice
1,1,1,2,25.99
2,1,6,1,15.0
3,2,9,1,250.0
4,3,3,5,10.0
5,3,11,3,8.99
6,4,5,1,199.99
7,4,10,3,30.0
8,5,12,2,12.0
9,6,7,1,150.0
10,6,20,1,70.0


In [0]:
# Load the bronze Orders Parquet file into a DataFrame and render it.
df_orders = spark.read.format("parquet").load("/mnt/bronze/Orders.parquet")
display(df_orders)

OrderID,CustomerID,OrderDate,TotalAmount
1,1,2025-05-01T00:00:00Z,150.99
2,2,2025-05-03T00:00:00Z,250.5
3,3,2025-05-04T00:00:00Z,75.0
4,4,2025-05-05T00:00:00Z,300.0
5,5,2025-05-06T00:00:00Z,89.99
6,6,2025-05-07T00:00:00Z,120.0
7,7,2025-05-08T00:00:00Z,45.0
8,8,2025-05-09T00:00:00Z,250.0
9,9,2025-05-10T00:00:00Z,500.0
10,10,2025-05-11T00:00:00Z,30.0


In [0]:
# Load the bronze Products Parquet file into a DataFrame and render it.
df_products = spark.read.format("parquet").load("/mnt/bronze/Products.parquet")
display(df_products)

ProductID,ProductName,Category,Price,Stock
1,Wireless Mouse,Electronics,25.99,100
2,Bluetooth Keyboard,Electronics,45.5,150
3,USB-C Cable,Electronics,10.0,500
4,Laptop Stand,Accessories,35.0,75
5,Noise Cancelling Headphones,Electronics,199.99,40
6,Smartphone Case,Accessories,15.0,200
7,LED Monitor 24 inch,Electronics,150.0,50
8,External Hard Drive 1TB,Electronics,70.0,80
9,Gaming Chair,Furniture,250.0,20
10,Desk Lamp,Furniture,30.0,60


In [0]:
# Reformat the date column as a yyyy-MM-dd string
from pyspark.sql.functions import *
from pyspark.sql.types import TimestampType

# Replace OrderDate with its "yyyy-MM-dd" rendering. date_format yields a
# string column, so the value written downstream is text, not a date type.
# NOTE(review): from_utc_timestamp with the "UTC" zone looks like a zero-offset
# shift; confirm before simplifying it away.
order_date_utc = from_utc_timestamp("OrderDate", "UTC")
df_orders = df_orders.withColumn(
    "OrderDate",
    date_format(order_date_utc, "yyyy-MM-dd"),
)
df_orders.display()

OrderID,CustomerID,OrderDate,TotalAmount
1,1,2025-05-01,150.99
2,2,2025-05-03,250.5
3,3,2025-05-04,75.0
4,4,2025-05-05,300.0
5,5,2025-05-06,89.99
6,6,2025-05-07,120.0
7,7,2025-05-08,45.0
8,8,2025-05-09,250.0
9,9,2025-05-10,500.0
10,10,2025-05-11,30.0


In [0]:
# Replace CreatedDate with its "yyyy-MM-dd" rendering. date_format yields a
# string column, so the time-of-day part is dropped and the result is text.
# NOTE(review): from_utc_timestamp with the "UTC" zone looks like a zero-offset
# shift; confirm before simplifying it away.
created_date_utc = from_utc_timestamp("CreatedDate", "UTC")
df_customers = df_customers.withColumn(
    "CreatedDate",
    date_format(created_date_utc, "yyyy-MM-dd"),
)
df_customers.display()

CustomerID,FirstName,LastName,Email,CreatedDate
1,John,Doe,john.doe@example.com,2025-06-03
2,Jane,Smith,jane.smith@example.com,2025-06-03
3,Michael,Brown,michael.brown@example.com,2025-06-03
4,Emily,Davis,emily.davis@example.com,2025-06-03
5,David,Wilson,david.wilson@example.com,2025-06-03
6,Laura,Taylor,laura.taylor@example.com,2025-06-03
7,James,Anderson,james.anderson@example.com,2025-06-03
8,Linda,Thomas,linda.thomas@example.com,2025-06-03
9,Robert,Jackson,robert.jackson@example.com,2025-06-03
10,Susan,White,susan.white@example.com,2025-06-03


In [0]:
# Rewrite customer e-mail addresses from the "example" domain to "gmail".
# The pattern requires the literal dot that follows the domain label, so only
# the exact label "example" is replaced; an address such as user@examples.org
# is left untouched instead of being turned into user@gmails.org.
df_customers = df_customers.withColumn(
    "Email",
    regexp_replace("Email", r"@example\.", "@gmail."),
)
display(df_customers)

CustomerID,FirstName,LastName,Email,CreatedDate
1,John,Doe,john.doe@gmail.com,2025-06-03
2,Jane,Smith,jane.smith@gmail.com,2025-06-03
3,Michael,Brown,michael.brown@gmail.com,2025-06-03
4,Emily,Davis,emily.davis@gmail.com,2025-06-03
5,David,Wilson,david.wilson@gmail.com,2025-06-03
6,Laura,Taylor,laura.taylor@gmail.com,2025-06-03
7,James,Anderson,james.anderson@gmail.com,2025-06-03
8,Linda,Thomas,linda.thomas@gmail.com,2025-06-03
9,Robert,Jackson,robert.jackson@gmail.com,2025-06-03
10,Susan,White,susan.white@gmail.com,2025-06-03


In [0]:
# Persist each DataFrame to the silver layer in Delta format. "overwrite"
# mode replaces whatever is already stored at the target path. The tables are
# written in the same order as before: Customers, Orders, OrderItems, Products.
for silver_df, silver_path in (
    (df_customers, "/mnt/silver/Customers"),
    (df_orders, "/mnt/silver/Orders"),
    (df_orderitems, "/mnt/silver/OrderItems"),
    (df_products, "/mnt/silver/Products"),
):
    silver_df.write.format("delta").mode("overwrite").save(silver_path)