In [0]:
# Storage locations of the Bronze2 (input) and Silver (output) Delta tables
bronze2_path = "/mnt/bronze2/cafe_sales"
silver_path = "/mnt/silver/cafe_sales"

# Load the Bronze2 Delta table that feeds the Silver conversion
df_bronze2 = spark.read.load(bronze2_path, format="delta")

# Report how many rows were read, then preview the first ten of them
bronze2_row_count = df_bronze2.count()
print(f"Initial row count (Bronze2): {bronze2_row_count}")
display(df_bronze2.limit(10))


Initial row count (Bronze2): 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5.0,5.0,25.0,Cash,Takeaway,07-11-2023
TXN_3160411,Coffee,2.0,2.0,4.0,no-cash payment,In-store,11-06-2023
TXN_4271903,Cookie,4.0,1.0,4.0,no-cash payment,In-store,19-07-2023
TXN_1000555,Tea,1.0,1.5,1.5,no-cash payment,In-store,19-10-2023
TXN_1002457,Cookie,5.0,1.0,5.0,no-cash payment,Takeaway,29-09-2023
TXN_1004184,Smoothie,1.0,4.0,4.0,no-cash payment,In-store,18-05-2023
TXN_1004563,Tea,5.0,1.5,7.5,no-cash payment,In-store,28-10-2023
TXN_1005331,Coffee,1.0,2.0,2.0,no-cash payment,Takeaway,04-11-2023
TXN_1005377,Cake,5.0,3.0,15.0,no-cash payment,Takeaway,03-06-2023
TXN_1006942,Salad,1.0,5.0,5.0,no-cash payment,In-store,30-11-2023


In [0]:
from pyspark.sql.functions import col, sum as spark_sum, to_date
from pyspark.sql.types import StringType, IntegerType, DecimalType, DateType

# Apply type conversions to create Silver layer data.
# Each target column is defined once as a typed expression so the same
# definitions drive both the projection and the data-loss check below.
silver_columns = {
    "transaction_id": col("transaction_id").cast(StringType()),
    "item": col("item").cast(StringType()),
    "quantity": col("quantity").cast(IntegerType()),
    "price_per_unit": col("price_per_unit").cast(DecimalType(5, 2)),
    "total_spent": col("total_spent").cast(DecimalType(7, 2)),
    "payment_method": col("payment_method").cast(StringType()),
    "location": col("location").cast(StringType()),
    "transaction_date": to_date(col("transaction_date"), "dd-MM-yyyy"),
}

df_silver = df_bronze2.select(
    *[expr.alias(name) for name, expr in silver_columns.items()]
)

# Guard against silent data loss: when ANSI mode is off, a value that cannot
# be converted (bad date string, decimal overflow, ...) becomes NULL instead
# of raising. Count, per column, the values that were present in Bronze2 but
# are NULL after conversion. One aggregation pass covers all columns.
# NOTE(review): this only detects NULLs; a fractional quantity would still be
# truncated by the integer cast without being reported here.
conversion_losses = df_bronze2.agg(
    *[
        spark_sum(
            (col(name).isNotNull() & expr.isNull()).cast(IntegerType())
        ).alias(name)
        for name, expr in silver_columns.items()
    ]
).first().asDict()

# The sums are None for an empty input, so only truthy counts are reported.
lost_by_column = {name: lost for name, lost in conversion_losses.items() if lost}
if lost_by_column:
    print("⚠️ Values that became NULL during type conversion:")
    for name, lost in lost_by_column.items():
        print(f"   - {name}: {lost} row(s)")

print("✅ Type conversions applied:")
print("   - transaction_id: STRING")
print("   - item: STRING")
print("   - quantity: INTEGER")
print("   - price_per_unit: DECIMAL(5,2)")
print("   - total_spent: DECIMAL(7,2)")
print("   - payment_method: STRING")
print("   - location: STRING")
print("   - transaction_date: DATE")

print(f"\nRow count after conversion: {df_silver.count()}")
display(df_silver.limit(10))


✅ Type conversions applied:
   - transaction_id: STRING
   - item: STRING
   - quantity: INTEGER
   - price_per_unit: DECIMAL(5,2)
   - total_spent: DECIMAL(7,2)
   - payment_method: STRING
   - location: STRING
   - transaction_date: DATE

Row count after conversion: 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
TXN_3160411,Coffee,2,2.0,4.0,no-cash payment,In-store,2023-06-11
TXN_4271903,Cookie,4,1.0,4.0,no-cash payment,In-store,2023-07-19
TXN_1000555,Tea,1,1.5,1.5,no-cash payment,In-store,2023-10-19
TXN_1002457,Cookie,5,1.0,5.0,no-cash payment,Takeaway,2023-09-29
TXN_1004184,Smoothie,1,4.0,4.0,no-cash payment,In-store,2023-05-18
TXN_1004563,Tea,5,1.5,7.5,no-cash payment,In-store,2023-10-28
TXN_1005331,Coffee,1,2.0,2.0,no-cash payment,Takeaway,2023-11-04
TXN_1005377,Cake,5,3.0,15.0,no-cash payment,Takeaway,2023-06-03
TXN_1006942,Salad,1,5.0,5.0,no-cash payment,In-store,2023-11-30


In [0]:
# Print the Silver DataFrame's schema so the converted column types can be checked
schema_banner = "=== Silver Layer Schema ==="
print(schema_banner)
df_silver.printSchema()


=== Silver Layer Schema ===
root
 |-- transaction_id: string (nullable = true)
 |-- item: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_per_unit: decimal(5,2) (nullable = true)
 |-- total_spent: decimal(7,2) (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_date: date (nullable = true)



In [0]:
# Clear cache before writing to avoid schema analysis issues.
# Note: clearCache() drops every cached table in the Spark session, not only
# the ones related to this table.
spark.catalog.clearCache()

# Catalog name of the Silver table; defined once so the write, the
# verification read and the log message cannot drift apart.
silver_table_name = "cafe_sales_silver"

# Write to Silver Delta Table with both path and catalog registration.
# Supplying "path" stores the Delta files at silver_path while saveAsTable
# registers the table name in the catalog; overwriteSchema lets the overwrite
# replace an existing table whose column types differ.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", silver_path)
    .saveAsTable(silver_table_name)
)

# Count rows from the table that was just written rather than from df_silver:
# counting df_silver would re-run the Bronze2 read and the conversion, and
# would not reflect what was actually persisted.
written_row_count = spark.read.table(silver_table_name).count()

print(f"✅ External table '{silver_table_name}' created in catalog")
print(f"✅ Delta files saved to: {silver_path}")
print(f"Final row count: {written_row_count}")
print("✅ Type conversions: STRING, INTEGER, DECIMAL, DATE")


✅ External table 'cafe_sales_silver' created in catalog
✅ Delta files saved to: /mnt/silver/cafe_sales
Final row count: 3555
✅ Type conversions: STRING, INTEGER, DECIMAL, DATE


In [0]:
# Refresh catalog and verify changes for Silver table
from pyspark.sql.functions import col

# Invalidate cached entries for the Silver table before reading it back
verified_table = "cafe_sales_silver"
spark.catalog.refreshTable(verified_table)
print("✅ Catalog cache refreshed for Silver")

# Load the table through the catalog, by name rather than by storage path
df_verify = spark.read.table(verified_table)

# Preview up to 20 of the persisted rows
print("\n=== Sample data from Silver table ===")
silver_sample = df_verify.limit(20)
display(silver_sample)

# Print the schema of the persisted table
print("\n=== Verified Silver Schema ===")
df_verify.printSchema()


✅ Catalog cache refreshed for Silver

=== Sample data from Silver table ===


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
TXN_3160411,Coffee,2,2.0,4.0,no-cash payment,In-store,2023-06-11
TXN_4271903,Cookie,4,1.0,4.0,no-cash payment,In-store,2023-07-19
TXN_1000555,Tea,1,1.5,1.5,no-cash payment,In-store,2023-10-19
TXN_1002457,Cookie,5,1.0,5.0,no-cash payment,Takeaway,2023-09-29
TXN_1004184,Smoothie,1,4.0,4.0,no-cash payment,In-store,2023-05-18
TXN_1004563,Tea,5,1.5,7.5,no-cash payment,In-store,2023-10-28
TXN_1005331,Coffee,1,2.0,2.0,no-cash payment,Takeaway,2023-11-04
TXN_1005377,Cake,5,3.0,15.0,no-cash payment,Takeaway,2023-06-03
TXN_1006942,Salad,1,5.0,5.0,no-cash payment,In-store,2023-11-30



=== Verified Silver Schema ===
root
 |-- transaction_id: string (nullable = true)
 |-- item: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price_per_unit: decimal(5,2) (nullable = true)
 |-- total_spent: decimal(7,2) (nullable = true)
 |-- payment_method: string (nullable = true)
 |-- location: string (nullable = true)
 |-- transaction_date: date (nullable = true)

