In [0]:
# Storage locations of the Delta tables: Silver is the input, Gold is the output
gold_path = "/mnt/gold/cafe_sales_aggregated"
silver_path = "/mnt/silver/cafe_sales"

# Load the Silver Delta table from its storage path
delta_reader = spark.read.format("delta")
df_silver = delta_reader.load(silver_path)

# Report how many rows were loaded, then preview the first ten
silver_row_count = df_silver.count()
print(f"Initial row count (Silver): {silver_row_count}")
silver_preview = df_silver.limit(10)
display(silver_preview)


Initial row count (Silver): 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
TXN_3160411,Coffee,2,2.0,4.0,no-cash payment,In-store,2023-06-11
TXN_4271903,Cookie,4,1.0,4.0,no-cash payment,In-store,2023-07-19
TXN_1000555,Tea,1,1.5,1.5,no-cash payment,In-store,2023-10-19
TXN_1002457,Cookie,5,1.0,5.0,no-cash payment,Takeaway,2023-09-29
TXN_1004184,Smoothie,1,4.0,4.0,no-cash payment,In-store,2023-05-18
TXN_1004563,Tea,5,1.5,7.5,no-cash payment,In-store,2023-10-28
TXN_1005331,Coffee,1,2.0,2.0,no-cash payment,Takeaway,2023-11-04
TXN_1005377,Cake,5,3.0,15.0,no-cash payment,Takeaway,2023-06-03
TXN_1006942,Salad,1,5.0,5.0,no-cash payment,In-store,2023-11-30


In [0]:
# Spark's `sum` is imported under an alias so it does not replace Python's
# built-in `sum` for every cell that runs afterwards in this kernel.
from pyspark.sql.functions import col, sum as spark_sum, avg, count

# Aggregate data to create Gold layer: one row per item with its totals,
# average unit price and transaction count, ranked by revenue (highest first).
df_gold = (
    df_silver.groupBy("item")
    .agg(
        spark_sum("quantity").alias("total_quantity"),
        spark_sum("total_spent").alias("total_revenue"),
        avg("price_per_unit").alias("avg_price"),
        # count(column) counts only rows where transaction_id is non-null
        count("transaction_id").alias("transaction_count"),
    )
    .withColumnRenamed("item", "product_name")
    .orderBy(col("total_revenue").desc())
)

print("✅ Aggregations applied:")
print("   - SUM(quantity) as total_quantity")
print("   - SUM(total_spent) as total_revenue")
print("   - AVG(price_per_unit) as avg_price")
print("   - COUNT(transaction_id) as transaction_count")
print("   - GROUP BY item")
print("   - ORDER BY total_revenue DESC")

print(f"\nRow count after aggregation: {df_gold.count()}")
display(df_gold)


✅ Aggregations applied:
   - SUM(quantity) as total_quantity
   - SUM(total_spent) as total_revenue
   - AVG(price_per_unit) as avg_price
   - COUNT(transaction_id) as transaction_count
   - GROUP BY item
   - ORDER BY total_revenue DESC

Row count after aggregation: 8


product_name,total_quantity,total_revenue,avg_price,transaction_count
Salad,1461,7305.0,5.0,480
Sandwich,1424,5696.0,4.0,457
Smoothie,1186,4744.0,4.0,389
Juice,1395,4185.0,3.0,477
Cake,1353,4059.0,3.0,439
Coffee,1303,2606.0,2.0,427
Tea,1314,1971.0,1.5,426
Cookie,1362,1362.0,1.0,460


In [0]:
# Display schema to verify data types.
# Prints the column names, data types and nullability of the aggregated
# Gold DataFrame (df_gold) so they can be inspected.
print("=== Gold Layer Schema ===")
df_gold.printSchema()


=== Gold Layer Schema ===
root
 |-- product_name: string (nullable = true)
 |-- total_quantity: long (nullable = true)
 |-- total_revenue: decimal(17,2) (nullable = true)
 |-- avg_price: decimal(9,6) (nullable = true)
 |-- transaction_count: long (nullable = false)



In [0]:
# Clear cache before writing to avoid schema analysis issues.
# NOTE: clearCache() drops every cached table/DataFrame in this Spark session,
# not only the data used by this notebook.
spark.catalog.clearCache()

# Catalog name of the Gold table; kept in one place so the write, the
# read-back and the log message cannot drift apart.
gold_table = "cafe_sales_gold"

# Write to Gold Delta Table with both path and catalog registration.
# mode("overwrite") replaces the table's existing data, and
# overwriteSchema=true additionally lets the new DataFrame's schema replace
# the stored one.
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .option("path", gold_path)
    .saveAsTable(gold_table)
)

# Count the rows that actually landed in the table. Calling df_gold.count()
# here would re-run the Silver read and aggregation instead of checking the
# written result.
written_row_count = spark.read.table(gold_table).count()

print(f"✅ External table '{gold_table}' created in catalog")
print(f"✅ Delta files saved to: {gold_path}")
print(f"Final row count: {written_row_count}")
print("✅ Aggregations: SUM(quantity), SUM(total_spent), AVG(price_per_unit), COUNT(transaction_id)")


✅ External table 'cafe_sales_gold' created in catalog
✅ Delta files saved to: /mnt/gold/cafe_sales_aggregated
Final row count: 8
✅ Aggregations: SUM(quantity), SUM(total_spent), AVG(price_per_unit), COUNT(transaction_id)


In [0]:
# Refresh catalog and verify changes for Gold table
from pyspark.sql.functions import col, sum as spark_sum

# Refresh catalog cache for Gold table so the read below does not use stale
# cached metadata
spark.catalog.refreshTable("cafe_sales_gold")
print("✅ Catalog cache refreshed for Gold")

# Read back from Gold to verify changes
df_verify = spark.read.table("cafe_sales_gold")

# Display aggregated data from Gold.
# Rows read back from a table come in no guaranteed order (the sort applied
# before the write is not part of the stored table), so sort explicitly to
# show the products ranked by revenue.
print("\n=== Sample aggregated data from Gold table ===")
display(df_verify.orderBy(col("total_revenue").desc()).limit(20))

# Verify schema
print("\n=== Verified Gold Schema ===")
df_verify.printSchema()

# Show summary statistics
print("\n=== Summary Statistics ===")
print(f"Total products: {df_verify.count()}")
# A global aggregate always returns exactly one row, so first() is safe here.
total_revenue = df_verify.agg(spark_sum("total_revenue")).first()[0]
# SUM over zero rows (or only NULLs) yields NULL -> None in Python, which the
# ".2f" format below cannot handle; report 0 in that case.
if total_revenue is None:
    total_revenue = 0
print(f"Total revenue across all products: ${total_revenue:.2f}")


✅ Catalog cache refreshed for Gold

=== Sample aggregated data from Gold table ===


product_name,total_quantity,total_revenue,avg_price,transaction_count
Salad,1461,7305.0,5.0,480
Sandwich,1424,5696.0,4.0,457
Smoothie,1186,4744.0,4.0,389
Juice,1395,4185.0,3.0,477
Cake,1353,4059.0,3.0,439
Coffee,1303,2606.0,2.0,427
Tea,1314,1971.0,1.5,426
Cookie,1362,1362.0,1.0,460



=== Verified Gold Schema ===
root
 |-- product_name: string (nullable = true)
 |-- total_quantity: long (nullable = true)
 |-- total_revenue: decimal(17,2) (nullable = true)
 |-- avg_price: decimal(9,6) (nullable = true)
 |-- transaction_count: long (nullable = true)


=== Summary Statistics ===
Total products: 8
Total revenue across all products: $31928.00
