In [0]:
# Storage locations: Silver is the input layer, Gold2 receives the payment analysis.
gold2_path = "/mnt/gold/cafe_sales_payment_analysis"
silver_path = "/mnt/silver/cafe_sales"

# Load the Silver dataset from its Delta files.
delta_reader = spark.read.format("delta")
df_silver = delta_reader.load(silver_path)

# Report how many rows were loaded, then preview the first ten of them.
silver_row_count = df_silver.count()
print(f"Initial row count (Silver): {silver_row_count}")
display(df_silver.limit(10))


Initial row count (Silver): 3555


transaction_id,item,quantity,price_per_unit,total_spent,payment_method,location,transaction_date
TXN_2548360,Salad,5,5.0,25.0,Cash,Takeaway,2023-11-07
TXN_3160411,Coffee,2,2.0,4.0,no-cash payment,In-store,2023-06-11
TXN_4271903,Cookie,4,1.0,4.0,no-cash payment,In-store,2023-07-19
TXN_1000555,Tea,1,1.5,1.5,no-cash payment,In-store,2023-10-19
TXN_1002457,Cookie,5,1.0,5.0,no-cash payment,Takeaway,2023-09-29
TXN_1004184,Smoothie,1,4.0,4.0,no-cash payment,In-store,2023-05-18
TXN_1004563,Tea,5,1.5,7.5,no-cash payment,In-store,2023-10-28
TXN_1005331,Coffee,1,2.0,2.0,no-cash payment,Takeaway,2023-11-04
TXN_1005377,Cake,5,3.0,15.0,no-cash payment,Takeaway,2023-06-03
TXN_1006942,Salad,1,5.0,5.0,no-cash payment,In-store,2023-11-30


In [0]:
# Spark's sum/round are imported under aliases so that they do not rebind
# Python's built-in sum() and round() for every cell that runs after this one.
from pyspark.sql.functions import col, count, sum as spark_sum, round as spark_round
# Window is not used in this cell; the import is kept in case other cells rely on it.
from pyspark.sql.window import Window

# Denominator for the percentage share: the number of rows in Silver.
# NOTE(review): df.count() counts every row, whereas count("transaction_id")
# below skips NULL ids, so the shares only add up to 100% when transaction_id
# is never NULL -- confirm that Silver guarantees this.
total_transactions = df_silver.count()

# One output row per payment method:
#   transaction_count - number of non-NULL transaction ids in the group
#   total_revenue     - sum of total_spent in the group
#   percentage_share  - the group's share of all transactions, rounded to 2 decimals
# Rows are ordered from the most-used to the least-used payment method.
df_gold2 = (
    df_silver.groupBy("payment_method")
    .agg(
        count("transaction_id").alias("transaction_count"),
        spark_sum("total_spent").alias("total_revenue"),
    )
    .withColumn(
        "percentage_share",
        spark_round((col("transaction_count") * 100.0) / total_transactions, 2),
    )
    .orderBy(col("transaction_count").desc())
)

print("✅ Aggregations applied:")
print("   - COUNT(transaction_id) as transaction_count")
print("   - SUM(total_spent) as total_revenue")
print("   - PERCENTAGE_SHARE calculation (transaction_count * 100 / total)")
print("   - GROUP BY payment_method")
print("   - ORDER BY transaction_count DESC")

# Each of the next two lines is a Spark action and evaluates the aggregation.
print(f"\nRow count after aggregation: {df_gold2.count()}")
display(df_gold2)


✅ Aggregations applied:
   - COUNT(transaction_id) as transaction_count
   - SUM(total_spent) as total_revenue
   - PERCENTAGE_SHARE calculation (transaction_count * 100 / total)
   - GROUP BY payment_method
   - ORDER BY transaction_count DESC

Row count after aggregation: 2


payment_method,transaction_count,total_revenue,percentage_share
no-cash payment,2395,21389.0,67.37
Cash,1160,10539.0,32.63


In [0]:
# Show the column names, types and nullability of the aggregated Gold2 frame.
schema_banner = "=== Gold2 Layer Schema ==="
print(schema_banner)
df_gold2.printSchema()


=== Gold2 Layer Schema ===
root
 |-- payment_method: string (nullable = true)
 |-- transaction_count: long (nullable = false)
 |-- total_revenue: decimal(17,2) (nullable = true)
 |-- percentage_share: double (nullable = true)



In [0]:
# Clear cache before writing to avoid schema analysis issues.
# Note: this drops every cached table/DataFrame in the Spark session, not just
# the ones used by this notebook.
spark.catalog.clearCache()

# Write the aggregate as a Delta table: the files go to gold2_path and the table
# is registered in the catalog as 'cafe_sales_gold2'. Overwrite mode replaces the
# existing contents, and overwriteSchema lets the stored schema be replaced too.
df_gold2.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .option("path", gold2_path) \
    .saveAsTable("cafe_sales_gold2")

# Count the rows that were actually persisted. Calling df_gold2.count() here would
# re-run the whole aggregation from Silver and would not reflect the written table.
persisted_row_count = spark.read.table("cafe_sales_gold2").count()

print("✅ External table 'cafe_sales_gold2' created in catalog")
print(f"✅ Delta files saved to: {gold2_path}")
print(f"Final row count: {persisted_row_count}")
print("✅ Aggregations: COUNT(transaction_id), SUM(total_spent), PERCENTAGE_SHARE")


✅ External table 'cafe_sales_gold2' created in catalog
✅ Delta files saved to: /mnt/gold/cafe_sales_payment_analysis
Final row count: 2
✅ Aggregations: COUNT(transaction_id), SUM(total_spent), PERCENTAGE_SHARE


In [0]:
# Refresh catalog and verify changes for Gold2 table
from pyspark.sql.functions import col, count as spark_count, sum as spark_sum


def _zero_if_null(value):
    """Return 0 when an aggregate came back as NULL (None), else the value itself."""
    return 0 if value is None else value


# Refresh catalog cache for Gold2 table
spark.catalog.refreshTable("cafe_sales_gold2")
print("✅ Catalog cache refreshed for Gold2")

# Read back from Gold2 to verify changes
df_verify = spark.read.table("cafe_sales_gold2")

# Display payment method analysis from Gold2
print("\n=== Payment Method Analysis from Gold2 table ===")
display(df_verify)

# Verify schema
print("\n=== Verified Gold2 Schema ===")
df_verify.printSchema()

# Compute every summary figure in a single aggregation (one Spark job) instead
# of a count() plus three separate agg(...).collect() calls.
summary = df_verify.agg(
    spark_count("*").alias("payment_methods"),
    spark_sum("total_revenue").alias("total_revenue"),
    spark_sum("transaction_count").alias("total_transactions"),
    spark_sum("percentage_share").alias("percentage_total"),
).first()

# SUM over zero rows is NULL; map that to 0 so the ':.2f' formatting below does
# not raise TypeError when the table is empty.
total_revenue = _zero_if_null(summary["total_revenue"])
total_transactions = _zero_if_null(summary["total_transactions"])
percentage_total = _zero_if_null(summary["percentage_total"])

# Show summary statistics
print("\n=== Summary Statistics ===")
print(f"Total payment methods: {summary['payment_methods']}")
print(f"Total transactions: {total_transactions}")
print(f"Total revenue: ${total_revenue:.2f}")
print(f"\nPercentage share verification (should sum to ~100%): {percentage_total:.2f}%")


✅ Catalog cache refreshed for Gold2

=== Payment Method Analysis from Gold2 table ===


payment_method,transaction_count,total_revenue,percentage_share
no-cash payment,2395,21389.0,67.37
Cash,1160,10539.0,32.63



=== Verified Gold2 Schema ===
root
 |-- payment_method: string (nullable = true)
 |-- transaction_count: long (nullable = true)
 |-- total_revenue: decimal(17,2) (nullable = true)
 |-- percentage_share: double (nullable = true)


=== Summary Statistics ===
Total payment methods: 2
Total transactions: 3555
Total revenue: $31928.00

Percentage share verification (should sum to ~100%): 100.00%
