In [0]:
%run ./config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Gold Layer - Streaming Business Aggregations
# MAGIC Creates real-time aggregated metrics tables for business intelligence

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------


# Echo the Silver source and the five Gold target tables so the run log shows
# exactly which tables this notebook reads from and writes to.
_rule = "=" * 70

print(_rule)
print("GOLD LAYER STREAMING CONFIGURATION")
print(_rule)
print(f"Source: {silver_table}")
print("\nGold Tables:")

_gold_targets = [
    gold_brand_category_table,
    gold_location_table,
    gold_product_table,
    gold_customer_table,
    gold_daily_summary_table,
]
for _position, _target in enumerate(_gold_targets, start=1):
    print(f"  {_position}. {_target}")

print(_rule)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Silver Stream

# COMMAND ----------

_rule = "=" * 70
print("\n" + _rule)
print("READING SILVER STREAM")
print(_rule)

# Streaming read of the Silver Delta table. A watermark with a 10-minute
# delay threshold is declared on silver_timestamp; every Gold aggregation
# below is derived from this DataFrame.
_silver_reader = spark.readStream.format("delta")
silver_stream = _silver_reader.table(silver_table).withWatermark(
    "silver_timestamp", "10 minutes"
)

print("‚úì Silver stream configured")
print(f"  Source: {silver_table}")
print("  Watermark: 10 minutes on silver_timestamp")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 1: Sales by Brand and Category

# COMMAND ----------

_rule = "=" * 70
print("\n" + _rule)
print("STARTING: Sales by Brand and Category Stream")
print(_rule)

# Unrounded metrics computed for every (brand, category) group.
# (A unique_products metric via approx_count_distinct("product_id") is
# currently disabled.)
_brand_category_metrics = [
    count("*").alias("order_count"),
    sum("quantity").alias("total_quantity"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
    min("total_amount").alias("min_order_value"),
    max("total_amount").alias("max_order_value"),
    approx_count_distinct("customer_id").alias("unique_customers"),
    sum("discount_amount").alias("total_discounts"),
]

# Output column order. Columns named in _brand_category_rounded are rounded
# to two decimals; the rest are passed through unchanged.
_brand_category_layout = [
    "brand",
    "category",
    "order_count",
    "total_quantity",
    "total_revenue",
    "avg_order_value",
    "min_order_value",
    "max_order_value",
    "unique_customers",
    "total_discounts",
]
_brand_category_rounded = {
    "total_revenue",
    "avg_order_value",
    "min_order_value",
    "max_order_value",
    "total_discounts",
}
_brand_category_projection = [
    round(col(name), 2).alias(name) if name in _brand_category_rounded else col(name)
    for name in _brand_category_layout
]
# Stamp each output row with the time the Gold row was produced.
_brand_category_projection.append(current_timestamp().alias("gold_timestamp"))

sales_by_brand_category_stream = (
    silver_stream
    .groupBy("brand", "category")
    .agg(*_brand_category_metrics)
    .select(*_brand_category_projection)
)

# Complete output mode: the full aggregate result is written on each trigger.
# trigger(once=True) is the testing configuration; the periodic
# processingTime="20 seconds" trigger is currently disabled.
_brand_category_writer = (
    sales_by_brand_category_stream.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_brand_checkpoint)
    .option("mergeSchema", "true")
    .trigger(once=True)
)
brand_category_query = _brand_category_writer.toTable(gold_brand_category_table)

print("‚úì Brand/Category stream started")
print(f"  Query ID: {brand_category_query.id}")
print(f"  Target: {gold_brand_category_table}")
print("  Output Mode: complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 2: Location Performance

# COMMAND ----------

_rule = "=" * 70
print("\n" + _rule)
print("STARTING: Location Performance Stream")
print(_rule)

# Per-location totals. Delivered/pending counts are built by summing a 1/0
# flag derived from order_status.
_order_status = col("order_status")
_location_totals = silver_stream.groupBy("location").agg(
    count("*").alias("order_count"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
    sum("quantity").alias("total_items_sold"),
    approx_count_distinct("customer_id").alias("unique_customers"),
    approx_count_distinct("product_name").alias("unique_products"),
    approx_count_distinct("brand").alias("unique_brands"),
    sum(when(_order_status == "delivered", 1).otherwise(0)).alias("delivered_orders"),
    sum(when(_order_status == "pending", 1).otherwise(0)).alias("pending_orders"),
)

# Share of orders in the group whose status is "delivered", as a percentage.
_delivery_rate = col("delivered_orders") / col("order_count") * 100

# Final projection: round monetary values and the delivery rate to two
# decimals and stamp the row with the processing time.
location_performance_stream = _location_totals.select(
    "location",
    "order_count",
    round(col("total_revenue"), 2).alias("total_revenue"),
    round(col("avg_order_value"), 2).alias("avg_order_value"),
    "total_items_sold",
    "unique_customers",
    "unique_products",
    "unique_brands",
    "delivered_orders",
    "pending_orders",
    round(_delivery_rate, 2).alias("delivery_rate_pct"),
    current_timestamp().alias("gold_timestamp"),
)

# Complete-mode Delta write; trigger(once=True) is the testing configuration
# (the processingTime="20 seconds" trigger is currently disabled).
_location_writer = (
    location_performance_stream.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_location_checkpoint)
    .option("mergeSchema", "true")
    .trigger(once=True)
)
location_query = _location_writer.toTable(gold_location_table)

print("‚úì Location performance stream started")
print(f"  Query ID: {location_query.id}")
print(f"  Target: {gold_location_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 3: Product Performance

# COMMAND ----------

print("\n" + "="*70)
print("STARTING: Product Performance Stream")
print("="*70)

# Aggregate by product. The grouping key is (product_name, brand, category,
# base_price); no product_id column is used.
product_performance_stream = (
    silver_stream
    .groupBy("product_name", "brand", "category", "base_price")
    .agg(
        count("*").alias("order_count"),
        sum("quantity").alias("total_quantity_sold"),
        sum("total_amount").alias("total_revenue"),
        avg("unit_price").alias("avg_selling_price"),
        min("unit_price").alias("min_selling_price"),
        max("unit_price").alias("max_selling_price"),
        approx_count_distinct("customer_id").alias("unique_customers"),
        approx_count_distinct("location").alias("locations_sold"),
        avg("discount_pct").alias("avg_discount_pct"),
        sum("discount_amount").alias("total_discount_amount")
    )
    .select(
        "product_name",
        "brand",
        "category",
        "base_price",
        "order_count",
        "total_quantity_sold",
        round(col("total_revenue"), 2).alias("total_revenue"),
        round(col("avg_selling_price"), 2).alias("avg_selling_price"),
        round(col("min_selling_price"), 2).alias("min_selling_price"),
        round(col("max_selling_price"), 2).alias("max_selling_price"),
        "unique_customers",
        "locations_sold",
        # Scaled by 100 for display.
        # NOTE(review): assumes discount_pct is stored as a fraction - confirm.
        round(col("avg_discount_pct") * 100, 2).alias("avg_discount_pct"),
        round(col("total_discount_amount"), 2).alias("total_discount_amount"),
        # Guard the division: a group whose quantities sum to 0 yields NULL
        # instead of raising a divide-by-zero error when ANSI mode is enabled.
        round(
            when(
                col("total_quantity_sold") != 0,
                col("total_revenue") / col("total_quantity_sold"),
            ),
            2,
        ).alias("revenue_per_unit"),
        current_timestamp().alias("gold_timestamp")
    )
)

# Write stream: complete output mode to the Gold product Delta table.
# trigger(once=True) is the testing configuration; the periodic
# processingTime="20 seconds" trigger is currently disabled.
product_query = (product_performance_stream
    .writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_product_checkpoint)
    .option("mergeSchema", "true")
    #.trigger(processingTime="20 seconds")
    .trigger(once=True) # for testing
    .toTable(gold_product_table)
)

print(f"‚úì Product performance stream started")
print(f"  Query ID: {product_query.id}")
print(f"  Target: {gold_product_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 4: Customer Insights (Complete Mode)

# COMMAND ----------

print("\n" + "="*70)
print("STARTING: Customer Insights Stream")
print("="*70)

# Per-customer metrics, grouped by (customer_id, customer_name, location).
# Rows with a null customer_id or order_timestamp are dropped before
# aggregation.
customer_insights_stream = (
    silver_stream
    .filter(
        col("customer_id").isNotNull() & 
        col("order_timestamp").isNotNull()
    )
    .groupBy("customer_id", "customer_name", "location")
    .agg(
        count("*").alias("order_count"),
        sum("total_amount").alias("total_spent"),
        avg("total_amount").alias("avg_order_value"),
        sum("quantity").alias("total_items_purchased"),
        min("order_timestamp").alias("first_order_date"),
        max("order_timestamp").alias("last_order_date"),
        approx_count_distinct("product_name").alias("unique_products_bought"),
        approx_count_distinct("brand").alias("unique_brands"),
        approx_count_distinct("category").alias("unique_categories"),
        # A null discount_pct is treated as 0 when averaging.
        avg(coalesce(col("discount_pct"), lit(0))).alias("avg_discount_rate")
    )
    # Days between the first and last order; 0 if either bound is null.
    .withColumn(
        "days_as_customer", 
        when(
            col("first_order_date").isNotNull() & col("last_order_date").isNotNull(),
            datediff(col("last_order_date"), col("first_order_date"))
        ).otherwise(0)
    )
    # Segment by total spend: >= 5000 High, >= 2000 Medium, otherwise Low.
    .withColumn(
        "customer_segment",
        when(col("total_spent") >= 5000, "High Value")
        .when(col("total_spent") >= 2000, "Medium Value")
        .otherwise("Low Value")
    )
    .select(
        "customer_id",
        "customer_name",
        "location",
        "order_count",
        round(col("total_spent"), 2).alias("total_spent"),
        round(col("avg_order_value"), 2).alias("avg_order_value"),
        "total_items_purchased",
        "first_order_date",
        "last_order_date",
        "days_as_customer",
        "unique_products_bought",
        "unique_brands",
        "unique_categories",
        # Scaled by 100 for display.
        # NOTE(review): assumes discount_pct is stored as a fraction - confirm.
        round(col("avg_discount_rate") * 100, 2).alias("avg_discount_pct"),
        "customer_segment",
        current_timestamp().alias("gold_timestamp")
    )
)

# Write with complete mode (simpler, no MERGE needed).
# trigger(once=True) is the testing configuration; the periodic
# processingTime="30 seconds" trigger is currently disabled.
customer_query = (customer_insights_stream
    .writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_customer_checkpoint)
    .option("mergeSchema", "true")
   # .trigger(processingTime="30 seconds")
    .trigger(once=True) # for testing
    .toTable(gold_customer_table)
)

print(f"‚úì Customer insights stream started (simplified)")
print(f"  Query ID: {customer_query.id}")
print(f"  Target: {gold_customer_table}")
print(f"  Output Mode: complete")
# The aggregation above uses approx_count_distinct, so the note names it.
print(f"  Note: Using approx_count_distinct instead of collect_set for better stability")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 5: Daily Summary with Time Windows

# COMMAND ----------

_rule = "=" * 70
print("\n" + _rule)
print("STARTING: Daily Summary Stream")
print(_rule)

# NOTE(review): silver_stream already carries a watermark on silver_timestamp
# (declared where the stream is read); a second watermark on order_timestamp
# is added here. Confirm how the two interact under the cluster's
# multiple-watermark policy.
_daily_events = silver_stream.withWatermark("order_timestamp", "1 hour")

# One-day tumbling window over order_timestamp, grouped together with
# location and category.
_day_bucket = window(col("order_timestamp"), "1 day").alias("day_window")
_daily_totals = _daily_events.groupBy(_day_bucket, "location", "category").agg(
    count("*").alias("order_count"),
    sum("total_amount").alias("daily_revenue"),
    avg("total_amount").alias("avg_order_value"),
    sum("quantity").alias("items_sold"),
    approx_count_distinct("customer_id").alias("unique_customers"),
    approx_count_distinct("product_name").alias("unique_products"),
    max("total_amount").alias("highest_order"),
    min("total_amount").alias("lowest_order"),
)

# Expose the window start as "day", round monetary columns to two decimals and
# stamp each row with the processing time.
daily_summary_stream = _daily_totals.select(
    col("day_window.start").alias("day"),
    "location",
    "category",
    "order_count",
    round(col("daily_revenue"), 2).alias("daily_revenue"),
    round(col("avg_order_value"), 2).alias("avg_order_value"),
    "items_sold",
    "unique_customers",
    "unique_products",
    round(col("highest_order"), 2).alias("highest_order"),
    round(col("lowest_order"), 2).alias("lowest_order"),
    current_timestamp().alias("gold_timestamp"),
)

# Append output mode is used for the windowed aggregation.
# NOTE(review): in append mode a window row is presumably only written once
# the watermark has passed the end of that window, so with a one-shot trigger
# the table may stay empty on early runs - confirm against the target runtime.
# trigger(once=True) is the testing configuration; the periodic
# processingTime="30 seconds" trigger is currently disabled.
_daily_writer = (
    daily_summary_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", gold_daily_checkpoint)
    .option("mergeSchema", "true")
    .trigger(once=True)
)
daily_query = _daily_writer.toTable(gold_daily_summary_table)

print("‚úì Daily summary stream started")
print(f"  Query ID: {daily_query.id}")
print(f"  Target: {gold_daily_summary_table}")
print("  Window: 1 day")
print("  Output Mode: append")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor All Streaming Queries

# COMMAND ----------

print("\n" + "="*70)
print("ACTIVE GOLD LAYER STREAMS")
print("="*70)

# Take one snapshot of the active queries so the per-query listing and the
# total printed at the end describe the same set (queries can finish between
# two separate reads of spark.streams.active).
# Note: spark.streams.active returns every active query in this Spark session.
active_streams = list(spark.streams.active)

for stream in active_streams:
    print(f"\nQuery ID: {stream.id}")
    print(f"  Name: {stream.name or 'unnamed'}")
    print(f"  Status: {stream.status['message']}")
    print(f"  Is Active: {stream.isActive}")

    # Read the progress history once; the last entry is the most recent one.
    progress_history = stream.recentProgress
    if progress_history:
        latest = progress_history[-1]
        # A missing or null rate is reported as 0 rather than breaking the
        # numeric format below.
        processing_rate = latest.get('processedRowsPerSecond') or 0
        print(f"  Recent Progress:")
        print(f"    - Batch: {latest.get('batchId', 'N/A')}")
        print(f"    - Input Rows: {latest.get('numInputRows', 0)}")
        print(f"    - Processing Rate: {processing_rate:.2f} rows/sec")

print(f"\nTotal Active Streams: {len(active_streams)}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Wait and Monitor (Run for 60 seconds)

# COMMAND ----------

import time

print("\n" + "="*70)
print("MONITORING GOLD STREAMS FOR 60 SECONDS")
print("="*70)

# (display label, table name) for every Gold table polled below.
monitored_tables = [
    ("Brand/Category", gold_brand_category_table),
    ("Locations", gold_location_table),
    ("Products", gold_product_table),
    ("Customers", gold_customer_table),
    ("Daily Summary", gold_daily_summary_table),
]

# 12 polls, 5 seconds apart = 60 seconds in total.
for i in range(12):
    time.sleep(5)
    print(f"\n‚è±Ô∏è  {(i+1)*5} seconds:")

    # Report the current row count of each Gold table. A table that cannot be
    # read yet is reported as "Not created yet". Catching Exception (not a
    # bare except) keeps the cell interruptible: KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    for table_label, table_name in monitored_tables:
        try:
            record_count = spark.table(table_name).count()
            print(f"    {table_label}: {record_count} records")
        except Exception:
            print(f"    {table_label}: Not created yet")

print("\n‚úì Monitoring complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Gold Tables

# COMMAND ----------

_rule = "=" * 70
print("\n" + _rule)
print("GOLD LAYER VERIFICATION")
print(_rule)

# 1. Brand/Category: report the row count, then show the ten rows with the
#    highest total_revenue. Any failure is reported as "not ready".
try:
    brand_cat_df = spark.table(gold_brand_category_table)
    _brand_cat_rows = brand_cat_df.count()
    print(f"\n1. Sales by Brand/Category: {_brand_cat_rows} records")
    _brand_cat_top = brand_cat_df.orderBy("total_revenue", ascending=False).limit(10)
    display(_brand_cat_top)
except Exception as err:
    print(f"\n1. Brand/Category table not ready: {err}")

# COMMAND ----------

# 2. Location Performance: report the row count, then show every location
#    ordered by total_revenue, highest first.
try:
    location_df = spark.table(gold_location_table)
    _location_rows = location_df.count()
    print(f"\n2. Location Performance: {_location_rows} records")
    _location_ranked = location_df.orderBy("total_revenue", ascending=False)
    display(_location_ranked)
except Exception as err:
    print(f"\n2. Location table not ready: {err}")

# COMMAND ----------

# 3. Product Performance: report the row count, then show the ten products
#    with the highest total_revenue.
try:
    product_df = spark.table(gold_product_table)
    _product_rows = product_df.count()
    print(f"\n3. Product Performance: {_product_rows} records")
    _product_top = product_df.orderBy("total_revenue", ascending=False).limit(10)
    display(_product_top)
except Exception as err:
    print(f"\n3. Product table not ready: {err}")

# COMMAND ----------

# 4. Customer Insights: row count, a per-segment roll-up, and the ten
#    customers with the highest total_spent.
try:
    customer_df = spark.table(gold_customer_table)
    _customer_rows = customer_df.count()
    print(f"\n4. Customer Insights: {_customer_rows} records")

    # Per-segment customer count, revenue and mean orders per customer,
    # ordered by segment revenue (highest first).
    print("\nCustomers by Segment:")
    _segment_metrics = [
        count("*").alias("customer_count"),
        sum("total_spent").alias("segment_revenue"),
        avg("order_count").alias("avg_orders_per_customer"),
    ]
    _segment_rollup = (
        customer_df
        .groupBy("customer_segment")
        .agg(*_segment_metrics)
        .orderBy("segment_revenue", ascending=False)
    )
    display(_segment_rollup)

    print("\nTop 10 Customers:")
    _top_customers = customer_df.orderBy("total_spent", ascending=False).limit(10)
    display(_top_customers)

except Exception as err:
    print(f"\n4. Customer table not ready: {err}")

# COMMAND ----------

# 5. Daily Summary: row count plus revenue and order totals per day, ordered
#    by day ascending.
try:
    daily_df = spark.table(gold_daily_summary_table)
    _daily_rows = daily_df.count()
    print(f"\n5. Daily Summary: {_daily_rows} records")

    print("\nDaily Revenue Trend:")
    _daily_trend = daily_df.groupBy("day").agg(
        sum("daily_revenue").alias("total_daily_revenue"),
        sum("order_count").alias("total_orders"),
    )
    display(_daily_trend.orderBy("day"))

except Exception as err:
    print(f"\n5. Daily Summary table not ready: {err}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Real-Time Business Dashboard

# COMMAND ----------

print("\n" + "="*70)
print("REAL-TIME BUSINESS METRICS")
print("="*70)

try:
    # Overall metrics from the brand/category table.
    # The overall average order value is total revenue divided by total
    # orders; averaging the per-group averages would weight a group with one
    # order the same as a group with thousands. NULLIF keeps the division
    # safe when there are no orders.
    overall = spark.sql(f"""
        SELECT 
            SUM(order_count) as total_orders,
            SUM(total_revenue) as total_revenue,
            SUM(total_revenue) / NULLIF(SUM(order_count), 0) as overall_avg_order_value,
            SUM(total_discounts) as total_discounts_given
        FROM {gold_brand_category_table}
    """).first()

    if overall is None or overall['total_orders'] is None:
        # SUM over an empty table yields NULLs; report that explicitly
        # instead of failing inside the numeric formatting below.
        print("\nNo aggregated data available yet")
    else:
        # Summing unique_customers across brand/category groups counts a
        # customer once per group they bought from, so the unique-customer
        # figure is taken from the customer insights table instead.
        total_unique_customers = (
            spark.table(gold_customer_table)
            .select("customer_id")
            .distinct()
            .count()
        )

        print(f"\nüìä Current Business Metrics:")
        print(f"  Total Orders: {overall['total_orders']:,}")
        print(f"  Total Revenue: ${overall['total_revenue']:,.2f}")
        print(f"  Avg Order Value: ${overall['overall_avg_order_value']:.2f}")
        print(f"  Unique Customers: {total_unique_customers}")
        print(f"  Total Discounts: ${overall['total_discounts_given']:,.2f}")

        # Top performing categories: revenue summed per category, highest first.
        print("\nüèÜ Top Performing Categories:")
        display(
            spark.table(gold_brand_category_table)
            .groupBy("category")
            .agg(sum("total_revenue").alias("category_revenue"))
            .orderBy(desc("category_revenue"))
        )

except Exception as e:
    print(f"Dashboard not ready yet: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Stop All Gold Streams (Run when done)

# COMMAND ----------

# Set STOP_GOLD_STREAMS to True and run this cell to stop the streams.
# The shutdown code is guarded by a flag rather than wrapped in a string
# literal, so a normal "Run All" neither stops anything nor echoes the
# disabled code as cell output.
# Note: spark.streams.active returns every active query in this Spark session.
STOP_GOLD_STREAMS = False

if STOP_GOLD_STREAMS:
    print("=" * 70)
    print("STOPPING GOLD LAYER STREAMS")
    print("=" * 70)

    for stream in spark.streams.active:
        print(f"\nStopping: {stream.id}")
        stream.stop()
        print(f"  ‚úì Stopped")

    print("\n‚úì All Gold streams stopped")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary

# COMMAND ----------

print("\n" + "="*70)
print("‚úì GOLD LAYER STREAMING SETUP COMPLETE")
print("="*70)

try:
    print(f"\nStreaming Gold Tables Created:")
    
    # (table name, human-readable label) for each Gold table.
    tables_info = [
        (gold_brand_category_table, "Sales by Brand/Category"),
        (gold_location_table, "Location Performance"),
        (gold_product_table, "Product Performance"),
        (gold_customer_table, "Customer Insights"),
        (gold_daily_summary_table, "Daily Summary")
    ]
    
    # The loop variables are deliberately NOT named `desc` / `count`: those
    # names come from `pyspark.sql.functions` (star import), and rebinding
    # them here would break every earlier cell on re-run.
    for table_name, table_label in tables_info:
        try:
            record_count = spark.table(table_name).count()
            print(f"  ‚úì {table_label}: {record_count} records")
        except Exception:
            # Best effort: a table that cannot be read yet is reported as
            # still processing. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit working.
            print(f"  ‚è≥ {table_label}: Processing...")
    
    # Snapshot once so the count and the listing describe the same queries.
    active_streams = list(spark.streams.active)

    print(f"\nStreaming Status:")
    print(f"  Active Queries: {len(active_streams)}")
    
    for stream in active_streams:
        print(f"    - {stream.id}: {stream.status['message']}")
    
    print(f"\nData Flow:")
    print(f"  Silver Order Details (streaming)")
    print(f"    ‚Üì Real-time aggregations")
    print(f"  Gold Business Metrics")
    print(f"    ‚Ä¢ Brand/Category sales")
    print(f"    ‚Ä¢ Location performance")
    print(f"    ‚Ä¢ Product analytics")
    print(f"    ‚Ä¢ Customer segmentation")
    print(f"    ‚Ä¢ Daily summaries")
    
    print(f"\nKey Features:")
    print(f"  ‚Ä¢ Watermarks for late data (10 min)")
    print(f"  ‚Ä¢ Complete mode for full aggregations")
    print(f"  ‚Ä¢ Append mode for time windows")
    # The customer table is written in complete mode, not with MERGE.
    print(f"  ‚Ä¢ Complete-mode overwrite for customer insights (no MERGE)")
    print(f"  ‚Ä¢ Real-time business intelligence")
    
    print("\n" + "="*70)
    print("Next Steps:")
    print("  1. Monitor streams with 'Monitor' cell")
    print("  2. Query tables for real-time analytics")
    print("  3. Build dashboards using Gold tables")
    print("  4. Stop streams when done")
    print("="*70)
    
except Exception as e:
    print(f"\n‚ö†Ô∏è  Some tables not ready yet: {e}")