In [0]:
%run ./02config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Gold Layer - Streaming Business Aggregations



# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# Print the silver source and the five gold target tables, numbered in the
# order this notebook writes them.
_config_rule = "="*70
print(_config_rule, "GOLD LAYER CONFIGURATION", _config_rule, sep="\n")
print(f"Source: {silver_table}")
print(f"\nTargets:")
_gold_targets = (
    gold_brand_category_table,
    gold_location_table,
    gold_product_table,
    gold_customer_table,
    gold_daily_summary_table,
)
for _target_no, _target_name in enumerate(_gold_targets, start=1):
    print(f"  {_target_no}. {_target_name}")
print(_config_rule)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Silver Stream

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "READING SILVER STREAM", "="*70, sep="\n")

# Open the silver Delta table as a streaming source; every gold aggregation
# below is derived from this single streaming DataFrame.
_silver_reader = spark.readStream.format("delta")
silver_stream = _silver_reader.table(silver_table)

print(f"‚úì Silver stream configured: {silver_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 1: Sales by Brand and Category

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "STARTING: Brand/Category Stream", "="*70, sep="\n")

# Metrics computed for each (brand, category) group.
_brand_category_metrics = [
    count("*").alias("order_count"),
    sum("quantity").alias("total_quantity"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
    min("total_amount").alias("min_order_value"),
    max("total_amount").alias("max_order_value"),
    approx_count_distinct("customer_id").alias("unique_customers"),
    sum("discount_amount").alias("total_discounts"),
]

_brand_category_grouped = (
    silver_stream
    .groupBy("brand", "category")
    .agg(*_brand_category_metrics)
)

# Final projection: monetary columns are rounded to 2 decimals in place and
# a load timestamp is appended.
sales_by_brand_category_stream = _brand_category_grouped.select(
    "brand",
    "category",
    "order_count",
    "total_quantity",
    *[
        round(col(money_col), 2).alias(money_col)
        for money_col in (
            "total_revenue",
            "avg_order_value",
            "min_order_value",
            "max_order_value",
        )
    ],
    "unique_customers",
    round(col("total_discounts"), 2).alias("total_discounts"),
    current_timestamp().alias("gold_timestamp"),
)

# Delta sink in "complete" output mode. The availableNow trigger is used here;
# to run on a fixed interval instead, swap in
# .trigger(processingTime="20 seconds").
_brand_category_writer = (
    sales_by_brand_category_stream.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_brand_checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
)
brand_category_query = _brand_category_writer.toTable(gold_brand_category_table)

print(f"‚úì Stream started: {brand_category_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 2: Location Performance

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "STARTING: Location Performance Stream", "="*70, sep="\n")


def _orders_with_status(status_value):
    """Return an aggregate counting rows whose order_status equals status_value."""
    return sum(when(col("order_status") == status_value, 1).otherwise(0))


# Per-location aggregates, including delivered/pending order tallies.
_location_grouped = silver_stream.groupBy("location").agg(
    count("*").alias("order_count"),
    sum("total_amount").alias("total_revenue"),
    avg("total_amount").alias("avg_order_value"),
    sum("quantity").alias("total_items_sold"),
    approx_count_distinct("customer_id").alias("unique_customers"),
    approx_count_distinct("product_name").alias("unique_products"),
    approx_count_distinct("brand").alias("unique_brands"),
    _orders_with_status("delivered").alias("delivered_orders"),
    _orders_with_status("pending").alias("pending_orders"),
)

# Share of orders delivered, as a percentage rounded to 2 decimals.
_delivery_rate_pct = round(
    (col("delivered_orders") / col("order_count") * 100), 2
).alias("delivery_rate_pct")

location_performance_stream = _location_grouped.select(
    "location",
    "order_count",
    round(col("total_revenue"), 2).alias("total_revenue"),
    round(col("avg_order_value"), 2).alias("avg_order_value"),
    "total_items_sold",
    "unique_customers",
    "unique_products",
    "unique_brands",
    "delivered_orders",
    "pending_orders",
    _delivery_rate_pct,
    current_timestamp().alias("gold_timestamp"),
)

# Delta sink in "complete" output mode. The availableNow trigger is used here;
# to run on a fixed interval instead, swap in
# .trigger(processingTime="20 seconds").
_location_writer = (
    location_performance_stream.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_location_checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
)
location_query = _location_writer.toTable(gold_location_table)

print(f"‚úì Stream started: {location_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 3: Product Performance

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "STARTING: Product Performance Stream", "="*70, sep="\n")


def _product_round2(column_name):
    """Return column_name rounded to 2 decimals, keeping the same column name."""
    return round(col(column_name), 2).alias(column_name)


# One output row per (product_name, brand, category, base_price) combination.
_product_keys = ("product_name", "brand", "category", "base_price")

_product_grouped = silver_stream.groupBy(*_product_keys).agg(
    count("*").alias("order_count"),
    sum("quantity").alias("total_quantity_sold"),
    sum("total_amount").alias("total_revenue"),
    avg("unit_price").alias("avg_selling_price"),
    min("unit_price").alias("min_selling_price"),
    max("unit_price").alias("max_selling_price"),
    approx_count_distinct("customer_id").alias("unique_customers"),
    approx_count_distinct("location").alias("locations_sold"),
    avg("discount_pct").alias("avg_discount_pct"),
    sum("discount_amount").alias("total_discount_amount"),
)

# Revenue divided by units sold, computed from the unrounded aggregates.
_revenue_per_unit = round(
    (col("total_revenue") / col("total_quantity_sold")), 2
).alias("revenue_per_unit")

product_performance_stream = _product_grouped.select(
    *_product_keys,
    "order_count",
    "total_quantity_sold",
    _product_round2("total_revenue"),
    _product_round2("avg_selling_price"),
    _product_round2("min_selling_price"),
    _product_round2("max_selling_price"),
    "unique_customers",
    "locations_sold",
    # The averaged discount_pct is multiplied by 100 before rounding.
    round(col("avg_discount_pct") * 100, 2).alias("avg_discount_pct"),
    _product_round2("total_discount_amount"),
    _revenue_per_unit,
    current_timestamp().alias("gold_timestamp"),
)

# Delta sink in "complete" output mode. The availableNow trigger is used here;
# to run on a fixed interval instead, swap in
# .trigger(processingTime="20 seconds").
_product_writer = (
    product_performance_stream.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_product_checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
)
product_query = _product_writer.toTable(gold_product_table)

print(f"‚úì Stream started: {product_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 4: Customer Insights

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "STARTING: Customer Insights Stream", "="*70, sep="\n")

# Only rows with both a customer id and an order timestamp are aggregated.
_has_customer_id = col("customer_id").isNotNull()
_has_order_timestamp = col("order_timestamp").isNotNull()
_customer_orders = silver_stream.filter(_has_customer_id & _has_order_timestamp)

# One output row per (customer_id, customer_name, location).
_customer_grouped = (
    _customer_orders
    .groupBy("customer_id", "customer_name", "location")
    .agg(
        count("*").alias("order_count"),
        sum("total_amount").alias("total_spent"),
        avg("total_amount").alias("avg_order_value"),
        sum("quantity").alias("total_items_purchased"),
        min("order_timestamp").alias("first_order_date"),
        max("order_timestamp").alias("last_order_date"),
        approx_count_distinct("product_name").alias("unique_products_bought"),
        approx_count_distinct("brand").alias("unique_brands"),
        approx_count_distinct("category").alias("unique_categories"),
        # A missing discount_pct counts as 0 in the average.
        avg(coalesce(col("discount_pct"), lit(0))).alias("avg_discount_rate"),
    )
)

# Days between first and last order; 0 when either bound is missing.
_both_dates_present = (
    col("first_order_date").isNotNull() & col("last_order_date").isNotNull()
)
_days_as_customer = when(
    _both_dates_present,
    datediff(col("last_order_date"), col("first_order_date")),
).otherwise(0)

# Spend tiers: >= 5000 is High, >= 2000 is Medium, everything else is Low.
_customer_segment = (
    when(col("total_spent") >= 5000, "High Value")
    .when(col("total_spent") >= 2000, "Medium Value")
    .otherwise("Low Value")
)

_customer_enriched = (
    _customer_grouped
    .withColumn("days_as_customer", _days_as_customer)
    .withColumn("customer_segment", _customer_segment)
)

customer_insights_stream = _customer_enriched.select(
    "customer_id",
    "customer_name",
    "location",
    "order_count",
    round(col("total_spent"), 2).alias("total_spent"),
    round(col("avg_order_value"), 2).alias("avg_order_value"),
    "total_items_purchased",
    "first_order_date",
    "last_order_date",
    "days_as_customer",
    "unique_products_bought",
    "unique_brands",
    "unique_categories",
    # The averaged rate is multiplied by 100 and published as avg_discount_pct.
    round(col("avg_discount_rate") * 100, 2).alias("avg_discount_pct"),
    "customer_segment",
    current_timestamp().alias("gold_timestamp"),
)

# Delta sink in "complete" output mode. The availableNow trigger is used here;
# to run on a fixed interval instead, swap in
# .trigger(processingTime="30 seconds").
_customer_writer = (
    customer_insights_stream.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", gold_customer_checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
)
customer_query = _customer_writer.toTable(gold_customer_table)

print(f"‚úì Stream started: {customer_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Gold Stream 5: Daily Summary

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "STARTING: Daily Summary Stream", "="*70, sep="\n")

# 1-day window over order_timestamp, with a 1-hour watermark on that column.
_day_window = window(col("order_timestamp"), "1 day").alias("day_window")
_watermarked_orders = silver_stream.withWatermark("order_timestamp", "1 hour")

# One output row per (day window, location, category).
_daily_grouped = (
    _watermarked_orders
    .groupBy(_day_window, "location", "category")
    .agg(
        count("*").alias("order_count"),
        sum("total_amount").alias("daily_revenue"),
        avg("total_amount").alias("avg_order_value"),
        sum("quantity").alias("items_sold"),
        approx_count_distinct("customer_id").alias("unique_customers"),
        approx_count_distinct("product_name").alias("unique_products"),
        max("total_amount").alias("highest_order"),
        min("total_amount").alias("lowest_order"),
    )
)


def _daily_round2(column_name):
    """Return column_name rounded to 2 decimals, keeping the same column name."""
    return round(col(column_name), 2).alias(column_name)


daily_summary_stream = _daily_grouped.select(
    # The window's start timestamp is published as the "day" column.
    col("day_window.start").alias("day"),
    "location",
    "category",
    "order_count",
    _daily_round2("daily_revenue"),
    _daily_round2("avg_order_value"),
    "items_sold",
    "unique_customers",
    "unique_products",
    _daily_round2("highest_order"),
    _daily_round2("lowest_order"),
    current_timestamp().alias("gold_timestamp"),
)

# Delta sink in "append" output mode (the other gold streams use "complete").
# The availableNow trigger is used here; to run on a fixed interval instead,
# swap in .trigger(processingTime="30 seconds").
_daily_writer = (
    daily_summary_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", gold_daily_checkpoint)
    .option("mergeSchema", "true")
    .trigger(availableNow=True)
)
daily_query = _daily_writer.toTable(gold_daily_summary_table)

print(f"‚úì Stream started: {daily_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Streams

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "ACTIVE GOLD STREAMS", "="*70, sep="\n")

# Report id, status message and liveness for every active streaming query,
# plus the latest progress entry when one has been recorded.
for query in spark.streams.active:
    print(f"\nQuery ID: {query.id}")
    print(f"  Status: {query.status['message']}")
    print(f"  Active: {query.isActive}")

    # No progress recorded yet: nothing more to show for this query.
    if not query.recentProgress:
        continue

    last_progress = query.recentProgress[-1]
    batch_id = last_progress.get('batchId', 'N/A')
    input_rows = last_progress.get('numInputRows', 0)
    rows_per_second = last_progress.get('processedRowsPerSecond', 0)
    print(f"  Batch: {batch_id}")
    print(f"  Input Rows: {input_rows}")
    print(f"  Rate: {rows_per_second:.2f} rows/sec")

print(f"\nTotal Active: {len(spark.streams.active)}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Tables

# COMMAND ----------

import time

# Polling schedule: 12 polls, 5 seconds apart (60 seconds in total).
_MONITOR_POLLS = 12
_MONITOR_INTERVAL_S = 5

# (label, table name) pairs, in the order they are reported on each poll.
_monitored_gold_tables = [
    ("Brand/Category", gold_brand_category_table),
    ("Locations", gold_location_table),
    ("Products", gold_product_table),
    ("Customers", gold_customer_table),
    ("Daily Summary", gold_daily_summary_table),
]

print("\n" + "="*70)
print(f"MONITORING ({_MONITOR_POLLS * _MONITOR_INTERVAL_S} seconds)")
print("="*70)

for _poll_no in range(1, _MONITOR_POLLS + 1):
    time.sleep(_MONITOR_INTERVAL_S)
    print(f"\n‚è±Ô∏è  {_poll_no * _MONITOR_INTERVAL_S}s:")

    for _table_label, _table_name in _monitored_gold_tables:
        # A table that cannot be read yet is reported as "waiting...".
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt (cancelling the cell) still stops the loop.
        try:
            _row_total = spark.table(_table_name).count()
        except Exception:
            print(f"    {_table_label}: waiting...")
        else:
            print(f"    {_table_label}: {_row_total:,}")

print("\n‚úì Monitoring complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Gold Tables

# COMMAND ----------

# Section banner.
print("\n" + "="*70, "GOLD VERIFICATION", "="*70, sep="\n")

# 1. Brand/Category: record count, then the 10 highest-revenue rows.
try:
    brand_cat_df = spark.table(gold_brand_category_table)
    _brand_cat_records = brand_cat_df.count()
    print(f"\n1. Brand/Category: {_brand_cat_records:,} records")
    _brand_cat_top10 = brand_cat_df.orderBy(col("total_revenue").desc()).limit(10)
    display(_brand_cat_top10)
except Exception as err:
    # Any failure (e.g. the table has not been written yet) is reported
    # instead of aborting the notebook.
    print(f"\n1. Not ready: {err}")

# COMMAND ----------

# 2. Location: record count, then every row ordered by revenue (highest first).
try:
    location_df = spark.table(gold_location_table)
    _location_records = location_df.count()
    print(f"\n2. Locations: {_location_records:,} records")
    _locations_by_revenue = location_df.orderBy(col("total_revenue").desc())
    display(_locations_by_revenue)
except Exception as err:
    # Any failure is reported instead of aborting the notebook.
    print(f"\n2. Not ready: {err}")

# COMMAND ----------

# 3. Products: record count, then the 10 highest-revenue rows.
try:
    product_df = spark.table(gold_product_table)
    _product_records = product_df.count()
    print(f"\n3. Products: {_product_records:,} records")
    _product_top10 = product_df.orderBy(col("total_revenue").desc()).limit(10)
    display(_product_top10)
except Exception as err:
    # Any failure is reported instead of aborting the notebook.
    print(f"\n3. Not ready: {err}")

# COMMAND ----------

# 4. Customers: record count, a per-segment rollup, then the top 10 spenders.
try:
    customer_df = spark.table(gold_customer_table)
    _customer_records = customer_df.count()
    print(f"\n4. Customers: {_customer_records:,} records")

    print("\nBy Segment:")
    # Rows, summed spend and mean order count per segment, highest revenue first.
    _segment_rollup = customer_df.groupBy("customer_segment").agg(
        count("*").alias("customers"),
        sum("total_spent").alias("revenue"),
        avg("order_count").alias("avg_orders"),
    )
    display(_segment_rollup.orderBy(col("revenue").desc()))

    print("\nTop 10:")
    _top_spenders = customer_df.orderBy(col("total_spent").desc()).limit(10)
    display(_top_spenders)

except Exception as err:
    # Any failure is reported instead of aborting the notebook.
    print(f"\n4. Not ready: {err}")

# COMMAND ----------

# 5. Daily Summary: record count, then revenue and orders rolled up per day.
try:
    daily_df = spark.table(gold_daily_summary_table)
    _daily_records = daily_df.count()
    print(f"\n5. Daily Summary: {_daily_records:,} records")

    print("\nDaily Revenue:")
    # Collapse the location/category rows into one row per day, oldest first.
    _per_day_totals = daily_df.groupBy("day").agg(
        sum("daily_revenue").alias("revenue"),
        sum("order_count").alias("orders"),
    )
    display(_per_day_totals.orderBy("day"))

except Exception as err:
    # Any failure is reported instead of aborting the notebook.
    print(f"\n5. Not ready: {err}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Business Dashboard

# COMMAND ----------

print("\n" + "="*70)
print("BUSINESS METRICS")
print("="*70)

# Headline metrics derived from the brand/category gold table, followed by a
# revenue ranking of categories. Any failure is reported as "Not ready".
try:
    # avg_order is total revenue divided by total orders. Averaging the
    # per-group avg_order_value column would weight a group with one order the
    # same as a group with thousands. NULLIF guards against division by zero.
    overall = spark.sql(f"""
        SELECT 
            SUM(order_count) as orders,
            SUM(total_revenue) as revenue,
            SUM(total_revenue) / NULLIF(SUM(order_count), 0) as avg_order,
            SUM(unique_customers) as customers,
            SUM(total_discounts) as discounts
        FROM {gold_brand_category_table}
    """).first()

    # With no rows in the table the SUMs are NULL; report that directly
    # instead of failing inside the numeric format specs below.
    if overall is None or overall['orders'] is None:
        print(f"Not ready: no rows in {gold_brand_category_table} yet")
    else:
        print(f"\nüìä Current Metrics:")
        print(f"  Orders: {overall['orders']:,}")
        print(f"  Revenue: ${overall['revenue']:,.2f}")
        print(f"  Avg Order: ${overall['avg_order']:.2f}")
        # unique_customers is counted per brand/category group, so this sum
        # counts a customer once for every group they bought from; the label
        # says so rather than presenting it as a distinct-customer total.
        print(f"  Customers (summed per brand/category, may double-count): {overall['customers']}")
        print(f"  Discounts: ${overall['discounts']:,.2f}")

        print("\nüèÜ Top Categories:")
        display(
            spark.table(gold_brand_category_table)
            .groupBy("category")
            .agg(sum("total_revenue").alias("revenue"))
            .orderBy(desc("revenue"))
        )

except Exception as e:
    print(f"Not ready: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Stop Streams

# COMMAND ----------

# Stops every active streaming query on this Spark session.
# The loop is wrapped in a triple-quoted string so it does NOT execute when
# the cell runs; remove the surrounding triple quotes to actually stop the
# streams.
"""
for stream in spark.streams.active:
    print(f"Stopping: {stream.id}")
    stream.stop()
print("‚úì All streams stopped")
"""

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary

# COMMAND ----------

print("\n" + "="*70)
print("GOLD LAYER COMPLETE")
print("="*70)

# Final report: row count of each gold table, then the ids of any streaming
# queries that are still active.
try:
    tables = [
        (gold_brand_category_table, "Brand/Category"),
        (gold_location_table, "Locations"),
        (gold_product_table, "Products"),
        (gold_customer_table, "Customers"),
        (gold_daily_summary_table, "Daily Summary")
    ]

    print(f"\nGold Tables:")
    # The loop variables are deliberately NOT named `desc` / `count`: those
    # names come from `from pyspark.sql.functions import *`, and rebinding
    # them here would break any earlier cell that is re-run afterwards.
    for table_name, table_label in tables:
        try:
            row_total = spark.table(table_name).count()
        except Exception:
            # Catch Exception rather than using a bare except, so cancelling
            # the cell (KeyboardInterrupt) is not swallowed.
            print(f"  ‚è≥ {table_label}: processing...")
        else:
            print(f"  ‚úì {table_label}: {row_total:,}")

    print(f"\nActive Streams: {len(spark.streams.active)}")
    for stream in spark.streams.active:
        print(f"  ‚Ä¢ {stream.id}")

except Exception as e:
    # Include the underlying error so a failure here can be diagnosed.
    print(f"‚ö†Ô∏è  Not ready yet: {e}")