In [0]:
%sql
-- Return every column of biju_bronze.orders.
SELECT *
FROM `na-dbxtraining`.biju_bronze.orders

In [0]:
%sql
-- Return every column of biju_bronze.products.
SELECT *
FROM `na-dbxtraining`.biju_bronze.products

In [0]:
%sql
-- Return every column of biju_silver.order_details.
SELECT *
FROM `na-dbxtraining`.biju_silver.order_details

In [0]:
%sql
-- Return every column of biju_gold.daily_summary.
SELECT *
FROM `na-dbxtraining`.biju_gold.daily_summary

In [0]:
%sql
-- Per-day order metrics for each (location, category) pair, computed from
-- biju_silver.order_details using 1-day tumbling windows on order_timestamp.
SELECT
    window.start                          AS day,
    location,
    category,
    count(*)                              AS order_count,
    round(sum(total_amount), 2)           AS daily_revenue,
    round(avg(total_amount), 2)           AS avg_order_value,
    sum(quantity)                         AS items_sold,
    approx_count_distinct(customer_id)    AS unique_customers,
    approx_count_distinct(product_name)   AS unique_products,
    round(max(total_amount), 2)           AS highest_order,
    round(min(total_amount), 2)           AS lowest_order,
    current_timestamp()                   AS gold_timestamp
FROM `na-dbxtraining`.biju_silver.order_details
-- No watermark is declared in this statement: it is a plain query over the
-- table, and the window() grouping below only buckets rows by day.
GROUP BY
    window(order_timestamp, '1 day'),
    location,
    category

In [0]:
%sql
-- Same per-day (location, category) aggregation over biju_silver.order_details,
-- with an explicit watermark declared on order_timestamp.
SELECT 
    window.start AS day,
    location,
    category,
    COUNT(*) AS order_count,
    ROUND(SUM(total_amount), 2) AS daily_revenue,
    ROUND(AVG(total_amount), 2) AS avg_order_value,
    SUM(quantity) AS items_sold,
    APPROX_COUNT_DISTINCT(customer_id) AS unique_customers,
    APPROX_COUNT_DISTINCT(product_name) AS unique_products,
    ROUND(MAX(total_amount), 2) AS highest_order,
    ROUND(MIN(total_amount), 2) AS lowest_order,
    CURRENT_TIMESTAMP() AS gold_timestamp
FROM `na-dbxtraining`.biju_silver.order_details
-- Declare a watermark on order_timestamp with a 1-hour delay.
-- NOTE(review): Databricks documents the WATERMARK clause for stateful
-- streaming relations; this FROM reads the table without STREAM(...), so
-- confirm the clause has the intended effect in this cell.
WATERMARK order_timestamp DELAY OF INTERVAL 1 HOUR
GROUP BY 
    WINDOW(order_timestamp, '1 day'),
    location,
    category;

In [0]:
# Fully qualified name of the silver order-details table; the catalog name
# contains a hyphen, hence the backticks.
silver_table = "`na-dbxtraining`.biju_silver.order_details"

In [0]:
# Load the silver table as a Delta source. Despite the variable name this goes
# through `spark.read`, so the result is a batch (non-streaming) DataFrame.
silver_reader = spark.read.format("delta")
silver_stream = silver_reader.table(silver_table)


In [0]:
%run /Workspace/Users/biju.thottathil@3cloudsolutions.com/training/databricksinternaldemo/eventhub-ecommerce-eventhub-stream/02config

In [0]:
print("\n" + "="*70)
print("STARTING: Daily Summary Stream")
print("="*70)

# The star imports are kept deliberately: later cells in this notebook call
# window/col/count/sum/... unqualified and rely on these names being in scope.
# Note that they shadow Python's builtin sum/max/min/round from here on.
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Aggregate silver orders into one row per (day, location, category).
daily_summary_stream = (
    silver_stream
    # A watermark is ignored on a batch DataFrame; it is retained so the same
    # transformation remains valid if `silver_stream` is switched to readStream.
    .withWatermark("order_timestamp", "1 hour")
    .groupBy(
        window(col("order_timestamp"), "1 day").alias("day_window"),
        "location",
        "category"
    )
    .agg(
        count("*").alias("order_count"),
        sum("total_amount").alias("daily_revenue"),
        avg("total_amount").alias("avg_order_value"),
        sum("quantity").alias("items_sold"),
        approx_count_distinct("customer_id").alias("unique_customers"),
        approx_count_distinct("product_name").alias("unique_products"),
        max("total_amount").alias("highest_order"),
        min("total_amount").alias("lowest_order")
    )
    .select(
        # Expose the window start as the day key and round the money columns.
        col("day_window.start").alias("day"),
        "location",
        "category",
        "order_count",
        round(col("daily_revenue"), 2).alias("daily_revenue"),
        round(col("avg_order_value"), 2).alias("avg_order_value"),
        "items_sold",
        "unique_customers",
        "unique_products",
        round(col("highest_order"), 2).alias("highest_order"),
        round(col("lowest_order"), 2).alias("lowest_order"),
        current_timestamp().alias("gold_timestamp")
    )
)
display(daily_summary_stream)

# `silver_stream` comes from `spark.read`, so this is a batch write.
# Fixes relative to the previous version of this cell:
#   * the method is `saveAsTable` (`saveasTable` raises AttributeError);
#   * `saveAsTable` returns None, so there is no query handle or `.id` to print;
#   * an explicit "overwrite" mode is set, because the default mode fails when
#     the gold table already exists.
(
    daily_summary_stream
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(gold_daily_summary_table)
)
print(f"✓ Daily summary written to {gold_daily_summary_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Streams


In [0]:
# Qualify Spark aggregates through `F` so this cell never falls back to
# Python's builtin `sum` (which would raise a TypeError that the handler below
# would then misreport as the table not being ready).
from pyspark.sql import functions as F

try:
    # Report the size of the gold table, then revenue and order totals per day.
    daily_df = spark.table(gold_daily_summary_table)
    print(f"\n5. Daily Summary: {daily_df.count():,} records")

    print("\nDaily Revenue:")
    display(
        daily_df
        .groupBy("day")
        .agg(
            F.sum("daily_revenue").alias("revenue"),
            F.sum("order_count").alias("orders")
        )
        .orderBy("day")
    )

except Exception as e:
    # Best-effort check: any failure is reported without stopping the notebook.
    print(f"\n5. Not ready: {e}")

In [0]:
# Batch rebuild of the daily summary gold table.
#
# Spark SQL functions are referenced through `F` so this cell is self-contained:
# unqualified sum/max/min/round would silently resolve to Python's builtins
# unless a star import from another cell had already been executed.
from pyspark.sql import functions as F

# 1. Read the silver table as a static DataFrame (no watermark needed) and
#    aggregate it into one row per (day, location, category).
daily_summary_batch = (
    spark.table(silver_table)
    .groupBy(
        F.window(F.col("order_timestamp"), "1 day").alias("day_window"),
        "location",
        "category"
    )
    .agg(
        F.count("*").alias("order_count"),
        F.sum("total_amount").alias("daily_revenue"),
        F.avg("total_amount").alias("avg_order_value"),
        F.sum("quantity").alias("items_sold"),
        F.approx_count_distinct("customer_id").alias("unique_customers"),
        F.approx_count_distinct("product_name").alias("unique_products"),
        F.max("total_amount").alias("highest_order"),
        F.min("total_amount").alias("lowest_order")
    )
    .select(
        # Expose the window start as the day key and round the money columns.
        F.col("day_window.start").alias("day"),
        "location",
        "category",
        "order_count",
        F.round(F.col("daily_revenue"), 2).alias("daily_revenue"),
        F.round(F.col("avg_order_value"), 2).alias("avg_order_value"),
        "items_sold",
        "unique_customers",
        "unique_products",
        F.round(F.col("highest_order"), 2).alias("highest_order"),
        F.round(F.col("lowest_order"), 2).alias("lowest_order"),
        F.current_timestamp().alias("gold_timestamp")
    )
)

# 2. Batch write: replace the gold table contents on every run.
(
    daily_summary_batch
    .write
    .format("delta")
    .mode("overwrite")  # full refresh; switch to "append" to add rows instead
    .option("mergeSchema", "true")
    .saveAsTable(gold_daily_summary_table)
)

print(f"✓ Batch processing complete. Data written to {gold_daily_summary_table}")