In [0]:
%run ./config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Silver Layer - Streaming Join Orders with Products
# MAGIC Creates the enriched `order_details` table by joining the streaming Bronze orders table with the Bronze products table.

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------

# Unity Catalog configuration: one catalog, one schema per medallion layer.
catalog = "na-dbxtraining"
schema_bronze, schema_silver, schema_gold = "biju_bronze", "biju_silver", "biju_gold"

# Fully qualified table names. The catalog name contains a hyphen, so it is
# wrapped in backticks to stay a valid SQL identifier.
bronze_orders_table = ".".join([f"`{catalog}`", schema_bronze, "orders"])
bronze_products_table = ".".join([f"`{catalog}`", schema_bronze, "products"])
silver_table = ".".join([f"`{catalog}`", schema_silver, "order_details"])

# Checkpoint location: a per-catalog folder (hyphens replaced by underscores)
# under the checkpoints volume, with one sub-folder for the Silver query.
checkpoint_base = "/Volumes/na-dbxtraining/biju_raw/biju_vol/checkpoints/" + catalog.replace("-", "_")
silver_checkpoint = checkpoint_base + "/silver_order_details"

# Echo the effective configuration so a run log shows what was read and written.
print(
    "=" * 70,
    "SILVER LAYER STREAMING CONFIGURATION",
    "=" * 70,
    "Source Tables:",
    f"  Orders: {bronze_orders_table}",
    f"  Products: {bronze_products_table}",
    "Target Table:",
    f"  Silver: {silver_table}",
    f"Checkpoint: {silver_checkpoint}",
    "=" * 70,
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Read Streaming Orders from Bronze

# COMMAND ----------

print("\n" + "=" * 70, "READING BRONZE STREAMS", "=" * 70, sep="\n")

# Open the Bronze orders Delta table as a streaming source, then attach a
# 1-minute watermark on bronze_timestamp to bound how late a row may arrive.
orders_stream = spark.readStream.format("delta").table(bronze_orders_table)
orders_stream = orders_stream.withWatermark("bronze_timestamp", "1 minute")

print(
    "‚úì Orders stream configured",
    f"  Source: {bronze_orders_table}",
    "  Watermark: 1 minute on bronze_timestamp",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Read Products (Static or Streaming)

# COMMAND ----------

# Option 1 (active): load products as a STATIC batch DataFrame. Recommended for a
# small, slowly changing dimension, since products do not change frequently.
products_reader = spark.read.format("delta")
products_df = products_reader.table(bronze_products_table)

print("\n‚úì Products loaded as static dimension")
print("  Source: {}".format(bronze_products_table))
# count() is a Spark action: it scans the products table just to report its size.
print("  Records: {}".format(products_df.count()))

# Option 2 (disabled): streaming products. Kept as a string literal so it does
# not execute; move it out of the quotes to enable it.
"""
products_stream = (spark.readStream
    .format("delta")
    .table(bronze_products_table)
    .withWatermark("bronze_timestamp", "1 minute")
)
print(f"\n‚úì Products stream configured")
"""

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Streaming Join and Enrichment

# COMMAND ----------

print("\n" + "="*70)
print("CONFIGURING STREAMING JOIN")
print("="*70)

# Stream-to-static LEFT JOIN: every streaming order is kept and enriched with
# product attributes when a matching product_id exists in products_df.
# (For a stream-to-stream join, use products_stream with watermarks on both sides.)
order_details_stream = (orders_stream
    .join(
        products_df,  # Use products_stream for stream-to-stream join
        on="product_id",
        how="left"
    )
    .select(
        # Order fields
        orders_stream.order_id,
        orders_stream.customer_id,
        orders_stream.customer_name,
        orders_stream.location,
        orders_stream.order_status,
        orders_stream.payment_method,
        orders_stream.quantity,
        orders_stream.discount_pct,
        orders_stream.total_amount,

        # Join key. Joining with on="product_id" yields a single product_id
        # column; it is kept so that orders without a product match can still
        # be traced to the id that failed to resolve. (Previously only an
        # aliased copy was selected and then dropped, so the key never
        # reached the Silver table.)
        col("product_id"),

        # Prefer the pre-parsed timestamp when present; otherwise parse the
        # raw order_timestamp value.
        when(col("order_timestamp_parsed").isNotNull(),
             col("order_timestamp_parsed"))
        .otherwise(to_timestamp(orders_stream.order_timestamp))
        .alias("order_timestamp"),

        # Product fields (NULL when the left join finds no match)
        products_df.product_name,
        products_df.category,
        products_df.brand,
        products_df.base_price,
        products_df.unit_price,

        # Metadata
        orders_stream.event_time,
        orders_stream.kafka_offset,
        orders_stream.partition_id,
        orders_stream.bronze_timestamp
    )
    # Calendar fields derived from the resolved order timestamp
    .withColumn("order_date", to_date(col("order_timestamp")))
    .withColumn("order_hour", hour(col("order_timestamp")))
    .withColumn("day_of_week", dayofweek(col("order_timestamp")))
    # Undiscounted line value
    .withColumn("line_total", col("quantity") * col("unit_price"))
    # NOTE(review): computed as total_amount minus the undiscounted line value,
    # so this is negative whenever total_amount is below quantity * unit_price.
    # Confirm the intended sign convention with downstream consumers.
    .withColumn("discount_amount",
                col("total_amount") - (col("quantity") * col("unit_price")))
    # Processing time at which the row passed through the Silver transformation
    .withColumn("silver_timestamp", current_timestamp())
)

print("‚úì Streaming join configured")
print("  Join Type: LEFT JOIN")
print("  Join Key: product_id")
print("  Mode: Stream-to-Static")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Add Data Quality Checks

# COMMAND ----------

# Add data quality flags and a 0-3 score (one point per passing check).
#
# A comparison against a NULL quantity / total_amount evaluates to NULL, which
# would make the flag NULL and, through the addition below, the whole score
# NULL. Aggregates such as AVG skip NULLs, so exactly the worst rows would
# vanish from quality metrics. coalesce(..., False) makes a missing value count
# as a failed check instead.
order_details_stream_with_quality = (order_details_stream
    # True when the left join found a matching product
    .withColumn("has_product_info", col("product_name").isNotNull())
    .withColumn("is_valid_quantity", coalesce(col("quantity") > 0, lit(False)))
    .withColumn("is_valid_amount", coalesce(col("total_amount") > 0, lit(False)))
    .withColumn("data_quality_score",
                (col("has_product_info").cast("int") +
                 col("is_valid_quantity").cast("int") +
                 col("is_valid_amount").cast("int")))
)

print("‚úì Data quality checks added")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Write to Silver Table (Streaming)

# COMMAND ----------

print("\n" + "="*70)
print("STARTING SILVER LAYER STREAM")
print("="*70)

# Trigger settings live in one place so the query and the status message below
# cannot drift apart (the message previously claimed "15 seconds" while the
# query actually ran with once=True).
#   {"once": True}                      -> process what is available, then stop (testing)
#   {"processingTime": "15 seconds"}    -> continuous micro-batches every 15 seconds
# NOTE(review): newer Spark/Databricks runtimes recommend availableNow=True
# over once=True - confirm against the target runtime before switching.
silver_trigger = {"once": True}

# Append the enriched, quality-flagged rows to the Silver Delta table.
# mergeSchema lets the sink accept columns that are new relative to the
# existing table; progress is tracked in the checkpoint location.
silver_query = (order_details_stream_with_quality
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_checkpoint)
    .option("mergeSchema", "true")
    .trigger(**silver_trigger)
    .toTable(silver_table)
)

trigger_label = ", ".join(f"{key}={value}" for key, value in silver_trigger.items())

print(f"‚úì Silver stream started")
print(f"  Query ID: {silver_query.id}")
print(f"  Target: {silver_table}")
print(f"  Checkpoint: {silver_checkpoint}")
print(f"  Trigger: {trigger_label}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Streaming Query

# COMMAND ----------

print("\n" + "=" * 70, "ACTIVE STREAMING QUERIES", "=" * 70, sep="\n")

# Report identity and status for every streaming query active on this session,
# plus the most recent progress record when one exists.
for active_query in spark.streams.active:
    print(f"\nQuery ID: {active_query.id}")
    print(f"  Name: {active_query.name or 'unnamed'}")
    print(f"  Status: {active_query.status['message']}")
    print(f"  Is Active: {active_query.isActive}")

    if not active_query.recentProgress:
        # No micro-batch has reported progress yet.
        continue

    last_progress = active_query.recentProgress[-1]
    batch_id = last_progress.get('batchId', 'N/A')
    input_rows = last_progress.get('numInputRows', 0)
    rows_per_second = last_progress.get('processedRowsPerSecond', 0)
    print(f"  Recent Progress:")
    print(f"    - Batch: {batch_id}")
    print(f"    - Input Rows: {input_rows}")
    print(f"    - Processing Rate: {rows_per_second:.2f} rows/sec")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Wait and Monitor (Run for 30 seconds)

# COMMAND ----------

import time

print("\n" + "="*70)
print("MONITORING SILVER STREAM FOR 30 SECONDS")
print("="*70)

# Poll the Silver table 6 times, 5 seconds apart (30 seconds total).
for i in range(6):
    time.sleep(5)
    elapsed = (i + 1) * 5

    try:
        # One aggregate query provides both the row count and the quality
        # metrics, so the table is scanned once per poll and all numbers come
        # from the same snapshot.
        quality_stats = spark.sql(f"""
            SELECT 
                COUNT(*) as total_records,
                SUM(CASE WHEN has_product_info THEN 1 ELSE 0 END) as with_product_info,
                SUM(CASE WHEN NOT has_product_info THEN 1 ELSE 0 END) as missing_product_info,
                AVG(data_quality_score) as avg_quality_score
            FROM {silver_table}
        """).first()

        silver_count = quality_stats['total_records']
        print(f"\n‚è±Ô∏è  {elapsed}s - Silver records: {silver_count}")

        # SUM and AVG return NULL over an empty table (or when every score is
        # NULL). Formatting None with ':.2f' raises TypeError, which used to be
        # reported as "table not created yet"; handle the empty case explicitly.
        with_info = quality_stats['with_product_info'] or 0
        missing_info = quality_stats['missing_product_info'] or 0
        avg_score = quality_stats['avg_quality_score']
        avg_text = f"{avg_score:.2f}/3.0" if avg_score is not None else "n/a (no scored rows yet)"

        print(f"    - With product info: {with_info}")
        print(f"    - Missing product info: {missing_info}")
        print(f"    - Avg quality score: {avg_text}")

    except Exception as e:
        # Typically the table does not exist until the first batch commits.
        print(f"\n‚è±Ô∏è  {elapsed}s - Silver table not created yet or error: {e}")

print("\n‚úì Monitoring complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Silver Table

# COMMAND ----------

print("\n" + "=" * 70, "SILVER LAYER VERIFICATION", "=" * 70, sep="\n")

# Report the Silver row count, the split of rows with and without product
# information, and the ten most recently processed enriched orders.
try:
    silver_df = spark.table(silver_table)
    total_records = silver_df.count()

    print(f"\nüìä Silver Table: {silver_table}")
    print(f"   Total Records: {total_records}")

    if total_records > 0:
        print("\n   Data Quality Summary:")
        quality_rows = silver_df.groupBy("has_product_info").count().collect()
        for quality_row in quality_rows:
            if quality_row['has_product_info']:
                status = "‚úì Complete"
            else:
                status = "‚ö†Ô∏è Missing Product"
            print(f"     {status}: {quality_row['count']} records")

        print("\n   Latest 10 enriched orders:")
        preview_columns = [
            "order_id", "customer_name", "location", "product_name", "brand",
            "category", "quantity", "unit_price", "total_amount", "order_timestamp",
            "data_quality_score",
        ]
        # Newest rows first, ordered by the time they were written to Silver.
        latest_orders = (
            silver_df
            .select(*preview_columns)
            .orderBy(desc("silver_timestamp"))
            .limit(10)
        )
        display(latest_orders)

except Exception as e:
    print(f"\n‚ö†Ô∏è  Silver table not available: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Real-Time Analysis Queries

# COMMAND ----------

print("\n" + "=" * 70, "REAL-TIME SILVER LAYER ANALYSIS", "=" * 70, sep="\n")

try:
    # Fresh handle on the Silver table; the analysis cells below reuse silver_df.
    silver_df = spark.table(silver_table)

    # 1. Orders by brand: order count, revenue and average order value,
    #    highest revenue first.
    print("\n1. Orders by Brand:")
    brand_metrics = [
        count("*").alias("order_count"),
        sum("total_amount").alias("total_revenue"),
        avg("total_amount").alias("avg_order_value"),
    ]
    orders_by_brand = silver_df.groupBy("brand").agg(*brand_metrics)
    display(orders_by_brand.orderBy(desc("total_revenue")))

except Exception as e:
    print(f"Analysis not available yet: {e}")

# COMMAND ----------

# 2. Orders by Category: order count, revenue and units sold per category,
#    highest revenue first. Relies on silver_df from the previous analysis cell.
try:
    print("\n2. Orders by Category:")
    display(
        silver_df
        .groupBy("category")
        .agg(
            count("*").alias("order_count"),
            sum("total_amount").alias("total_revenue"),
            sum("quantity").alias("total_quantity")
        )
        .orderBy(desc("total_revenue"))
    )
except Exception as e:
    # Best-effort cell: keep the notebook running, but report why the analysis
    # was skipped (e.g. silver_df undefined or the table missing) rather than
    # silently swallowing every error, including KeyboardInterrupt.
    print(f"Orders by Category analysis not available yet: {e}")

# COMMAND ----------

# 3. Top Customers: the 20 customers with the highest total spend, with their
#    order count and average order value. Relies on silver_df from an earlier cell.
try:
    print("\n3. Top Customers by Revenue:")
    display(
        silver_df
        .groupBy("customer_id", "customer_name", "location")
        .agg(
            count("*").alias("order_count"),
            sum("total_amount").alias("total_spent"),
            avg("total_amount").alias("avg_order_value")
        )
        .orderBy(desc("total_spent"))
        .limit(20)
    )
except Exception as e:
    # Best-effort cell: keep the notebook running, but report why the analysis
    # was skipped rather than silently swallowing every error.
    print(f"Top Customers analysis not available yet: {e}")

# COMMAND ----------

# 4. Hourly Order Trends: order count and revenue per order_hour, in hour order.
#    Relies on silver_df from an earlier cell.
try:
    print("\n4. Hourly Order Trends:")
    display(
        silver_df
        .groupBy("order_hour")
        .agg(
            count("*").alias("order_count"),
            sum("total_amount").alias("total_revenue")
        )
        .orderBy("order_hour")
    )
except Exception as e:
    # Best-effort cell: keep the notebook running, but report why the analysis
    # was skipped rather than silently swallowing every error.
    print(f"Hourly Order Trends analysis not available yet: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Create Streaming Dashboard View

# COMMAND ----------

# Session-scoped temp view that pre-aggregates the Silver table by
# date / hour / category / brand / location for dashboard-style queries.
dashboard_view_sql = f"""
CREATE OR REPLACE TEMP VIEW silver_dashboard AS
SELECT 
    order_date,
    order_hour,
    category,
    brand,
    location,
    COUNT(*) as order_count,
    SUM(total_amount) as revenue,
    AVG(total_amount) as avg_order_value,
    SUM(quantity) as total_items_sold,
    MAX(silver_timestamp) as last_updated
FROM {silver_table}
GROUP BY order_date, order_hour, category, brand, location
"""

# Example consumer of the view: today's totals per category and brand,
# highest revenue first.
todays_performance_sql = """
SELECT 
    category,
    brand,
    SUM(order_count) as total_orders,
    SUM(revenue) as total_revenue,
    MAX(last_updated) as last_update
FROM silver_dashboard
WHERE order_date = CURRENT_DATE()
GROUP BY category, brand
ORDER BY total_revenue DESC
"""

spark.sql(dashboard_view_sql)
print("‚úì Created real-time dashboard view: silver_dashboard")

print("\nSample Dashboard Query - Today's Performance:")
display(spark.sql(todays_performance_sql))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Alternative: Stream-to-Stream Join (If Products Also Streaming)

# COMMAND ----------

# Uncomment this section if you want to use stream-to-stream join
#
# The code below is disabled: it sits inside a bare triple-quoted string, which
# Python evaluates as a no-op expression. It sketches a variant in which
# products are read as a second stream (with their own 1-minute watermark on
# bronze_timestamp) and the result is written to a separate "_v2" table and
# checkpoint so that it does not collide with the stream-to-static query.
#
# NOTE(review): the join condition below is an equality on product_id only.
# Spark's Structured Streaming guide lists additional requirements (watermarks
# plus an event-time constraint) for outer stream-stream joins - confirm this
# sketch against that guide before enabling it.
"""
print("="*70)
print("ALTERNATIVE: STREAM-TO-STREAM JOIN")
print("="*70)

# Read products as stream
products_stream = (spark.readStream
    .format("delta")
    .table(bronze_products_table)
    .withWatermark("bronze_timestamp", "1 minute")
)

# Stream-to-Stream join with watermarks
order_details_stream_v2 = (orders_stream
    .join(
        products_stream,
        on=orders_stream.product_id == products_stream.product_id,
        how="left"
    )
    .select(
        orders_stream.order_id,
        orders_stream.customer_id,
        # ... other fields
    )
    # ... rest of transformation
)

# Write stream
silver_query_v2 = (order_details_stream_v2
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", f"{silver_checkpoint}_v2")
    .trigger(processingTime="15 seconds")
    .toTable(f"{silver_table}_v2")
)

print("‚úì Stream-to-stream join configured")
"""

# COMMAND ----------

# MAGIC %md
# MAGIC ## Stop Streams (Run when done)

# COMMAND ----------

# Uncomment to stop the Silver stream(s)
#
# The code below is disabled: it sits inside a bare triple-quoted string, which
# Python evaluates as a no-op expression.
#
# The filter matches the query handle returned by the write cell
# (silver_query.id) or a query name containing "order_details". The previous
# filter looked for the text "silver" inside stream.id, but query ids are UUID
# strings and the Silver query is started without a name, so nothing would
# ever have been stopped.
"""
print("=" * 70)
print("STOPPING SILVER LAYER STREAMS")
print("=" * 70)

for stream in spark.streams.active:
    if stream.id == silver_query.id or "order_details" in str(stream.name).lower():
        print(f"\nStopping: {stream.id}")
        stream.stop()
        print(f"  ‚úì Stopped")

print("\n‚úì Silver streams stopped")
"""

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary

# COMMAND ----------

print("\n" + "=" * 70, "‚úì SILVER LAYER STREAMING SETUP COMPLETE", "=" * 70, sep="\n")

# Final report: row counts for the source and target tables, product-match
# completeness of the Silver rows, active streaming queries, and a short
# description of the data flow with suggested next steps.
try:
    # Current row counts, evaluated in order: Bronze orders, Bronze products, Silver.
    orders_count, products_count, silver_count = (
        spark.table(table_name).count()
        for table_name in (bronze_orders_table, bronze_products_table, silver_table)
    )

    # Product-match completeness of the Silver rows.
    quality_check = spark.sql(f"""
        SELECT 
            COUNT(*) as total,
            SUM(CASE WHEN has_product_info THEN 1 ELSE 0 END) as complete,
            SUM(CASE WHEN NOT has_product_info THEN 1 ELSE 0 END) as incomplete
        FROM {silver_table}
    """).first()

    print("\nCurrent State:")
    print(f"  Bronze Orders: {orders_count} records")
    print(f"  Bronze Products: {products_count} records")
    print(f"  Silver Order Details: {silver_count} records")

    print("\nData Quality:")
    if quality_check:
        if quality_check['total'] > 0:
            success_rate = quality_check['complete'] / quality_check['total'] * 100
        else:
            # Empty table: report 0 rather than dividing by zero.
            success_rate = 0
        print(f"  Complete records: {quality_check['complete']}")
        print(f"  Incomplete records: {quality_check['incomplete']}")
        print(f"  Success rate: {success_rate:.1f}%")

    print("\nStreaming Status:")
    print(f"  Active Queries: {len(spark.streams.active)}")

    for stream in spark.streams.active:
        print(f"    - {stream.id}: {stream.status['message']}")

    data_flow_lines = (
        "\nData Flow:",
        "  Bronze Orders (streaming)",
        "    + Bronze Products (static)",
        "    ‚Üì LEFT JOIN on product_id",
        "  Silver Order Details (enriched)",
        "    ‚Ä¢ Full order + product info",
        "    ‚Ä¢ Calculated fields",
        "    ‚Ä¢ Data quality flags",
    )
    for line in data_flow_lines:
        print(line)

    next_step_lines = (
        "\n" + "=" * 70,
        "Next Steps:",
        "  1. Monitor streams using 'Monitor Streaming Query' cell",
        "  2. Run Gold Layer notebook for business aggregations",
        "  3. Use dashboard view for real-time analytics",
        "  4. Stop streams when done using 'Stop Streams' cell",
        "=" * 70,
    )
    for line in next_step_lines:
        print(line)

except Exception as e:
    print("\n‚ö†Ô∏è  Tables not fully populated yet. Wait for streams to process data.")
    print(f"Error: {e}")