In [0]:
%run ./02config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Silver Layer - Streaming Join Orders with Products

# COMMAND ----------

# MAGIC %run ./02config

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Bronze Streams

# COMMAND ----------

section_rule = "=" * 70
print(section_rule)
print("READING BRONZE STREAMS")
print(section_rule)

# Orders are consumed incrementally from the bronze Delta table. The watermark
# declares bronze_timestamp as the event-time column with a one-minute
# lateness threshold.
orders_source = spark.readStream.format("delta").table(bronze_orders_table)
orders_stream = orders_source.withWatermark("bronze_timestamp", "1 minute")

print(f"‚úì Orders stream: {bronze_orders_table}")

# Products are loaded with the batch reader (not readStream); the original
# author's rationale is that this is more efficient for a small dimension.
products_df = spark.read.format("delta").table(bronze_products_table)

# count() is an action: it runs a job over the products table just to report
# the number of rows in the log line below.
product_record_count = products_df.count()
print(f"‚úì Products static: {bronze_products_table} ({product_record_count} records)")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Streaming Join and Enrichment

# COMMAND ----------

print("\n" + "="*70)
print("CONFIGURING JOIN")
print("="*70)

# Columns copied through unchanged from each side of the join.
order_field_names = [
    "order_id", "customer_id", "customer_name", "location", "order_status",
    "payment_method", "quantity", "discount_pct", "total_amount",
]
product_field_names = ["product_name", "category", "brand", "base_price", "unit_price"]
lineage_field_names = [
    "event_time", "eventhub_offset", "sequence_number", "partition_id", "bronze_timestamp",
]

# Stream-static LEFT JOIN: every order is kept, and product columns are NULL
# when no product row matches the order's product_id.
order_details_stream = (orders_stream
    .join(products_df, on="product_id", how="left")
    .select(
        # Order fields
        *[orders_stream[name] for name in order_field_names],

        # Join key. Joining with on="product_id" leaves a single product_id
        # column in the joined output; it has to be projected explicitly or
        # the silver table loses the key (the previous projection omitted it,
        # so orders without a matching product could not be traced back).
        col("product_id"),

        # Order timestamp: use the pre-parsed value when present, otherwise
        # parse the raw order_timestamp column.
        coalesce(
            col("order_timestamp_parsed"),
            to_timestamp(orders_stream.order_timestamp),
        ).alias("order_timestamp"),

        # Product fields
        *[products_df[name] for name in product_field_names],

        # Metadata
        *[orders_stream[name] for name in lineage_field_names],
    )
    # Calculated fields
    .withColumn("order_date", to_date(col("order_timestamp")))
    .withColumn("order_hour", hour(col("order_timestamp")))
    .withColumn("day_of_week", dayofweek(col("order_timestamp")))
    .withColumn("line_total", col("quantity") * col("unit_price"))
    # NOTE(review): this is total_amount minus quantity * unit_price, so it is
    # negative whenever total_amount is below the undiscounted line total.
    # Confirm the intended sign before relying on this column.
    .withColumn("discount_amount", col("total_amount") - col("line_total"))
    .withColumn("silver_timestamp", current_timestamp())
)

print("‚úì Join configured (LEFT JOIN on product_id)")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Add Data Quality Checks

# COMMAND ----------

# Three boolean checks per row plus their sum (0-3) as data_quality_score.
#
# A comparison against a NULL input evaluates to NULL, which would make the
# summed score NULL as well; aggregates such as AVG() skip NULLs, so rows with
# missing quantity/amount would silently drop out of quality statistics.
# coalesce(..., lit(False)) makes a missing value count as a failed check.
order_details_stream_with_quality = (order_details_stream
    .withColumn("has_product_info", col("product_name").isNotNull())
    .withColumn("is_valid_quantity", coalesce(col("quantity") > 0, lit(False)))
    .withColumn("is_valid_amount", coalesce(col("total_amount") > 0, lit(False)))
    .withColumn("data_quality_score",
                (col("has_product_info").cast("int") +
                 col("is_valid_quantity").cast("int") +
                 col("is_valid_amount").cast("int")))
)

print("‚úì Data quality checks added")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Write to Silver Table

# COMMAND ----------

start_rule = "=" * 70
print("\n" + start_rule)
print("STARTING SILVER STREAM")
print(start_rule)

# Describe the sink first: append-only Delta output, progress tracked at
# silver_checkpoint, with schema merging enabled on write.
silver_writer = (
    order_details_stream_with_quality.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", silver_checkpoint)
    .option("mergeSchema", "true")
    # Alternative kept for reference: .trigger(processingTime="15 seconds")
    .trigger(availableNow=True)
)

# toTable() starts the query and returns its handle.
silver_query = silver_writer.toTable(silver_table)

print(f"‚úì Silver stream started: {silver_query.id}")
print(f"  Target: {silver_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Stream

# COMMAND ----------

streams_rule = "=" * 70
print("\n" + streams_rule)
print("ACTIVE STREAMS")
print(streams_rule)

# Report id, status message and liveness for every active query on this
# session, plus the most recent progress entry when one has been recorded.
for active_query in spark.streams.active:
    print(f"\nQuery ID: {active_query.id}")
    print(f"  Status: {active_query.status['message']}")
    print(f"  Active: {active_query.isActive}")

    # Nothing more to print until at least one progress entry exists.
    if not active_query.recentProgress:
        continue

    newest_progress = active_query.recentProgress[-1]
    print(f"  Batch: {newest_progress.get('batchId', 'N/A')}")
    print(f"  Input Rows: {newest_progress.get('numInputRows', 0)}")
    print(f"  Rate: {newest_progress.get('processedRowsPerSecond', 0):.2f} rows/sec")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Data Quality

# COMMAND ----------

import time

print("\n" + "="*70)
print("MONITORING (30 seconds)")
print("="*70)

# Poll the silver table six times, five seconds apart (30 seconds in total).
poll_interval_seconds = 5
poll_attempts = 6

for i in range(poll_attempts):
    time.sleep(poll_interval_seconds)
    elapsed_seconds = (i + 1) * poll_interval_seconds

    try:
        # One aggregate query supplies the row count and the quality figures,
        # so the table is scanned once per poll instead of twice.
        quality_stats = spark.sql(f"""
            SELECT 
                COUNT(*) as total,
                SUM(CASE WHEN has_product_info THEN 1 ELSE 0 END) as complete,
                AVG(data_quality_score) as avg_score
            FROM {silver_table}
        """).first()

        silver_count = quality_stats['total'] or 0
        if silver_count == 0:
            # SUM()/AVG() over zero rows return NULL; report the empty table
            # explicitly instead of relying on a formatting error.
            print(f"‚è±Ô∏è  {elapsed_seconds}s - Waiting for data...")
            continue

        complete_count = quality_stats['complete'] or 0
        avg_score = quality_stats['avg_score']
        # avg_score is None when every data_quality_score in the table is NULL.
        score_text = f"{avg_score:.2f}" if avg_score is not None else "n/a"

        print(f"‚è±Ô∏è  {elapsed_seconds}s - Records: {silver_count:,} | Complete: {complete_count} | Score: {score_text}/3.0")

    except Exception as e:
        # Best-effort polling: keep going, but surface why the read failed
        # (first line of the message only) rather than hiding it.
        reason = (str(e).splitlines() or [""])[0]
        print(f"‚è±Ô∏è  {elapsed_seconds}s - Waiting for data... ({type(e).__name__}: {reason})")

print("\n‚úì Monitoring complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Silver Table

# COMMAND ----------

verify_rule = "=" * 70
print("\n" + verify_rule)
print("SILVER VERIFICATION")
print(verify_rule)

try:
    silver_df = spark.table(silver_table)
    total_records = silver_df.count()

    print(f"\nüìä Silver Table: {total_records:,} records")

    if total_records > 0:
        # Row counts split by whether product information is present.
        for bucket in silver_df.groupBy("has_product_info").count().collect():
            if bucket['has_product_info']:
                label = "‚úì Complete"
            else:
                label = "‚ö†Ô∏è Missing Product"
            print(f"  {label}: {bucket['count']:,}")

        # Ten most recently written rows, ordered by silver_timestamp.
        print("\n‚úì Latest enriched orders:")
        preview_columns = [
            "order_id", "customer_name", "location", "product_name", "brand",
            "category", "quantity", "unit_price", "total_amount", "order_timestamp",
            "data_quality_score",
        ]
        latest_orders = (
            silver_df
            .select(*preview_columns)
            .orderBy(desc("silver_timestamp"))
            .limit(10)
        )
        display(latest_orders)

except Exception as e:
    print(f"‚ö†Ô∏è  Table not available: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Analysis Queries

# COMMAND ----------

# Four ad-hoc aggregations over the silver table. count/sum/avg here are the
# Spark SQL column functions brought in by the wildcard pyspark import, not
# the Python builtins.
try:
    silver_df = spark.table(silver_table)

    # 1) Order volume and revenue per brand, highest revenue first.
    print("\n1. Orders by Brand:")
    brand_summary = silver_df.groupBy("brand").agg(
        count("*").alias("orders"),
        sum("total_amount").alias("revenue"),
        avg("total_amount").alias("avg_order"),
    )
    display(brand_summary.orderBy(col("revenue").desc()))

    # 2) Order volume, revenue and units per category, highest revenue first.
    print("\n2. Orders by Category:")
    category_summary = silver_df.groupBy("category").agg(
        count("*").alias("orders"),
        sum("total_amount").alias("revenue"),
        sum("quantity").alias("quantity"),
    )
    display(category_summary.orderBy(col("revenue").desc()))

    # 3) The twenty customers with the largest total spend.
    print("\n3. Top Customers:")
    customer_summary = silver_df.groupBy("customer_id", "customer_name", "location").agg(
        count("*").alias("orders"),
        sum("total_amount").alias("total_spent"),
        avg("total_amount").alias("avg_order"),
    )
    display(customer_summary.orderBy(col("total_spent").desc()).limit(20))

    # 4) Orders and revenue per order_hour value, in ascending hour order.
    print("\n4. Hourly Trends:")
    hourly_summary = silver_df.groupBy("order_hour").agg(
        count("*").alias("orders"),
        sum("total_amount").alias("revenue"),
    )
    display(hourly_summary.orderBy("order_hour"))

except Exception as e:
    print(f"Analysis not available: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Dashboard View

# COMMAND ----------

# Session-scoped temp view: silver rows aggregated by date, hour, category,
# brand and location.
dashboard_view_sql = f"""
CREATE OR REPLACE TEMP VIEW silver_dashboard AS
SELECT 
    order_date,
    order_hour,
    category,
    brand,
    location,
    COUNT(*) as order_count,
    SUM(total_amount) as revenue,
    AVG(total_amount) as avg_order_value,
    SUM(quantity) as total_items,
    MAX(silver_timestamp) as last_updated
FROM {silver_table}
GROUP BY order_date, order_hour, category, brand, location
"""
spark.sql(dashboard_view_sql)

print("‚úì Dashboard view created")

# Roll the view up to category/brand for rows whose order_date equals the
# current date, highest revenue first.
todays_performance_sql = """
SELECT 
    category,
    brand,
    SUM(order_count) as orders,
    SUM(revenue) as revenue,
    MAX(last_updated) as last_update
FROM silver_dashboard
WHERE order_date = CURRENT_DATE()
GROUP BY category, brand
ORDER BY revenue DESC
"""

print("\nToday's Performance:")
display(spark.sql(todays_performance_sql))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Stop Streams

# COMMAND ----------

# Optional cleanup, currently disabled: the snippet below is wrapped in a bare
# triple-quoted string, so it is evaluated as a no-op expression and nothing
# is stopped. Remove the surrounding triple quotes to stop every streaming
# query that is active on this Spark session.
"""
for stream in spark.streams.active:
    print(f"Stopping: {stream.id}")
    stream.stop()
print("‚úì Streams stopped")
"""

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary

# COMMAND ----------

print("\n" + "="*70)
print("SILVER LAYER COMPLETE")
print("="*70)

# Final report: row counts for the bronze and silver tables, the share of
# silver rows that carry product information, and any still-active streams.
try:
    orders_count = spark.table(bronze_orders_table).count()
    products_count = spark.table(bronze_products_table).count()
    silver_count = spark.table(silver_table).count()

    quality_check = spark.sql(f"""
        SELECT 
            COUNT(*) as total,
            SUM(CASE WHEN has_product_info THEN 1 ELSE 0 END) as complete,
            SUM(CASE WHEN NOT has_product_info THEN 1 ELSE 0 END) as incomplete
        FROM {silver_table}
    """).first()

    print(f"\nRecords:")
    print(f"  Bronze Orders: {orders_count:,}")
    print(f"  Bronze Products: {products_count:,}")
    print(f"  Silver Details: {silver_count:,}")

    if quality_check:
        # SUM() over an empty table returns NULL (None in Python). Treat that
        # as zero so the ',' number formatting below cannot raise and be
        # misreported as the tables not being ready.
        total_rows = quality_check['total'] or 0
        complete_rows = quality_check['complete'] or 0
        incomplete_rows = quality_check['incomplete'] or 0

        success_rate = (complete_rows / total_rows * 100) if total_rows > 0 else 0
        print(f"\nData Quality:")
        print(f"  Complete: {complete_rows:,}")
        print(f"  Incomplete: {incomplete_rows:,}")
        print(f"  Success Rate: {success_rate:.1f}%")

    print(f"\nActive Streams: {len(spark.streams.active)}")
    for stream in spark.streams.active:
        print(f"  ‚Ä¢ {stream.id}")

except Exception as e:
    # Still best-effort, but include the cause so a real failure (missing
    # table, permissions, ...) is distinguishable from "no data yet".
    print(f"‚ö†Ô∏è  Tables not ready yet: {e}")