# Real-time ETL & Monitoring

In [0]:
# import the necessary libraries for Structured Streaming
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Explicit schema for the incoming JSON events.
# Every field is declared nullable. The event time is kept as a raw string here
# and converted to a real timestamp in a later transformation step.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("event_id", StringType()),
        ("user_id", StringType()),
        ("content_id", StringType()),
        ("timestamp", StringType()),
        ("duration_seconds", IntegerType()),
        ("device_type", StringType()),
        ("quality", StringType()),
        ("buffering_count", IntegerType()),
        ("error_type", StringType()),
        ("ip_address", StringType()),
        ("country", StringType()),
        ("session_id", StringType()),
    ]
])

In [0]:
# Open a streaming reader over the JSON files in the stream source directory.
# The explicit schema is supplied up front; nothing is read until a query starts.
streaming_df = (
    spark.readStream
    .format("json")
    .schema(schema)
    .load("/pyspark/video-streaming-data/module5-orchestration/streaming/stream_source")
)

print("Streaming source initialized!")

In [0]:
# Basic enrichment of the raw stream: turn the string timestamp into a real
# timestamp column and derive date / hour / minute columns from it.
#
# The trailing Z in the pattern is quoted, so it is matched as a literal
# character rather than parsed as a zone offset.
# NOTE(review): the resulting values are presumably interpreted in the Spark
# session time zone - confirm this matches how the producers write timestamps.
parsed_df = (
    streaming_df
    .withColumn("event_timestamp", to_timestamp(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))
    .withColumn("event_date", to_date(col("event_timestamp")))
    .withColumn("event_hour", hour(col("event_timestamp")))
    .withColumn("event_minute", minute(col("event_timestamp")))
)

In [0]:
# Per-device streaming aggregation.
# Events are bucketed into 1-minute tumbling windows on event_timestamp, with a
# 10-minute watermark declared on the same column. For every (window, device_type)
# pair we keep the event count, the mean duration and the summed buffering count.
# `sum` / `count` / `avg` here are the pyspark.sql.functions versions brought in
# by the star import, not the Python builtins.
windowed_counts = (
    parsed_df
    .withWatermark("event_timestamp", "10 minutes")
    .groupBy(
        window(col("event_timestamp"), "1 minute"),
        col("device_type"),
    )
    .agg(
        count("*").alias("event_count"),
        avg(col("duration_seconds")).alias("avg_duration"),
        sum(col("buffering_count")).alias("total_buffering_events"),
    )
)

In [0]:
# Error tracking: keep only events that carry an error_type, then count them
# per 1-minute window and error type (10-minute watermark on event_timestamp).
error_tracking = (
    parsed_df
    .where(col("error_type").isNotNull())
    .withWatermark("event_timestamp", "10 minutes")
    .groupBy(
        window(col("event_timestamp"), "1 minute"),
        col("error_type"),
    )
    .agg(count("*").alias("error_count"))
)

In [0]:
# Launch the two streaming queries. Both publish their full result table
# ("complete" output mode) to an in-memory table named after the query, and
# both are triggered every 10 seconds.

# Device metrics -> in-memory table "streaming_device_metrics"
query = (
    windowed_counts.writeStream
    .queryName("streaming_device_metrics")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="10 seconds")
    .start()
)

# Error counts -> in-memory table "streaming_errors"
# No extra metrics option is set here; progress information is read from the
# returned query handle (status / recentProgress / lastProgress).
error_query = (
    error_tracking.writeStream
    .queryName("streaming_errors")
    .format("memory")
    .outputMode("complete")
    .trigger(processingTime="10 seconds")
    .start()
)

print("Streaming queries started!")
print(f"Active streaming queries: {spark.streams.active}")
print(f"Total active streaming queries: {len(spark.streams.active)}")

In [0]:
# Basic monitoring of the device-metrics query: identity, current status and a
# one-line summary for each of the (up to) five most recent progress reports.
print("Streaming Query Status:")
print(f"Query name: {query.name}")
print(f"Query status: {query.status}")
print(f"Is active: {query.isActive}")
print("Recent progress:")
for batch_progress in query.recentProgress[-5:]:
    # Optional fields fall back to 'N/A' when a progress report lacks them.
    rows_in = batch_progress.get('numInputRows', 'N/A')
    batch_ms = batch_progress.get('batchDuration', 'N/A')
    print(f"  Batch: {batch_progress['batchId']}, # rows: {rows_in}, Processing time: {batch_ms}ms")

In [0]:
# Detailed look at the most recent progress report of the device-metrics query.
# lastProgress is falsy until the query has produced at least one report.
latest_metrics = query.lastProgress

if latest_metrics:
    print("Latest Streaming Metrics:")
    print(f"Input rate: {latest_metrics.get('inputRowsPerSecond', 'N/A')} rows/second")
    print(f"Processing rate: {latest_metrics.get('processedRowsPerSecond', 'N/A')} rows/second")
    print(f"Batch duration: {latest_metrics.get('batchDuration', 'N/A')} ms")

    # Progress reports carry their timing breakdown under "durationMs"; the
    # end-to-end time of a trigger is its "triggerExecution" entry. A top-level
    # "totalDuration" key is still honoured if present, so nothing is lost for
    # any environment that does provide it.
    duration_breakdown = latest_metrics.get('durationMs') or {}
    total_duration = latest_metrics.get(
        'totalDuration', duration_breakdown.get('triggerExecution', 'N/A')
    )
    print(f"Operation duration: {total_duration} ms")

    # Advanced metrics for debugging performance issues
    stateOperators = latest_metrics.get('stateOperators') or []
    if stateOperators:
        print("\nState Operation Metrics:")
        for operator in stateOperators:
            print(f"  - {operator.get('operatorName')}: {operator.get('numRowsTotal')} total rows in state")
            # Guard against a missing value: dividing None would raise TypeError
            # and abort the whole monitoring cell.
            memory_used_bytes = operator.get('memoryUsedBytes')
            if memory_used_bytes is None:
                print("    Memory used: N/A")
            else:
                print(f"    Memory used: {memory_used_bytes/1024/1024:.2f} MB")
else:
    print("No metrics available yet - query just started")

In [0]:
# Row counts of the two in-memory result tables fed by the streaming queries.
for table_name in ("streaming_device_metrics", "streaming_errors"):
    spark.sql(f"select count(*) from {table_name}").show(truncate=False)

In [0]:
# Peek at the latest rows of the in-memory result tables.
# The memory sink is only suitable for inspection; durable storage would be
# used for anything that has to survive the session.
# NOTE(review): DataFrame.display(...) is assumed to be provided by the
# notebook environment - confirm it accepts truncate=False there.
print("Current device metrics:")
device_metrics_preview = spark.sql("SELECT * FROM streaming_device_metrics ORDER BY window DESC LIMIT 10")
device_metrics_preview.display(truncate=False)

print("Current error metrics:")
error_metrics_preview = spark.sql("SELECT * FROM streaming_errors ORDER BY window DESC, error_count DESC LIMIT 10")
error_metrics_preview.display(truncate=False)

In [0]:
# Persist streaming aggregates to a Delta table.
# Events are grouped by date, hour and device type; the query runs in "complete"
# output mode, i.e. the full aggregate result is emitted on every trigger
# (every 30 seconds), with progress tracked in the checkpoint directory.
# NOTE(review): this aggregation declares no watermark, so the set of groups
# presumably keeps growing for as long as the query runs - confirm that is
# acceptable for the intended runtime.
hourly_device_stats = (
    parsed_df
    .groupBy("event_date", "event_hour", "device_type")
    .agg(
        count("*").alias("event_count"),
        avg("duration_seconds").alias("avg_duration"),
    )
)

delta_query = (
    hourly_device_stats.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", "/pyspark/video-streaming-data/module5-orchestration/streaming/delta_checkpoints")
    .trigger(processingTime="30 seconds")
    .start("/pyspark/video-streaming-data/module5-orchestration/streaming/delta_metrics")
)

print("Started writing streaming aggregations to Delta Lake!")

# Data Generator for Real-time Demo
Let's create and inject some sample data to demonstrate real-time data ingestion.

In [0]:
# Create a function to generate sample streaming events
import json
import random
import datetime
import time
import os
import uuid

def generate_streaming_events(num_events=100, delay_seconds=0.2, stream_dir=None):
    """Generate random sample events and drop them into a directory as JSON files.

    One file named ``<event_id>.json`` is written per event, each holding a
    single JSON object with the twelve fields of the streaming schema.

    Args:
        num_events: Number of events (and files) to produce. Values <= 0
            produce no files.
        delay_seconds: Pause after each event, passed to ``time.sleep``, to
            simulate data trickling in.
        stream_dir: Target directory. Defaults to the notebook's stream source
            directory under ``/dbfs``. Created if it does not exist.

    Returns:
        None. Progress is reported on stdout.
    """
    # Define data generation parameters
    device_types = ["TV", "Mobile", "Web", "Tablet"]
    qualities = ["SD", "HD", "4K"]
    # None appears four times out of seven, so most events carry no error.
    error_types = [None, None, None, None, "network_error", "server_error", "content_unavailable"]
    countries = ["US", "CA", "UK", "FR", "DE", "JP", "BR", "AU"]

    # Source directory for streaming data
    if stream_dir is None:
        stream_dir = "/dbfs/pyspark/video-streaming-data/module5-orchestration/streaming/stream_source"

    # Ensure the directory exists
    os.makedirs(stream_dir, exist_ok=True)

    print(f"Generating {num_events} streaming events...")

    # Generate events in real-time
    for i in range(num_events):
        event_id = f"EVT{uuid.uuid4().hex[:8]}"

        # Current wall-clock time, so the events land in "live" windows.
        # NOTE(review): this is naive local time with a literal "Z" appended;
        # it is only true UTC when the machine's local zone is UTC.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%SZ")

        event = {
            "event_id": event_id,
            "user_id": f"USR{random.randint(10000, 99999)}",
            "content_id": f"CON{random.randint(10000, 99999)}",
            "timestamp": timestamp,
            "duration_seconds": random.randint(10, 3600),
            "device_type": random.choice(device_types),
            "quality": random.choice(qualities),
            "buffering_count": random.randint(0, 10),
            "error_type": random.choice(error_types),
            "ip_address": f"192.168.{random.randint(1, 255)}.{random.randint(1, 255)}",
            "country": random.choice(countries),
            "session_id": f"SES{uuid.uuid4().hex[:8]}",
        }

        # Write to a hidden staging file first and rename it into place, so a
        # reader watching the directory never sees a half-written JSON file.
        # Spark's file listing skips names that start with "." - the staging
        # file is therefore invisible to the stream until the rename.
        final_path = os.path.join(stream_dir, f"{event_id}.json")
        staging_path = os.path.join(stream_dir, f".{event_id}.json.tmp")
        with open(staging_path, "w", encoding="utf-8") as f:
            json.dump(event, f)
        os.replace(staging_path, final_path)

        # Print progress
        if (i + 1) % 10 == 0:
            print(f"Generated {i + 1} events...")

        # Add a small delay to simulate streaming
        time.sleep(delay_seconds)

    print(f"Successfully generated {num_events} streaming events")

# Kick off data generation for the demo.
# Tune the event count and the per-event delay here as needed.
demo_generator_settings = {"num_events": 50, "delay_seconds": 0.5}
generate_streaming_events(**demo_generator_settings)

In [0]:
# Bring in system-level job metrics for a more complete monitoring picture.
# The CSV stands in for data that a monitoring system would normally provide;
# it is read in batch mode with a header row and inferred column types.
metrics_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/pyspark/video-streaming-data/module5-orchestration/streaming/monitoring/job_metrics.csv")
)

print("Sample system metrics for monitoring:")
metrics_df.select(
    *["timestamp", "job_id", "executor_cores", "executor_memory", "duration_ms", "status"]
).show(5)

In [0]:
# An operational dashboard is typically built in three steps:
#   1. collect metrics from Spark's metrics system,
#   2. store them in a time-series database (e.g. InfluxDB, Prometheus),
#   3. visualise them with a tool such as Grafana.
#
# This cell prepares the export: the timestamp column is cast to a real
# timestamp and jobs longer than 10000 ms are flagged with is_long_running = 1.
metrics_for_export = (
    metrics_df
    .withColumn("timestamp", to_timestamp(col("timestamp")))
    .withColumn("is_long_running", when(col("duration_ms") > 10000, 1).otherwise(0))
)

# Write the prepared metrics as CSV with a header, replacing any previous export.
(
    metrics_for_export.write
    .mode("overwrite")
    .option("header", "true")
    .csv("/pyspark/video-streaming-data/module5-orchestration/streaming/monitoring/dashboard_metrics")
)

print("Metrics prepared for dashboard integration")

In [0]:
# Simple alerting example on top of the in-memory error table.
# Any (window, error_type) bucket with more than 5 errors counts as an alert;
# in a real deployment the result could feed e-mail or Slack notifications.
alert_sql = """
SELECT 
  window.end as alert_time,
  error_type,
  error_count
FROM streaming_errors
WHERE error_count > 5
ORDER BY window.end DESC
"""
alert_df = spark.sql(alert_sql)

print("Recent alerts that would trigger notifications:")
alert_df.show(5)

In [0]:
# Shut down every streaming query started in this notebook so that their
# resources are released. Queries are stopped in the order they were started.
print("Stopping streaming queries...")
for running_query in (query, error_query, delta_query):
    running_query.stop()
print("All streaming queries stopped!")