In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, sum, col, from_json, current_timestamp, expr
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Obtain the Spark session for this notebook, creating one with the app name
# "WatermarkExample" if needed (Databricks notebooks already provide `spark`).
spark = (
    SparkSession.builder
    .appName("WatermarkExample")
    .getOrCreate()
)

In [0]:
# Schema of one sales event; every column is declared nullable.
sales_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("item_id", StringType()),
        ("amount", DoubleType()),
        ("event_time", TimestampType()),
    )
])

In [0]:
from pyspark.sql.streaming import StreamingQueryListener

class MyStreamingListener(StreamingQueryListener):
    """Listener that prints one line for each streaming-query lifecycle event."""

    def onQueryStarted(self, event):
        """Print the id and name carried by the start event."""
        started_msg = f"Query started: {event.id}, Name: {event.name}"
        print(started_msg)

    def onQueryProgress(self, event):
        """Print the progress object carried by the progress event."""
        progress_msg = f"Query progress: {event.progress}"
        print(progress_msg)

    def onQueryTerminated(self, event):
        """Print the id carried by the termination event."""
        terminated_msg = f"Query terminated: {event.id}"
        print(terminated_msg)


# Register a listener instance with this session's streaming query manager.
spark.streams.addListener(MyStreamingListener())

In [0]:
# Define the session-scoped temporary view `sales_source` with the three sales
# columns (item_id, amount, event_time). The SELECT has no FROM clause and
# projects only typed NULLs, so the view is a placeholder carrying the column
# names and types rather than real sales data.
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW sales_source AS 
SELECT CAST(NULL AS STRING) AS item_id, 
       CAST(NULL AS DOUBLE) AS amount, 
       CAST(NULL AS TIMESTAMP) AS event_time
""")

In [0]:
%sql
-- Preview the sales_source temporary view (a placeholder of typed NULL columns).
select * from sales_source

In [0]:
def inject_data(item_id, amount, delay_seconds=0):
    """Append one sales event to the Delta table that feeds the stream.

    Args:
        item_id: Value stored in the ``item_id`` column.
        amount: Value stored in the ``amount`` column.
        delay_seconds: When positive, the event time is set this many seconds
            before the current time to simulate a late-arriving event. With
            the default of 0 (or a negative value) the event is stamped with
            the current time.
    """
    # Imported locally so the function does not depend on another cell
    # having imported datetime first.
    from datetime import datetime, timedelta

    # createDataFrame() takes plain Python values as row data; a Column
    # expression such as current_timestamp() cannot be placed inside a row,
    # so the event time is computed in Python instead.
    event_ts = datetime.now()
    if delay_seconds > 0:
        # Simulate late arrival by moving the event time into the past.
        event_ts -= timedelta(seconds=delay_seconds)

    data = [(item_id, amount, event_ts)]
    df_to_inject = spark.createDataFrame(data, sales_schema)
    df_to_inject.write.format("delta").mode("append").saveAsTable("bijucatalog.bijusilverschema.sales_stream_input")


In [0]:
# Open the Delta input table as a streaming source.
sales_df = (
    spark.readStream
    .format("delta")
    .table("bijucatalog.bijusilverschema.sales_stream_input")
)


In [0]:
# Sum `amount` per item over 1-hour windows sliding every 30 minutes, using a
# 30-minute watermark on event_time, then expose the window bounds as plain
# window_start / window_end columns.
windowed_sales = (
    sales_df
    .withWatermark("event_time", "30 minutes")
    .groupBy(
        window(
            timeColumn=col("event_time"),
            windowDuration="1 hour",
            slideDuration="30 minutes",
        ),
        col("item_id"),
    )
    .agg(sum("amount").alias("total_sales"))
    .select(
        col("item_id"),
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("total_sales"),
    )
)


In [0]:
# Root location for streaming checkpoints. It already ends with "/", so the
# checkpointLocation built below contains a doubled slash before
# "windowed_sales".
checkpoint = "s3://databricksbijubucketnew/checkpoints/"

# Write the windowed aggregation to the target Delta table in append mode
# using the availableNow trigger; toTable() returns the query handle.
query = (
    windowed_sales.writeStream
    .outputMode("append")
    .format("delta")
    .option("checkpointLocation", f"{checkpoint}/windowed_sales")
    .trigger(availableNow=True)
    .toTable("bijucatalog.bijusilverschema.hourly_item_sales")
)


In [0]:
import time
from datetime import datetime, timedelta
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import current_timestamp, expr
def inject_data(item_id, amount, delay_seconds=0):
    """Append a single sales row to the streaming input Delta table.

    The row's event_time is the current local time minus ``delay_seconds``,
    so a positive delay produces an event that appears to arrive late.
    """
    event_time = datetime.now() - timedelta(seconds=delay_seconds)
    row = (item_id, amount, event_time)
    writer = spark.createDataFrame([row], sales_schema).write.format("delta").mode("append")
    writer.saveAsTable("bijucatalog.bijusilverschema.sales_stream_input")




In [0]:
# Inject two events with the default delay (stamped with the current time),
# pausing 5 seconds after the first and 30 seconds after the second.
inject_data("laptop", 1200.0)
time.sleep(5)
inject_data("keyboard", 75.0)
time.sleep(30)
# Disabled: a third event whose event time would be 15 minutes in the past.
# inject_data("laptop", 50.0, delay_seconds=15 * 60)  # 15 minutes late

In [0]:
%sql
-- Show every row in the Delta table that inject_data() appends to.
SELECT * FROM bijucatalog.bijusilverschema.sales_stream_input

In [0]:
%sql
-- Show the aggregated output table, newest window first.
SELECT * FROM bijucatalog.bijusilverschema.hourly_item_sales ORDER BY window_start DESC

In [0]:
%sql
-- Preview the sales_source temporary view (a placeholder of typed NULL columns).
select * from sales_source

In [0]:
# Stream rows from the Delta table used as the simulated source.
sales_df = (
    spark.readStream
    .format("delta")
    .table("bijucatalog.bijusilverschema.sales_stream_input")
)
checkpoint = "s3://databricksbijubucketnew/checkpoints/"

# Watermark event_time by 30 minutes, then total `amount` per item over
# 1-hour windows that slide every 30 minutes.
windowed_sales = (
    sales_df
    .withWatermark("event_time", "30 minutes")
    .groupBy(
        window(
            timeColumn=col("event_time"),
            windowDuration="1 hour",
            slideDuration="30 minutes",
        ),
        col("item_id"),
    )
    .agg(sum("amount").alias("total_sales"))
    .select(
        col("item_id"),
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("total_sales"),
    )
)

# Write the results to a Delta table in "append" mode.
# For windowed aggregations, "append" mode is common, meaning results are emitted once the window is closed by the watermark.
# The sink is the fully qualified table: this query uses the
# "windowed_sales" checkpoint location, so it should write to the same table
# as the other query in this notebook that uses that checkpoint, which is
# also the table the notebook's result query reads.
query = windowed_sales \
    .writeStream \
    .outputMode("append") \
    .format("delta") \
    .option("checkpointLocation", f"{checkpoint}/windowed_sales") \
    .trigger(processingTime="1 minute") \
    .toTable("bijucatalog.bijusilverschema.hourly_item_sales")

# toTable() has already started the query (it runs in the background) and
# returned its StreamingQuery handle; there is no separate start() call to make.

# You can manually inject data to see the effect
# inject_data("laptop", 1200.0)
# time.sleep(5)
# inject_data("keyboard", 75.0)
# time.sleep(30) # Wait for watermark to potentially advance
# inject_data("laptop", 50.0, delay_seconds=15 * 60) # Late data for a past window

# To stop the query
# query.stop()

# To view the results (after the stream has processed some data)
# display(spark.sql("SELECT * FROM bijucatalog.bijusilverschema.hourly_item_sales ORDER BY window_start DESC"))

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, sum, col, from_json
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Reuse the notebook's Spark session (Databricks provides one) or create it.
spark = (
    SparkSession.builder
    .appName("WatermarkExample")
    .getOrCreate()
)
checkpoint = "s3://databricksbijubucketnew/checkpoints/"

# Schema for incoming sales data: three nullable columns.
sales_schema = (
    StructType()
    .add("item_id", StringType(), True)
    .add("amount", DoubleType(), True)
    .add("event_time", TimestampType(), True)
)

# Simulate a streaming source (e.g., Kafka, Delta Lake)
# For demonstration, this notebook streams from a Delta table that inject_data() appends to
# In a real scenario, this would be:
# sales_df = spark.readStream.format("kafka").option("kafka.bootstrap.servers", "host:port").option("subscribe", "sales_topic").load()
# or
# sales_df = spark.readStream.format("delta").load("/path/to/delta/sales_table")

# For this example, the streaming DataFrame is created further below from a Delta table
# (in Databricks, you'd typically read from a Delta table or Kafka).
# First, register a listener that prints streaming-query lifecycle events.
from pyspark.sql.streaming import StreamingQueryListener
import time

class MyStreamingListener(StreamingQueryListener):
    """Print a short message whenever a streaming query starts, progresses or ends."""

    def onQueryStarted(self, event):
        """Report the started query's id and name."""
        message = f"Query started: {event.id}, Name: {event.name}"
        print(message)

    def onQueryProgress(self, event):
        """Report the progress payload of the event."""
        message = f"Query progress: {event.progress}"
        print(message)

    def onQueryTerminated(self, event):
        """Report the terminated query's id."""
        message = f"Query terminated: {event.id}"
        print(message)


# Attach a new listener instance to the session's streaming query manager.
spark.streams.addListener(MyStreamingListener())

# Creating a memory source for demonstration
from pyspark.sql.streaming import DataStreamWriter
from pyspark.sql.functions import current_timestamp

# Create the `sales_source` temporary view with the three sales columns.
# The SELECT projects only typed NULLs, so the view is a placeholder for the
# column names and types; the stream defined later in this cell reads from
# the Delta input table, not from this view.
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW sales_source AS 
SELECT CAST(NULL AS STRING) AS item_id, 
       CAST(NULL AS DOUBLE) AS amount, 
       CAST(NULL AS TIMESTAMP) AS event_time
""")

# Function to inject data into the Delta table that backs the stream (for demonstration)
def inject_data(item_id, amount, delay_seconds=0):
    """Append one sales event to the Delta table that feeds the stream.

    Args:
        item_id: Value stored in the ``item_id`` column.
        amount: Value stored in the ``amount`` column.
        delay_seconds: When positive, the event time is set this many seconds
            before the current time to simulate a late-arriving event. With
            the default of 0 (or a negative value) the event is stamped with
            the current time.
    """
    # Imported locally so the function does not depend on another cell
    # having imported datetime first.
    from datetime import datetime, timedelta

    # createDataFrame() takes plain Python values as row data; a Column
    # expression such as current_timestamp() cannot be placed inside a row,
    # so the event time is computed in Python instead.
    event_ts = datetime.now()
    if delay_seconds > 0:
        # Simulate late arrival by moving the event time into the past.
        event_ts -= timedelta(seconds=delay_seconds)

    data = [(item_id, amount, event_ts)]
    df_to_inject = spark.createDataFrame(data, sales_schema)
    df_to_inject.write.format("delta").mode("append").saveAsTable("bijucatalog.bijusilverschema.sales_stream_input") # Using a Delta table for simulation

    # In a real scenario, you'd be reading from a persistent stream source
    # For a purely in-memory simulation, you'd use something like:
    # spark.sql(f"INSERT INTO sales_source VALUES ('{item_id}', {amount}, '{event_ts}')")

# Stream rows from the Delta table used as the simulated source.
sales_df = (
    spark.readStream
    .format("delta")
    .table("bijucatalog.bijusilverschema.sales_stream_input")
)

# Apply a 30-minute watermark on event_time and total `amount` per item over
# 1-hour windows that slide every 30 minutes.
windowed_sales = (
    sales_df
    .withWatermark("event_time", "30 minutes")
    .groupBy(
        window(
            timeColumn=col("event_time"),
            windowDuration="1 hour",
            slideDuration="30 minutes",
        ),
        col("item_id"),
    )
    .agg(sum("amount").alias("total_sales"))
    .select(
        col("item_id"),
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        col("total_sales"),
    )
)



In [0]:
# Append one "keyboard" sale of 75.0 with the default delay of 0 seconds.
inject_data("keyboard", 75.0)

In [0]:
# Write the results to a Delta table in "append" mode
# For windowed aggregations, "append" mode is common, meaning results are emitted once the window is closed by the watermark.
# The sink is the fully qualified table: this query uses the
# "windowed_sales" checkpoint location, so it should write to the same table
# as the other query in this notebook that uses that checkpoint, which is
# also the table the notebook's result query reads.
query = windowed_sales \
    .writeStream \
    .outputMode("append") \
    .format("delta") \
    .option("checkpointLocation", f"{checkpoint}/windowed_sales") \
    .trigger(processingTime="1 minute") \
    .toTable("bijucatalog.bijusilverschema.hourly_item_sales")

# Do not call query.start(): toTable() has already started the query in the
# background and returned its StreamingQuery handle, and StreamingQuery has
# no start() method (calling it raises AttributeError).

# You can manually inject data to see the effect
inject_data("laptop", 1200.0)
# time.sleep(5)
# inject_data("keyboard", 75.0)
# time.sleep(30) # Wait for watermark to potentially advance
# inject_data("laptop", 50.0, delay_seconds=15 * 60) # Late data for a past window

# To stop the query
# query.stop()

# To view the results (after the stream has processed some data)
# display(spark.sql("SELECT * FROM bijucatalog.bijusilverschema.hourly_item_sales ORDER BY window_start DESC"))