In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

# Obtain the notebook's Spark session, registering the application under
# the name "StreamJoinExample".
session_builder = SparkSession.builder.appName("StreamJoinExample")
spark = session_builder.getOrCreate()

In [0]:
# Both event schemas carry the same nullable string keys (userId, productId)
# followed by one nullable timestamp column named after the event type.
clicks_schema = StructType(
    [StructField(key, StringType(), True) for key in ("userId", "productId")]
    + [StructField("clickTime", TimestampType(), True)]
)

purchases_schema = StructType(
    [StructField(key, StringType(), True) for key in ("userId", "productId")]
    + [StructField("purchaseTime", TimestampType(), True)]
)

In [0]:
from pyspark.sql.functions import current_timestamp
from datetime import datetime, timedelta
import pandas as pd

# Sample click events: one click "now", one 5 minutes ago, one 8 minutes ago.
click_data = pd.DataFrame(
    {
        "userId": ["u1", "u2", "u3"],
        "productId": ["p1", "p2", "p3"],
        "clickTime": [
            datetime.now(),
            datetime.now() - timedelta(minutes=5),
            datetime.now() - timedelta(minutes=8),
        ],
    },
    columns=["userId", "productId", "clickTime"],
)

# Replace the contents of the Delta table that the click stream reads from.
click_df = spark.createDataFrame(click_data, clicks_schema)
(
    click_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("bijucatalog.bijusilverschema.click_stream_input")
)


In [0]:
%sql
-- Show the click events that were just written.
SELECT *
FROM bijucatalog.bijusilverschema.click_stream_input

In [0]:
# Sample purchase events. Each purchase time is an offset from that user's
# own click time (taken from `click_data`), because the join below only
# accepts purchases made within 10 minutes *after the click*. Measuring the
# offsets from datetime.now() instead would put u2's purchase 11 minutes
# after its click and silently turn the intended match into a miss.
click_time_by_user = dict(zip(click_data["userId"], click_data["clickTime"]))

purchase_data = pd.DataFrame([
    ("u1", "p1", click_time_by_user["u1"] + timedelta(minutes=1)),    # 1 min after click  -> match
    ("u2", "p2", click_time_by_user["u2"] + timedelta(minutes=6)),    # 6 min after click  -> match
    ("u3", "p3", click_time_by_user["u3"] + timedelta(minutes=11)),   # 11 min after click -> too late
], columns=["userId", "productId", "purchaseTime"])

# Replace the contents of the Delta table that the purchase stream reads from.
purchase_df = spark.createDataFrame(purchase_data, purchases_schema)
purchase_df.write.format("delta").mode("overwrite").saveAsTable("bijucatalog.bijusilverschema.purchase_stream_input")



In [0]:
%sql
-- Show the purchase events that were just written.
SELECT *
FROM bijucatalog.bijusilverschema.purchase_stream_input

In [0]:
# Read both input Delta tables as streams.
#
# skipChangeCommits is a Delta streaming *source* option, so it has to be set
# on readStream to take effect. The input tables in this notebook are
# rewritten with mode("overwrite"); without the option, a stream resuming
# from an existing checkpoint fails when it reaches such a commit.
# Caveat: commits that rewrite existing data are skipped, not reprocessed.
clicks_stream = (
    spark.readStream
    .format("delta")
    .option("skipChangeCommits", "true")
    .table("bijucatalog.bijusilverschema.click_stream_input")
)
purchases_stream = (
    spark.readStream
    .format("delta")
    .option("skipChangeCommits", "true")
    .table("bijucatalog.bijusilverschema.purchase_stream_input")
)

In [0]:
# Stream-stream join of clicks with purchases.
# Each side gets a 10-minute event-time watermark and the alias that the
# join condition refers to.
clicks = clicks_stream.withWatermark("clickTime", "10 minutes").alias("clicks")
purchases = purchases_stream.withWatermark("purchaseTime", "10 minutes").alias("purchases")

# A purchase pairs with a click when user and product agree and the purchase
# time lies in [clickTime, clickTime + 10 minutes].
conversion_condition = expr("""
        clicks.userId = purchases.userId AND
        clicks.productId = purchases.productId AND
        purchases.purchaseTime BETWEEN clicks.clickTime AND clicks.clickTime + interval 10 minutes
    """)

joined_stream = clicks.join(purchases, on=conversion_condition)
    


In [0]:
# Project the join result down to one copy of the keys (taken from the click
# side) plus the two event timestamps.
output_columns = [
    col("clicks.userId").alias("userId"),
    col("clicks.productId").alias("productId"),
    col("clicks.clickTime"),
    col("purchases.purchaseTime"),
]
cleaned_stream = joined_stream.select(*output_columns)

In [0]:
# Configure an in-memory sink named "joined_view" in append mode.
# This only builds the writer: nothing calls .start(), so no streaming query
# runs from this cell, and `query` here is the writer object itself.
query = (
    cleaned_stream.writeStream
    .format("memory")
    .queryName("joined_view")
    .outputMode("append")
)


In [0]:
# Print the column names and types of the projected join result.
cleaned_stream.printSchema()

In [0]:
# Write the matched click/purchase pairs to a Delta table.
checkpoint = "s3://databricksbijubucketnew/checkpoints/"
# `checkpoint` already ends with "/", so trim it before appending the
# per-query directory; otherwise the location contains an empty "//" segment.
checkpoint_path = f"{checkpoint.rstrip('/')}/stream_join_test_v2"

query = (
    cleaned_stream.writeStream
    .outputMode("append")
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    # NOTE(review): skipChangeCommits is documented as a Delta streaming
    # *read* option; it most likely has no effect on the writer. Kept for
    # backward compatibility - confirm and set it on readStream if needed.
    .option("skipChangeCommits", "true")
    # Process everything currently available in the inputs, then stop.
    .trigger(availableNow=True)
    .toTable("bijucatalog.bijusilverschema.successful_conversions")
)

# toTable() starts the query and returns right away. Block until the
# availableNow run has finished so that later cells querying the table
# see its output when the notebook is executed with Run All.
query.awaitTermination()

In [0]:
%sql
-- Show the click/purchase pairs written by the streaming join.
SELECT *
FROM bijucatalog.bijusilverschema.successful_conversions

In [0]:
%sql
-- Clean up the output table. IF EXISTS keeps this cell from failing when the
-- table was never created (e.g. the stream failed) or was already dropped.
DROP TABLE IF EXISTS bijucatalog.bijusilverschema.successful_conversions