In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, window, current_timestamp

# Obtain the SparkSession for this application (an already-running session is reused)
spark = (
    SparkSession.builder
    .appName("WordCountStreaming")
    .getOrCreate()
)

# Schema description of the streaming records: a single string column "value".
# NOTE(review): this string is not passed to the reader below — confirm whether
# anything else relies on it.
schema = "value STRING"

# Open a streaming DataFrame backed by the TCP socket at localhost:9999
lines = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", 9999)
    .load()
)

# Break every incoming line into one row per word and stamp each row with the
# current processing time. Splitting on runs of whitespace (instead of a single
# space) and dropping empty tokens keeps blank lines and repeated/leading
# spaces from being counted as an empty "word".
words = (
    lines
    .select(
        explode(split(lines.value, r"\s+")).alias("word"),
        current_timestamp().alias("timestamp"),
    )
    .filter("word != ''")
)

# Count how often each word occurs within 10-second windows of its timestamp
windowedCounts = (
    words
    .groupBy(window(words["timestamp"], "10 seconds"), words["word"])
    .count()
)

# Start the streaming query: print the complete counts table to the console
# on every trigger, without truncating column values
query = (
    windowedCounts.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", False)
    .start()
)



# Block until the streaming query terminates. The session is stopped in a
# `finally` clause so that it is released even when the wait is interrupted
# (e.g. a notebook interrupt) or the query fails with an exception.
try:
    query.awaitTermination()
finally:
    # Stop the SparkSession
    spark.stop()


24/05/12 14:20:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/05/12 14:20:08 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.
24/05/12 14:20:09 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-685950f8-c853-42c0-8bdc-e3ef2bae612d. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/05/12 14:20:09 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------+----+-----+
|window|word|count|
+------+----+-----+
+------+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+----+-----+
|              window|word|count|
+--------------------+----+-----+
|{2024-05-12 14:21...|  hi|    1|
+--------------------+----+-----+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+----+-----+
|              window|word|count|
+--------------------+----+-----+
|{2024-05-12 14:21...|  hi|    2|
|{2024-05-12 14:21...|    |    1|
|{2024-05-12 14:21...|  hi|    1|
+--------------------+----+-----+

