In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, window, current_timestamp, udf
from pyspark.sql.types import IntegerType

# Build the SparkSession for this job, reusing an existing one if present
spark = (
    SparkSession.builder
    .appName("MovingAverageStreamingFromFile")
    .getOrCreate()
)

# Define a custom function that counts the consonants in a word.
# NOTE: the name "count_vowels" is historical and kept for compatibility.
def count_vowels(word):
    """Return the number of consonants in ``word``.

    Despite its name, this function counts the alphabetic characters that
    are NOT one of ``aeiouAEIOU``. Digits, punctuation and whitespace are
    ignored so that they do not inflate the consonant count.

    Args:
        word: The string to inspect. ``None`` or an empty string is allowed.

    Returns:
        The consonant count as an int; 0 for ``None`` or an empty string.
    """
    # Null values reach a Spark UDF as None; iterating over None would raise.
    if not word:
        return 0
    vowels = "aeiouAEIOU"
    return sum(
        1 for letter in word if letter.isalpha() and letter not in vowels
    )

# Wrap count_vowels as a Spark UDF whose result column is an integer
count_vowels_udf = udf(f=count_vowels, returnType=IntegerType())

# Open a streaming text source on the input directory
initial_path = "./data/"
lines = (
    spark.readStream
    .format("text")
    .load(initial_path)
)

# Break each line into space-separated words (one row per word) and stamp
# every row with the current processing timestamp
words = lines.select(
    explode(split(lines["value"], " ")).alias("word"),
    current_timestamp().alias("timestamp"),
)

# Run the UDF over each word and keep its result in "consonent_count"
words_with_consonent_count = words.withColumn(
    "consonent_count",
    count_vowels_udf(words["word"]),
)

# Sum the consonant counts over tumbling (non-overlapping) 10-second windows.
# To get a sliding window instead, pass a slide duration as the third
# argument to window(), e.g. window(<column>, "10 seconds", "5 seconds").
# NOTE: the variable name is historical; it holds a windowed sum, not an
# average of vowel counts.
windowedAvgVowelCount = (
    words_with_consonent_count
    .groupBy(window(words_with_consonent_count.timestamp, "10 seconds"))
    .agg({"consonent_count": "sum"})
)


# Start a streaming query that prints the windowed consonant-count sums to
# the console, triggered every 10 seconds in "update" output mode
query = (
    windowedAvgVowelCount.writeStream
    .outputMode("update")
    .trigger(processingTime='10 seconds')
    .format("console")
    .option("truncate", "false")
    .start()
)

# Block until the streaming query terminates. The finally clause makes sure
# the query and the SparkSession are stopped even when the wait is interrupted
# (e.g. by a KeyboardInterrupt) or the query fails, so no orphaned query keeps
# running and printing batches in the background.
try:
    query.awaitTermination()
finally:
    query.stop()
    spark.stop()


24/05/13 12:45:17 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-5ba231e4-85ee-4dc7-9026-9a2b9897e8fd. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/05/13 12:45:17 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:10, 2024-05-13 12:45:20}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:20, 2024-05-13 12:45:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:20, 2024-05-13 12:45:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:30, 2024-05-13 12:45:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:30, 2024-05-13 12:45:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:30, 2024-05-13 12:45:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:40, 2024-05-13 12:45:50}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:40, 2024-05-13 12:45:50}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:40, 2024-05-13 12:45:50}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:50, 2024-05-13 12:46:00}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:50, 2024-05-13 12:46:00}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 14
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:45:50, 2024-05-13 12:46:00}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:00, 2024-05-13 12:46:10}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:00, 2024-05-13 12:46:10}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:00, 2024-05-13 12:46:10}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 16
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:10, 2024-05-13 12:46:20}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:10, 2024-05-13 12:46:20}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:10, 2024-05-13 12:46:20}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:20, 2024-05-13 12:46:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:20, 2024-05-13 12:46:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 17
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:20, 2024-05-13 12:46:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:30, 2024-05-13 12:46:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 18
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:30, 2024-05-13 12:46:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 7
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:30, 2024-05-13 12:46:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 19
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:40, 2024-05-13 12:46:50}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:40, 2024-05-13 12:46:50}|5                   |
+------------------------------------------+--------------------+





-------------------------------------------
Batch: 13
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:40, 2024-05-13 12:46:50}|5                   |
+------------------------------------------+--------------------+



[Stage 87:(196 + 4) / 200][Stage 89:> (0 + 0) / 200][Stage 91:> (0 + 0) / 200]0]

-------------------------------------------
Batch: 14
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:50, 2024-05-13 12:47:00}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 20
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:50, 2024-05-13 12:47:00}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 9
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:46:50, 2024-05-13 12:47:00}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 10
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:00, 2024-05-13 12:47:10}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 15
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:00, 2024-05-13 12:47:10}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 21
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:00, 2024-05-13 12:47:10}|5                   |
+------------------------------------------+--------------------+



[Stage 99:(185 + 4) / 200][Stage 101:>(0 + 0) / 200][Stage 103:>(0 + 0) / 200]

-------------------------------------------
Batch: 11
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:10, 2024-05-13 12:47:20}|5                   |
+------------------------------------------+--------------------+





-------------------------------------------
Batch: 22
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:10, 2024-05-13 12:47:20}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 16
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:10, 2024-05-13 12:47:20}|5                   |
+------------------------------------------+--------------------+



[Stage 105:(185 + 4) / 200][Stage 107:>(3 + 0) / 200][Stage 109:>(0 + 0) / 200]

-------------------------------------------
Batch: 17
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:20, 2024-05-13 12:47:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 23
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:20, 2024-05-13 12:47:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 12
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:20, 2024-05-13 12:47:30}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 13
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:30, 2024-05-13 12:47:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 24
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:30, 2024-05-13 12:47:40}|5                   |
+------------------------------------------+--------------------+



                                                                                

-------------------------------------------
Batch: 18
-------------------------------------------
+------------------------------------------+--------------------+
|window                                    |sum(consonent_count)|
+------------------------------------------+--------------------+
|{2024-05-13 12:47:30, 2024-05-13 12:47:40}|5                   |
+------------------------------------------+--------------------+

