In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, IntegerType
from time import sleep

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Assignment2_stream")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

dataSchema = StructType(
    [StructField("title", StringType(), True),
     StructField("score", IntegerType(), True),
     StructField("id", StringType(), True),
     StructField("Subreddit", StringType(), True),
     StructField("url", StringType(), True),
     StructField("num_comments", IntegerType(), True),
     StructField("body", StringType(), True),
     StructField("timestamp_in_ms", LongType(), True)
     ])

# Read the dataset as a stream
sdf = spark.readStream.schema(dataSchema).option("maxFilesPerTrigger", 1) \
        .csv("/home/jovyan/data/a2")

withEventTimedf = sdf.selectExpr(
    "*",
    "cast(timestamp_in_ms/1000.0 as timestamp) as event_time")

withEventTimedf.printSchema()

root
 |-- title: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- Subreddit: string (nullable = true)
 |-- url: string (nullable = true)
 |-- num_comments: integer (nullable = true)
 |-- body: string (nullable = true)
 |-- timestamp_in_ms: long (nullable = true)
 |-- event_time: timestamp (nullable = true)



In [3]:
query = withEventTimedf \
    .writeStream \
    .queryName("avg_score_window") \
    .format("memory") \
    .outputMode("append") \
    .start()

try:
    for x in range(100):
        spark.sql("SELECT * FROM avg_score_window").show()
        sleep(10)
except KeyboardInterrupt:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")


+-----+-----+---+---------+---+------------+----+---------------+----------+
|title|score| id|Subreddit|url|num_comments|body|timestamp_in_ms|event_time|
+-----+-----+---+---------+---+------------+----+---------------+----------+
+-----+-----+---+---------+---+------------+----+---------------+----------+

+--------------------+-----+------+------------+--------------------+------------+----+---------------+--------------------+
|               title|score|    id|   Subreddit|                 url|num_comments|body|timestamp_in_ms|          event_time|
+--------------------+-----+------+------------+--------------------+------------+----+---------------+--------------------+
|               title| null|    id|   subreddit|                 url|        null|body|           null|                null|
|Colorado Board to...|    1|r1x48g|Conservative|https://timcast.c...|           0|null|     1637848711|1970-01-19 22:57:...|
|Waukesha suspect ...|    3|r1wztu|Conservative|https://www.foxne.

In [4]:
spark.stop()