In [13]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import window, col, avg, concat, lit, from_csv
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, IntegerType
from time import sleep

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Assignment2_stream")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

dataSchema = StructType(
    [StructField("title", StringType(), True),
     StructField("score", IntegerType(), True),
     StructField("id", StringType(), True),
     StructField("Subreddit", StringType(), True),
     StructField("url", StringType(), True),
     StructField("num_comments", IntegerType(), True),
     StructField("body", StringType(), True),
     StructField("timestamp_in_ms", LongType(), True)
     ])

kafkaStream = spark \
    .read \ #original version shoul be readStream. I changed this to check if tha data is read. 
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("subscribe", "reddit") \
    .option("startingOffsets", "earliest") \
    .load()

df = kafkaStream.selectExpr("CAST(value AS STRING)")

df1 = df.select(from_csv(df.value, dataSchema.simpleString()))

df1.printSchema()

sdf = df1.select(col("from_csv(value).*"))

sdf.printSchema()

withEventTimedf = sdf.selectExpr(
    "*",
    "cast(timestamp_in_ms/1000.0 as timestamp) as event_time")

withEventTimedf.show()

root
 |-- from_csv(value): struct (nullable = true)
 |    |-- title: string (nullable = true)
 |    |-- score: integer (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- Subreddit: string (nullable = true)
 |    |-- url: string (nullable = true)
 |    |-- num_comments: integer (nullable = true)
 |    |-- body: string (nullable = true)
 |    |-- timestamp_in_ms: long (nullable = true)

root
 |-- title: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- Subreddit: string (nullable = true)
 |-- url: string (nullable = true)
 |-- num_comments: integer (nullable = true)
 |-- body: string (nullable = true)
 |-- timestamp_in_ms: long (nullable = true)

+--------------------+-----+------+------------+--------------------+------------+----+---------------+--------------------+
|               title|score|    id|   Subreddit|                 url|num_comments|body|timestamp_in_ms|          event_time|
+--------------------+-----+-

In [8]:
from pyspark.sql.functions import *

#TO-DO: Write some spark operations with window and aggragate functions

avgscoredf = withEventTimedf \
    .groupBy(window(col("event_time"), "10 seconds"),"id") \
    .agg(avg("num_comments").alias("value"))

resultdf = avgscoredf.select(concat(col("value"), lit(" "), col("id")).alias("key"), col("value").cast("string"))

resultdf.printSchema()

query = resultdf \
    .writeStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka1:9093") \
    .option("checkpointLocation", "/home/jovyan/checkpoint") \
    .option("topic", "avg_score") \
    .outputMode("complete") \
    .start()
try:
    query.awaitTermination()
except KeyboardInterrupt:
    query.stop()
    # Stop the spark context
    spark.stop()
    print("Stoped the streaming query and the spark context")

AttributeError: 'NoneType' object has no attribute '_jvm'

In [4]:
spark.stop()