# Part 3: Spark Streaming

In [1]:
from pyspark.sql import SparkSession
# Session settings: 10 shuffle partitions, console progress bars disabled, and
# the Kafka source connector requested through spark.jars.packages.
session_builder = SparkSession.builder.appName("cs544")
session_builder = session_builder.config("spark.sql.shuffle.partitions", 10)
session_builder = session_builder.config("spark.ui.showConsoleProgress", False)
session_builder = session_builder.config(
    'spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.2'
)
spark = session_builder.getOrCreate()

# Streaming read of the "stations-json" topic, beginning at the earliest offsets.
kafka_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "stations-json",
    "startingOffsets": "earliest",
}
df = spark.readStream.format("kafka").options(**kafka_options).load()

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e641d4dd-d461-4eb3-ae37-7412d7cf552d;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.2 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.ap

In [2]:
from pyspark.sql.functions import col, from_json
from pyspark.sql.functions import min, max, mean, count, month, date_add

# DDL schema used to decode the JSON payload carried in the Kafka `value` column.
schema = "station STRING, date DATE, degrees FLOAT, raining INT"

# Kafka delivers key/value as binary, so both are cast to strings before use.
key_text = col("key").cast("string")
payload = from_json(col("value").cast("string"), schema).alias("value")

# Keep the key and promote every parsed JSON field to a top-level column.
weather = df.select(key_text, payload).select("key", "value.*")

In [3]:
# Running per-station summary: first/last date seen, number of measurements,
# and the mean and maximum of `degrees`, sorted by station.
counts_df = weather.groupby("station").agg(
    min("date").alias("start"),
    max("date").alias("end"),
    count("*").alias("measurements"),
    mean("degrees").alias("avg"),
    max("degrees").alias("max")
).orderBy("station")

# "complete" output mode prints the whole aggregated table on every trigger.
s = (
    counts_df.writeStream.format("console")
    .trigger(processingTime="5 seconds")
    .outputMode("complete")
    .start()
)
try:
    # Let the query run for at most 30 seconds.
    s.awaitTermination(30)
finally:
    # Always stop the query, even if the wait is interrupted or raises,
    # so no streaming job is left running in the kernel.
    s.stop()

23/04/29 05:35:33 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-97532e48-6343-44ab-9517-1e01ed2de140. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/29 05:35:33 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+-------+----------+----------+------------+------------------+---------+
|station|     start|       end|measurements|               avg|      max|
+-------+----------+----------+------------+------------------+---------+
|      A|2000-01-01|2000-01-12|          12|38.378681659698486|48.363316|
|      B|2000-01-01|2000-01-12|          12|30.563111941019695|40.346504|
|      C|2000-01-01|2000-01-12|          12|25.198423147201538| 36.65578|
|      D|2000-01-01|2000-01-12|          12|12.959664026896158|25.170956|
|      E|2000-01-01|2000-01-12|          12|34.134008248647056|46.411068|
|      F|2000-01-01|2000-01-12|          12| 16.55533214410146|25.625895|
|      G|2000-01-01|2000-01-12|          12| 26.76465082168579|34.858067|
|      H|2000-01-01|2000-01-12|          12|  27.4657727877299| 37.98551|
|      I|2000-01-01|2000-01-12|          12|29.899755477905273| 39.54847|
|      J|2000-0

23/04/29 05:35:48 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 14892 milliseconds


-------------------------------------------
Batch: 1
-------------------------------------------
+-------+----------+----------+------------+------------------+---------+
|station|     start|       end|measurements|               avg|      max|
+-------+----------+----------+------------+------------------+---------+
|      A|2000-01-01|2000-01-13|          13| 38.07362409738394|48.363316|
|      B|2000-01-01|2000-01-13|          13| 30.62152730501615|40.346504|
|      C|2000-01-01|2000-01-13|          13|25.386069297790527| 36.65578|
|      D|2000-01-01|2000-01-13|          13|13.225162946260893|25.170956|
|      E|2000-01-01|2000-01-13|          13| 33.73588004479041|46.411068|
|      F|2000-01-01|2000-01-13|          13|16.548119875100944|25.625895|
|      G|2000-01-01|2000-01-13|          13| 27.93824137174166|42.021328|
|      H|2000-01-01|2000-01-13|          13|27.036798917330227| 37.98551|
|      I|2000-01-01|2000-01-12|          12|29.899755477905273| 39.54847|
|      J|2000-0

In [4]:
def _parse_values(stream_df, ddl_schema):
    """Decode the Kafka `value` column as JSON and return its fields as columns.

    Parameters:
        stream_df: DataFrame with a binary/string `value` column (the Kafka stream).
        ddl_schema: DDL string naming the JSON fields to extract.

    Returns:
        A DataFrame whose columns are exactly the fields listed in `ddl_schema`.
    """
    parsed = from_json(col("value").cast("string"), ddl_schema).alias("value")
    return stream_df.select(parsed).select("value.*")


def _shifted_by(stream_df, ddl_schema, days):
    """Return the parsed records with `date` moved forward by `days`.

    `degrees` and `raining` are renamed to `sub<days>degrees` and
    `sub<days>raining`, so that after joining on (station, date) they describe
    the measurement taken `days` days before that date.
    """
    return (
        _parse_values(stream_df, ddl_schema)
        .withColumn("date", date_add("date", days))
        .withColumnRenamed("degrees", f"sub{days}degrees")
        .withColumnRenamed("raining", f"sub{days}raining")
    )


# Label side: whether it rained at a station on a given date.
today = _parse_values(df, "station STRING, date DATE, raining INT")

# Full record schema; left bound to `schema` exactly as before this refactor.
schema = "station STRING, date DATE, degrees FLOAT, raining INT"
yesterday = _shifted_by(df, schema, 1)
yesterday2 = _shifted_by(df, schema, 2)

# Feature side: month of the date plus the readings from one and two days earlier.
# NOTE(review): these stream-stream inner joins have no watermark, so Spark
# keeps join state for the lifetime of the query — confirm this is acceptable
# for the expected run length.
features = (
    _parse_values(df, "station STRING, date DATE")
    .withColumn('month', month("date"))
    .join(yesterday, ['station', 'date'], 'inner')
    .join(yesterday2, ['station', 'date'], 'inner')
)

# Training rows: label joined with its features on (date, station).
joined = today.join(features, ['date', 'station'], 'inner')

In [5]:
# Show the column names and types of the `today` streaming DataFrame.
today

DataFrame[station: string, date: date, raining: int]

In [6]:
# Show the column names and types of the joined `features` streaming DataFrame.
features

DataFrame[station: string, date: date, month: int, sub1degrees: float, sub1raining: int, sub2degrees: float, sub2raining: int]

In [7]:
# Output directory for the parquet files and the streaming checkpoint directory.
path = "/notebooks/parquet"
check = "/notebooks/check"

# Write the joined label/feature rows to parquet once per minute;
# repartition(1) collapses each micro-batch to a single partition before writing.
query = (
    joined.repartition(1)
    .writeStream.trigger(processingTime='1 minute')
    .format("parquet")
    .option("path", path)
    .option("checkpointLocation", check)
    .start()
)
try:
    # Let the sink run for at most 100 seconds.
    query.awaitTermination(100)
finally:
    # Always stop the query, even if the wait is interrupted or raises,
    # so the sink is not left writing in the background.
    query.stop()

23/04/29 05:36:13 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


# Part 4: Spark ML

In [8]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [16]:
# Feature columns fed to the model, packed into a single "features" vector column.
x = ["month", "sub1degrees", "sub1raining", "sub2degrees", "sub2raining"]
assembler = VectorAssembler(inputCols=x, outputCol="features")

# Batch-read the parquet files written by the streaming job and vectorize them.
parquet_rows = spark.read.format("parquet").load("/notebooks/parquet/*")
data = assembler.transform(parquet_rows)

In [17]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = data.randomSplit(weights=[0.8, 0.2], seed=1)

In [18]:
# Decision tree that predicts the `raining` label from the assembled feature vector.
dt_classifier = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="raining",
)
dt_model = dt_classifier.fit(train_data)

In [19]:
# Print the fitted tree's decision rules; print() renders the multi-line
# string with real line breaks.
print(dt_model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_207ab175abdf, depth=5, numNodes=23, numClasses=2, numFeatures=5
  If (feature 3 <= 40.78687858581543)
   If (feature 2 <= 0.5)
    If (feature 4 <= 0.5)
     Predict: 0.0
    Else (feature 4 > 0.5)
     If (feature 3 <= 32.768882751464844)
      If (feature 1 <= 29.064224243164062)
       Predict: 0.0
      Else (feature 1 > 29.064224243164062)
       Predict: 1.0
     Else (feature 3 > 32.768882751464844)
      Predict: 0.0
   Else (feature 2 > 0.5)
    If (feature 3 <= 28.791390419006348)
     Predict: 1.0
    Else (feature 3 > 28.791390419006348)
     If (feature 1 <= 37.36065673828125)
      Predict: 0.0
     Else (feature 1 > 37.36065673828125)
      If (feature 4 <= 0.5)
       Predict: 1.0
      Else (feature 4 > 0.5)
       Predict: 0.0
  Else (feature 3 > 40.78687858581543)
   If (feature 2 <= 0.5)
    Predict: 0.0
   Else (feature 2 > 0.5)
    If (feature 1 <= 38.4641170501709)
     If (feature 1 <= 37.36065673828125

In [20]:
# Accuracy evaluator comparing the model's `prediction` column to the `raining` label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="raining",
    predictionCol="prediction",
)

# Score the held-out split and measure the fraction of correct predictions.
predictions = dt_model.transform(test_data)
accuracy = evaluator.evaluate(predictions)

In [21]:
# Mean of the `raining` label on the test split, printed next to the model's
# accuracy for comparison.
avg_raining = predictions.agg({'raining': 'mean'}).collect()[0][0]
print("avg(raining): ", avg_raining)
print("avg(correct): ", accuracy)

avg(raining):  0.11627906976744186
avg(correct):  0.9302325581395349


In [22]:
# Apply the trained model to the live feature stream.
final_df = dt_model.transform(assembler.transform(features))

# Print each newly predicted (station, date) row to the console every 5 seconds.
s = (
    final_df.select('station', 'date', 'prediction')
    .writeStream.format("console")
    .trigger(processingTime="5 seconds")
    .outputMode("append")
    .start()
)
try:
    # Let the query run for at most 30 seconds.
    s.awaitTermination(30)
finally:
    # Always stop the query, even if the wait is interrupted or raises,
    # so no streaming job is left running in the kernel.
    s.stop()

23/04/29 05:40:36 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c3ef3f84-3a53-4973-bcbe-20ba2d0ad96b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/29 05:40:36 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      J|2000-01-13|       0.0|
|      I|2000-01-16|       0.0|
|      J|2000-01-18|       0.0|
|      J|2000-01-19|       0.0|
|      I|2000-01-20|       0.0|
|      J|2000-01-31|       0.0|
|      N|2000-01-03|       0.0|
|      N|2000-01-21|       0.0|
|      N|2000-01-30|       0.0|
|      K|2000-01-03|       0.0|
|      B|2000-01-04|       0.0|
|      B|2000-01-06|       0.0|
|      C|2000-01-06|       0.0|
|      C|2000-01-08|       0.0|
|      K|2000-01-09|       0.0|
|      A|2000-01-10|       0.0|
|      B|2000-01-12|       0.0|
|      L|2000-01-13|       0.0|
|      L|2000-01-14|       0.0|
|      B|2000-01-17|       1.0|
+-------+----------+----------+
only showing top 20 rows



23/04/29 05:40:49 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 12710 milliseconds


-------------------------------------------
Batch: 1
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      K|2000-01-31|       0.0|
|      O|2000-01-31|       0.0|
|      N|2000-01-31|       0.0|
|      M|2000-01-31|       0.0|
|      A|2000-02-01|       0.0|
|      B|2000-02-01|       1.0|
|      L|2000-01-31|       0.0|
|      D|2000-02-01|       0.0|
|      C|2000-02-01|       0.0|
+-------+----------+----------+



23/04/29 05:40:55 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 6376 milliseconds


-------------------------------------------
Batch: 2
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      I|2000-02-01|       0.0|
|      J|2000-02-01|       1.0|
|      G|2000-02-01|       0.0|
|      H|2000-02-01|       0.0|
|      E|2000-02-01|       0.0|
|      F|2000-02-01|       0.0|
+-------+----------+----------+



23/04/29 05:41:01 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 5954 milliseconds
23/04/29 05:41:06 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@444812ec is aborting.
23/04/29 05:41:06 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@444812ec aborted.
23/04/29 05:41:06 WARN Shell: Interrupted while joining on: Thread[Thread-9709,5,main]
java.lang.InterruptedException
	at java.lang.Object.wait(Native Method)
	at java.lang.Thread.join(Thread.java:1257)
	at java.lang.Thread.join(Thread.java:1331)
	at org.apache.hadoop.util.Shell.joinThread(Shell.java:1043)
	at org.apache.hadoop.util.Shell.runCommand(Shell.java:1003)
	at org.apache.hadoop.util.Shell.run(Shell.java:901)
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1213)
	at org.apache.hadoop.util.S