# Part 3: Spark Streaming

In [1]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder.appName("cs544")
         .config("spark.sql.shuffle.partitions", 10)
         .config("spark.ui.showConsoleProgress", False)
         .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.2')
         .getOrCreate())

df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "stations-json")
    .option("startingOffsets", "earliest")
    .load()
)

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-51e536d0-6b73-494c-a882-c504634e91fe;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.2 in central
	found org.apache.kafka#kafka-clients;2.8.1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.ap

In [4]:
from pyspark.sql.functions import col, from_json

schema = "station STRING, date DATE, degrees DOUBLE, raining INT"
stations = (df.select(col("key").cast("string"),
          from_json(col("value").cast("string"), schema).alias("value"))
    .select("key", "value.*"))

## Stats

In [5]:
import pyspark.sql.functions as functions

counts_df = stations.groupBy("station")\
                .agg(functions.min("date").alias("start"), \
                     functions.max("date").alias("end"), \
                     functions.count("*").alias("measurements"), \
                     functions.avg("degrees").alias("avg"), \
                     functions.max("degrees").alias("max")) \
                .sort("station")
counts_df

DataFrame[station: string, start: date, end: date, measurements: bigint, avg: double, max: double]

In [7]:
s = counts_df.writeStream.format("console").trigger(processingTime="5 seconds").outputMode("complete").start()
s.awaitTermination(30)
s.stop()

23/04/29 04:25:43 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-9722e7dc-c846-48b6-b216-88dd38911ebe. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/29 04:25:43 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+-------+----------+----------+------------+------------------+------------------+
|station|     start|       end|measurements|               avg|               max|
+-------+----------+----------+------------+------------------+------------------+
|      A|2000-01-01|2019-06-02|        7093|56.668704420792686|107.44151918167982|
|      B|2000-01-01|2019-06-02|        7093| 48.10064640296037| 99.58872422458987|
|      C|2000-01-01|2019-06-02|        7093| 43.94465266851925|  93.4385638974453|
|      D|2000-01-01|2019-06-02|        7093| 63.15489726310973|112.74755001728609|
|      E|2000-01-01|2019-06-02|        7093| 57.13525259560014| 110.0060171182283|
|      F|2000-01-01|2019-06-02|        7093| 68.53567341979365|122.13936667629562|
|      G|2000-01-01|2019-06-02|        7093| 67.61963924542304|121.37722118686678|
|      H|2000-01-01|2019-06-02|        7093| 67.36593873868541| 117.31044

23/04/29 04:25:51 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 7697 milliseconds


-------------------------------------------
Batch: 1
-------------------------------------------
+-------+----------+----------+------------+------------------+------------------+
|station|     start|       end|measurements|               avg|               max|
+-------+----------+----------+------------+------------------+------------------+
|      A|2000-01-01|2019-06-07|        7098|56.685522070683334|107.44151918167982|
|      B|2000-01-01|2019-06-07|        7098| 48.11131828093756| 99.58872422458987|
|      C|2000-01-01|2019-06-07|        7098| 43.96138812007781|  93.4385638974453|
|      D|2000-01-01|2019-06-07|        7098|  63.1714622293232|112.74755001728609|
|      E|2000-01-01|2019-06-07|        7098| 57.15407008549074| 110.0060171182283|
|      F|2000-01-01|2019-06-07|        7098| 68.54157883115742|122.13936667629562|
|      G|2000-01-01|2019-06-07|        7098| 67.62998916057417|121.37722118686678|
|      H|2000-01-01|2019-06-07|        7098|  67.3791004958546| 117.31044

## Rain Forecast Dataset

In [8]:
# Creating today DataFrame
today = stations.select("station", "date", "raining")

In [9]:
# Creating features DataFrame
df_yesterday = stations.select("station",
                               col("degrees").alias("sub1degrees"), \
                               col("raining").alias("sub1raining"), \
                               functions.date_add("date", 1).alias("date"))

df_two_days_ago = stations.select("station",
                                  col("degrees").alias("sub2degrees"), \
                                  col("raining").alias("sub2raining"), \
                                  functions.date_add("date", 2).alias("date"))
features = df_yesterday.join(df_two_days_ago, \
                             (df_yesterday.date == df_two_days_ago.date) & (df_yesterday.station == df_two_days_ago.station)) \
                    .select(df_yesterday.station, df_yesterday.date, functions.month(df_yesterday.date).alias("month"), "sub1degrees", "sub1raining", "sub2degrees", "sub2raining")

In [10]:
new_df = features.join(today, ["date", "station"])

s1 = (
    new_df
    .repartition(1)
    .writeStream.format("parquet")
    .outputMode("Append")
    .option("checkpointLocation", "checkpoint")
    .option("path", "pfiles")
    .trigger(processingTime="60 seconds")
    .start()
)

23/04/29 04:26:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


# Part 4: Spark ML

## Training and Evaluation

In [11]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [12]:
data = spark.read.format("parquet").load("pfiles")

In [13]:
va = VectorAssembler(inputCols=["month", "sub1degrees", "sub1raining", "sub2degrees", "sub2raining"], outputCol="features")
data = va.transform(data)

In [14]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

In [15]:
dt_classifier = DecisionTreeClassifier(featuresCol="features", labelCol="raining", predictionCol="prediction", maxDepth=5)
dt_model = dt_classifier.fit(train_data)
print(dt_model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_8c21bd7bd8d1, depth=5, numNodes=17, numClasses=2, numFeatures=5
  If (feature 2 <= 0.5)
   Predict: 0.0
  Else (feature 2 > 0.5)
   If (feature 1 <= 39.6116668769335)
    If (feature 1 <= 36.913191004253875)
     If (feature 0 <= 2.5)
      Predict: 0.0
     Else (feature 0 > 2.5)
      If (feature 1 <= 34.20731431135777)
       Predict: 0.0
      Else (feature 1 > 34.20731431135777)
       Predict: 1.0
    Else (feature 1 > 36.913191004253875)
     If (feature 0 <= 1.5)
      If (feature 3 <= 45.125769741185344)
       Predict: 0.0
      Else (feature 3 > 45.125769741185344)
       Predict: 1.0
     Else (feature 0 > 1.5)
      If (feature 3 <= 27.288838352542765)
       Predict: 0.0
      Else (feature 3 > 27.288838352542765)
       Predict: 1.0
   Else (feature 1 > 39.6116668769335)
    Predict: 1.0



In [16]:
predictions = dt_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="raining")
accuracy = evaluator.evaluate(predictions)

rain_count = test_data.select().filter(test_data.raining == 1).count()
total_count = test_data.select(test_data.raining).count()
rain_frequency = rain_count / total_count

print("Accuracy:", accuracy)
print("Percent of the time it is raining:", rain_frequency)

Accuracy: 0.7871331969860532
Percent of the time it is raining: 0.3531218483542679


## Model Deployment

In [17]:
features = va.transform(features)

In [20]:
weather_predictions = dt_model.transform(features).select("station", "date", "prediction").filter("station='A'")
s2 = weather_predictions.writeStream.format("console").trigger(processingTime="5 seconds").start()
s2.awaitTermination(40)
s2.stop()

23/04/29 04:36:34 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-42c65849-b9c0-4284-8596-47500cbf53e3. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/04/29 04:36:34 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


-------------------------------------------
Batch: 0
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      A|2000-01-13|       0.0|
|      A|2000-01-21|       0.0|
|      A|2000-01-23|       0.0|
|      A|2000-01-28|       0.0|
|      A|2000-02-01|       0.0|
|      A|2000-02-11|       0.0|
|      A|2000-02-16|       0.0|
|      A|2000-02-19|       0.0|
|      A|2000-02-21|       0.0|
|      A|2000-03-01|       0.0|
|      A|2000-03-04|       0.0|
|      A|2000-03-25|       1.0|
|      A|2000-03-29|       1.0|
|      A|2000-04-21|       0.0|
|      A|2000-04-25|       1.0|
|      A|2000-05-10|       0.0|
|      A|2000-05-22|       1.0|
|      A|2000-06-01|       1.0|
|      A|2000-06-04|       1.0|
|      A|2000-07-01|       1.0|
+-------+----------+----------+
only showing top 20 rows



23/04/29 04:36:48 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 13858 milliseconds


-------------------------------------------
Batch: 1
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      A|2021-03-21|       1.0|
|      A|2021-03-22|       1.0|
|      A|2021-03-23|       0.0|
|      A|2021-03-17|       0.0|
|      A|2021-03-14|       1.0|
|      A|2021-03-20|       1.0|
|      A|2021-03-19|       1.0|
|      A|2021-03-18|       1.0|
|      A|2021-03-13|       1.0|
|      A|2021-03-15|       1.0|
|      A|2021-03-16|       1.0|
+-------+----------+----------+

-------------------------------------------
Batch: 2
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      A|2021-03-24|       0.0|
|      A|2021-03-26|       1.0|
|      A|2021-03-25|       0.0|
|      A|2021-03-27|       1.0|
|      A|2021-03-28|       1.0|
+-------+----------+----------+



23/04/29 04:36:59 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 6199 milliseconds


-------------------------------------------
Batch: 3
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      A|2021-03-29|       1.0|
|      A|2021-04-01|       1.0|
|      A|2021-03-30|       1.0|
|      A|2021-03-31|       1.0|
|      A|2021-04-02|       1.0|
|      A|2021-04-03|       1.0|
+-------+----------+----------+

-------------------------------------------
Batch: 4
-------------------------------------------
+-------+----------+----------+
|station|      date|prediction|
+-------+----------+----------+
|      A|2021-04-07|       0.0|
|      A|2021-04-06|       0.0|
|      A|2021-04-04|       0.0|
|      A|2021-04-08|       1.0|
|      A|2021-04-05|       0.0|
+-------+----------+----------+

