# Spark Structured Streaming Application

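This notebook reads JSON events from the Kafka topic `to_predict` with Spark Structured Streaming, parses them with an explicit schema, scores them with a saved Spark ML pipeline model, and writes the predicted probabilities to the console, to the Kafka topic `predictions`, and to a PostgreSQL table named `predictions`.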



In [None]:
# Install pre-requisites
!pip install ipython-sql psycopg2-binary pyspark==3.2.1

In [None]:
# Load the `sql` IPython extension so the %%sql cells further down can run
%load_ext sql

                                                                                

In [1]:
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DecimalType,
)
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DoubleType,
)
from pyspark.sql.functions import (
    split,
    regexp_replace,
    current_date,
    unix_timestamp,
    lit,
    current_timestamp,
)

from pyspark.sql.functions import col, from_json, struct, to_json
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

import pandas as pd
import pickle
import json
import time
import os


In [2]:
# Extra JVM packages (Kafka connectors and the PostgreSQL JDBC driver) requested
# through PYSPARK_SUBMIT_ARGS. This is set before the SparkSession is built below.
# NOTE(review): the connector artifacts are 3.2.0 while pyspark is pinned to 3.2.1
# in the install cell; confirm whether the versions should be aligned.
os.environ[
    "PYSPARK_SUBMIT_ARGS"
] = "--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,org.postgresql:postgresql:42.1.1 pyspark-shell"


# Every setting can be overridden through an environment variable of the same name.
# The default application name is the one that used to be hard-coded in the builder,
# so the name of the running application is unchanged unless APP_NAME is set.
APP_NAME = os.getenv("APP_NAME", "Spark Structured Streaming Application")
MASTER = os.getenv("MASTER", "local[*]")

# 192.168.49.2 == minikube ip
# 30323 == NodePort
# Alternative bootstrap address: "kafka-cluster-kafka-bootstrap:9092"
KAFKA_HOST = os.getenv("KAFKA_HOST", "192.168.49.2:30323")

# MASTER = "spark://carloshkayser:7077"

spark = SparkSession.builder.appName(APP_NAME).master(MASTER).getOrCreate()

# Only log errors
spark.sparkContext.setLogLevel('ERROR')

spark


22/06/09 01:48:39 WARN Utils: Your hostname, carloshkayser resolves to a loopback address: 127.0.1.1; using 10.32.45.215 instead (on interface ens160)
22/06/09 01:48:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/kayser/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/kayser/.ivy2/cache
The jars for the packages stored in: /home/kayser/.ivy2/jars
org.apache.spark#spark-streaming-kafka-0-10_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c97df09a-1c8a-4da6-874a-b1644037148d;1.0
	confs: [default]
	found org.apache.spark#spark-streaming-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubati

In [3]:
# Streaming source: subscribe to the Kafka topic "to_predict" on KAFKA_HOST,
# starting from the latest offsets.
df_raw = (
    spark.readStream.format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": KAFKA_HOST,
            "subscribe": "to_predict",
            "startingOffsets": "latest",
        }
    )
    .load()
)


In [4]:
# Show the columns of the raw Kafka source DataFrame
df_raw.printSchema()


root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [5]:
# Sample of one JSON message:
# {
#   "id": 1.0079274744188029e+19,
#   "hour": 14103100,
#   "C1": 1005,
#   "banner_pos": 0,
#   "site_id": "85f751fd",
#   "site_domain": "c4e18dd6",
#   "site_category": "50e219e0",
#   "app_id": "febd1138",
#   "app_domain": "82e27996",
#   "app_category": "0f2161f8",
#   "device_id": "a99f214a",
#   "device_ip": "b72692c8",
#   "device_model": "99e427c9",
#   "device_type": 1,
#   "device_conn_type": 0,
#   "C14": 21611,
#   "C15": 320,
#   "C16": 50,
#   "C17": 2480,
#   "C18": 3,
#   "C19": 299,
#   "C20": 100111,
#   "C21": 61
# }

# Schema used to parse the JSON messages. Every field is nullable.
# The sample id above is around 1e19, which does not fit a 64-bit long,
# hence DecimalType(38, 0).
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("id", DecimalType(38, 0)),
            ("hour", IntegerType()),
            ("C1", IntegerType()),
            ("banner_pos", IntegerType()),
            ("site_id", StringType()),
            ("site_domain", StringType()),
            ("site_category", StringType()),
            ("app_id", StringType()),
            ("app_domain", StringType()),
            ("app_category", StringType()),
            ("device_id", StringType()),
            ("device_ip", StringType()),
            ("device_model", StringType()),
            ("device_type", IntegerType()),
            ("device_conn_type", IntegerType()),
            ("C14", IntegerType()),
            ("C15", IntegerType()),
            ("C16", IntegerType()),
            ("C17", IntegerType()),
            ("C18", IntegerType()),
            ("C19", IntegerType()),
            ("C20", IntegerType()),
            ("C21", IntegerType()),
        )
    ]
)


In [6]:
# Cast the Kafka record value to a string, parse it as JSON with `schema`,
# and expand the parsed struct into one top-level column per field.
df = df_raw.select(
    from_json(col("value").cast("string"), schema).alias("data")
).select("data.*")

df.printSchema()


root
 |-- id: decimal(38,0) (nullable = true)
 |-- hour: integer (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)



In [7]:
from pyspark.ml import PipelineModel

# Load the saved pipeline model from a path relative to the working directory
pipelineModel = PipelineModel.load("model/spark-logistic-regression-model")

# Run the parsed stream through the pipeline; `results` keeps the input columns
# and gains the columns the pipeline stages add (see the schema printed below)
results = pipelineModel.transform(df)

# Show the schema of the transformed stream
results.printSchema()


                                                                                

root
 |-- id: decimal(38,0) (nullable = true)
 |-- hour: integer (nullable = true)
 |-- C1: integer (nullable = true)
 |-- banner_pos: integer (nullable = true)
 |-- site_id: string (nullable = true)
 |-- site_domain: string (nullable = true)
 |-- site_category: string (nullable = true)
 |-- app_id: string (nullable = true)
 |-- app_domain: string (nullable = true)
 |-- app_category: string (nullable = true)
 |-- device_id: string (nullable = true)
 |-- device_ip: string (nullable = true)
 |-- device_model: string (nullable = true)
 |-- device_type: integer (nullable = true)
 |-- device_conn_type: integer (nullable = true)
 |-- C14: integer (nullable = true)
 |-- C15: integer (nullable = true)
 |-- C16: integer (nullable = true)
 |-- C17: integer (nullable = true)
 |-- C18: integer (nullable = true)
 |-- C19: integer (nullable = true)
 |-- C20: integer (nullable = true)
 |-- C21: integer (nullable = true)
 |-- site_idIndex: double (nullable = false)
 |-- site_domainIndex: double (nulla

In [8]:
from pyspark.ml.functions import vector_to_array

# Keep only what the sinks need: the event id, the probability stored at index 1
# of the model's "probability" vector, and the processing timestamp.
#
# vector_to_array reads the element directly from the ML vector. This replaces the
# previous cast-to-string + regex + split approach, which depended on the textual
# form of the vector and used a non-raw regex literal with invalid escapes.
#
# NOTE: this cell rebinds `results`. Re-run the cell that calls
# pipelineModel.transform(df) before re-running this one, because after this cell
# "probability" is a double and no longer a vector.
results = results.select(
    "id",
    vector_to_array(col("probability"))[1].alias("probability"),
    current_timestamp().alias("processed_at"),
)

# Kafka sink payload: a single JSON string column named "value"
results_kafka = results.select(
    to_json(struct("id", "probability", "processed_at")).alias("value")
)

# PostgreSQL sink payload: one column per field of the `predictions` table
results_postgres = results.select(
    "id", "probability", "processed_at"
)

In [9]:
# Show the schema of the DataFrame that is written to Kafka
results_kafka.printSchema()

root
 |-- value: string (nullable = true)



In [10]:
# Show the schema of the DataFrame that is written to PostgreSQL
results_postgres.printSchema()


root
 |-- id: decimal(38,0) (nullable = true)
 |-- probability: double (nullable = true)
 |-- processed_at: timestamp (nullable = false)



### Logging the data stream in the console

In [11]:
# Print the prediction stream to the console for a short while.
query = (
    results.select("id", "probability", "processed_at")
    .writeStream
    .format("console")
    .start()
)

try:
    # Wait up to 10 seconds. Unlike time.sleep, awaitTermination raises if the
    # query has failed, so a broken stream does not go unnoticed.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the wait raised or the cell was interrupted.
    query.stop()


                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---+-----------+------------+
| id|probability|processed_at|
+---+-----------+------------+
+---+-----------+------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+------------------+--------------------+
|                  id|       probability|        processed_at|
+--------------------+------------------+--------------------+
|10731237011332368384|0.6400365553800906|2022-06-09 01:49:...|
|10731237011332368384|0.6400365553800906|2022-06-09 01:49:...|
|10731273124311236608|0.2563480426642468|2022-06-09 01:49:...|
|10731237011332368384|0.6400365553800906|2022-06-09 01:49:...|
|10731273124311236608|0.2563480426642468|2022-06-09 01:49:...|
|10731273124311236608|0.2563480426642468|2022-06-09 01:49:...|
+--------------------+------------------+--------------------+

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+-------------------+--------------------+
|                  id|        probability|        processed_at|
+--------------------+-------------------+-----

22/06/09 01:49:08 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@c17a26 is aborting.
22/06/09 01:49:08 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@c17a26 aborted.
22/06/09 01:49:08 ERROR Utils: Aborting task                        (1 + 1) / 2]
org.apache.spark.TaskKilledException
	at org.apache.spark.TaskContextImpl.killTaskIfInterrupted(TaskContextImpl.scala:216)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:36)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:513)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCod

### Inserting data stream transformation results into another Apache Kafka topic

In [12]:
# Write the JSON-encoded predictions to the Kafka topic "predictions".
# The checkpoint directory is dedicated to this query: a checkpoint location must
# not be shared between streaming queries, and the PostgreSQL query in this
# notebook previously pointed at the same "checkpointLocation" directory.
query = (
    results_kafka.writeStream.format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("topic", "predictions")
    .option("checkpointLocation", "checkpoints/kafka-predictions")
    .outputMode("append")
    .start()
)

In [13]:
!kafka-console-consumer --bootstrap-server localhost:9092 --topic predictions

[2022-06-09 01:49:10,221] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:10,222] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:10,320] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:10,321] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:10,421] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275

                                                                                

[2022-06-09 01:49:12,887] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:12,887] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)


[Stage 59:>                                                         (0 + 1) / 1]

[2022-06-09 01:49:14,044] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:14,044] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)


                                                                                

[2022-06-09 01:49:15,050] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:15,050] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)


[Stage 60:>                                                         (0 + 3) / 3]

[2022-06-09 01:49:16,209] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:16,209] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)


                                                                                

[2022-06-09 01:49:17,219] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:17,220] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)


[Stage 61:>                                                         (0 + 2) / 2]

[2022-06-09 01:49:18,275] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Connection to node -1 (localhost/127.0.0.1:9092) could not be established. Broker may not be available. (org.apache.kafka.clients.NetworkClient)
[2022-06-09 01:49:18,275] WARN [Consumer clientId=console-consumer, groupId=console-consumer-74275] Bootstrap broker localhost:9092 (id: -1 rack: null) disconnected (org.apache.kafka.clients.NetworkClient)


                                                                                

^C
Processed a total of 0 messages


[Stage 65:>                                                         (0 + 3) / 3]

In [14]:
# Stop the streaming query currently bound to `query` (in notebook order, the Kafka sink)
query.stop()

22/06/09 01:49:25 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@61cf7630 is aborting.
22/06/09 01:49:25 ERROR WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@61cf7630 aborted.
22/06/09 01:49:25 ERROR Utils: Aborting task
org.apache.spark.TaskKilledException
	at org.apache.spark.TaskContextImpl.killTaskIfInterrupted(TaskContextImpl.scala:216)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:36)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:513)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeSt

### Inserting data stream into PostgreSQL database

In [None]:
# Start a detached PostgreSQL container named "postgres" (image postgres:11.7-alpine),
# with POSTGRES_PASSWORD set to "postgres" and port 5432 published on the host
!docker run -d -e POSTGRES_PASSWORD=postgres -p 5432:5432 --name postgres postgres:11.7-alpine

In [None]:
# Print the logs of the "postgres" container
!docker logs postgres

In [None]:
%%sql postgresql://postgres:postgres@localhost:5432/postgres

-- Target table for the streaming predictions.
-- IF NOT EXISTS keeps this cell re-runnable when the table is already there.
CREATE TABLE IF NOT EXISTS predictions (
	id DECIMAL(38, 0),
	probability DOUBLE PRECISION,
	processed_at TIMESTAMP
);

In [15]:
def foreach_batch_function(df, epoch_id):
    """Append one micro-batch to the PostgreSQL table `predictions` over JDBC.

    Connection settings are read from the environment when the batch is written,
    falling back to the local demo database used by this notebook:

    - POSTGRES_JDBC_URL  (default "jdbc:postgresql://localhost:5432/postgres")
    - POSTGRES_USER      (default "postgres")
    - POSTGRES_PASSWORD  (default "postgres")

    Args:
        df: DataFrame holding the rows of the current micro-batch.
        epoch_id: Identifier of the micro-batch; not used.
    """
    (
        df.write.format("jdbc")
        .option(
            "url",
            os.getenv("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres"),
        )
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", "predictions")
        .option("user", os.getenv("POSTGRES_USER", "postgres"))
        .option("password", os.getenv("POSTGRES_PASSWORD", "postgres"))
        .mode("append")
        .save()
    )

# Write each micro-batch to PostgreSQL through foreach_batch_function.
# The checkpoint directory is dedicated to this query: a checkpoint location must
# not be shared between streaming queries, and the Kafka sink query in this
# notebook also used a directory named "checkpointLocation".
query = (
    results_postgres.writeStream.foreachBatch(foreach_batch_function)
    .option("checkpointLocation", "checkpoints/postgres-predictions")
    .outputMode("update")
    .start()
)


                                                                                

In [21]:
%%sql postgresql://postgres:postgres@localhost:5432/postgres

-- Count the rows currently stored in the predictions table
SELECT COUNT(*) FROM PREDICTIONS;


1 rows affected.


count
126


                                                                                

In [None]:
%%sql postgresql://postgres:postgres@localhost:5432/postgres

-- List every stored prediction, most recently processed first
SELECT
	*
FROM 
	PREDICTIONS
ORDER BY
	PROCESSED_AT DESC;


In [None]:
# Stop the streaming query currently bound to `query`
# (in notebook order, the PostgreSQL foreachBatch query)
query.stop()

In [None]:
# # Create Temp View
# df.createOrReplaceTempView("dataframe")

# # Apply UDF in SQL query.
# resultDF = spark.sql("select predict(*) as up_down_udf from dataframe")
