In [146]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# These two versions are interpolated into the connector's Maven coordinate below;
# set them to match the local installation.
scala_version = '2.12'  # your scala version
spark_version = '3.0.1' # your spark version

# Maven coordinates handed to Spark through "spark.jars.packages".
kafka_connector_pkg = f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}'
kafka_clients_pkg = 'org.apache.kafka:kafka-clients:2.8.0'  # your kafka version
packages = [kafka_connector_pkg, kafka_clients_pkg]

# Local-mode session named "kafka-example" with the Kafka packages attached.
# NOTE(review): getOrCreate() may hand back an already-running session, in which
# case the package setting may not take effect — restart the kernel if the
# Kafka source cannot be found.
session_builder = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .config("spark.jars.packages", ",".join(packages))
)
spark = session_builder.getOrCreate()

# Last expression of the cell: render the session summary.
spark

In [147]:
# Echo the Maven coordinates that were joined into "spark.jars.packages" when the session was built.
packages

['org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1',
 'org.apache.kafka:kafka-clients:2.8.0']

## Creating a Kafka Source for Batch Queries
If you have a use case that is better suited to batch processing, you can create a Dataset/DataFrame for a defined range of offsets.

In [148]:
# Connection settings shared by the Kafka reads in this notebook.
topic_name = 'RandomNumber'
kafka_server = 'localhost:9092'

# Batch (non-streaming) reader: subscribe to the topic and start from the
# earliest available offset.
kafka_batch_reader = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic_name)
    .option("startingOffsets", "earliest")
)
kafkaDf = kafka_batch_reader.load()

In [149]:
# Convert the batch read to a pandas DataFrame so the notebook renders it as a table
# (key, value, topic, partition, offset, timestamp, timestampType).
# NOTE(review): toPandas() pulls the whole result onto the driver — fine for this small
# demo topic; confirm before pointing it at a large one.
kafkaDf.toPandas()

Unnamed: 0,key,value,topic,partition,offset,timestamp,timestampType
0,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,0,2022-10-05 15:18:37.301,0
1,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,1,2022-10-05 15:18:42.314,0
2,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,2,2022-10-05 15:18:47.327,0
3,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,3,2022-10-05 15:18:52.341,0
4,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,4,2022-10-05 15:18:57.352,0
5,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,5,2022-10-05 15:19:02.363,0
6,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,6,2022-10-05 15:19:07.376,0
7,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,7,2022-10-05 15:19:12.395,0
8,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,8,2022-10-05 15:19:17.411,0
9,,"[123, 34, 110, 117, 109, 98, 101, 114, 34, 58,...",RandomNumber,0,9,2022-10-05 15:19:22.417,0


In [150]:
from pyspark.sql.functions import col, concat, get_json_object, lit

# Keep re-running this cell to see new messages from the Kafka topic.
#
# The message value is JSON whose raw bytes start with `{"number":` (see the
# table rendered by the previous cell). The number is extracted by field name
# rather than by slicing one character at a fixed offset: the fixed slice
# truncated multi-digit numbers and silently returned the wrong character if
# the producer changed its JSON spacing. `rand_number` is still a string column.
kafkaDf.select(
    col('topic'),
    col('offset'),
    get_json_object(col('value').cast('string'), '$.number').alias('rand_number'),
).toPandas()

Unnamed: 0,topic,offset,rand_number
0,RandomNumber,0,2
1,RandomNumber,1,4
2,RandomNumber,2,7
3,RandomNumber,3,7
4,RandomNumber,4,3
5,RandomNumber,5,7
6,RandomNumber,6,6
7,RandomNumber,7,9
8,RandomNumber,8,7
9,RandomNumber,9,3


In [151]:
# Imported in this cell so it runs regardless of which names earlier cells imported.
from pyspark.sql.functions import get_json_object

# Projection of the batch read: topic, offset and the number carried in the
# JSON message value. The number is read from the "number" field by name; a
# fixed-offset one-character slice would truncate multi-digit numbers and
# depend on the producer's exact JSON spacing. `rand_number` is a string column.
batchDF = kafkaDf.select(
    col('topic'),
    col('offset'),
    get_json_object(col('value').cast('string'), '$.number').alias('rand_number'),
)


from time import sleep
from IPython.display import display, clear_output

# Live view of batchDF: render it, wait, clear the cell output, repeat.
# Runs for at most 2000 refreshes; interrupting the kernel ends it early.
refresh_seconds = 5
for refresh_count in range(2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        # Derived from the refresh count, so it does not include time spent in toPandas().
        print(f"Seconds passed: {refresh_count * refresh_seconds}")
        snapshot = batchDF.toPandas()
        display(snapshot)
        sleep(refresh_seconds)
        # wait=True defers clearing until the next output is ready.
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 10


Unnamed: 0,topic,offset,rand_number
0,RandomNumber,0,2
1,RandomNumber,1,4
2,RandomNumber,2,7
3,RandomNumber,3,7
4,RandomNumber,4,3
5,RandomNumber,5,7
6,RandomNumber,6,6
7,RandomNumber,7,9
8,RandomNumber,8,7
9,RandomNumber,9,3


break
Live view ended...


In [152]:
# Occurrences of each rand_number value in the batch projection.
batchCountDF = batchDF.groupBy('rand_number').count()

# Live view of the counts: render, wait, clear the cell output, repeat.
# Runs for at most 2000 refreshes; interrupting the kernel ends it early.
poll_interval = 5
for refresh_count in range(2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        # Derived from the refresh count, so it does not include time spent in toPandas().
        print(f"Seconds passed: {refresh_count * poll_interval}")
        counts_snapshot = batchCountDF.toPandas()
        display(counts_snapshot)
        sleep(poll_interval)
        # wait=True defers clearing until the next output is ready.
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 5


Unnamed: 0,rand_number,count
0,7,5
1,3,2
2,8,1
3,0,1
4,5,1
5,6,3
6,9,1
7,1,1
8,4,1
9,2,1


break
Live view ended...


## Creating a Kafka Source for Streaming Queries

In [153]:

# Imported in this cell so it runs regardless of which names earlier cells imported.
from pyspark.sql.functions import get_json_object

# Streaming read of the same topic and broker as the batch section.
streamRawDf = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_server)
    .option("subscribe", topic_name)
    .load()
)

# Topic, offset and the number carried in the JSON message value. The number is
# read from the "number" field by name; a fixed-offset one-character slice would
# truncate multi-digit numbers and depend on the producer's exact JSON spacing.
# `rand_number` is a string column.
streamDF = streamRawDf.select(
    col('topic'),
    col('offset'),
    get_json_object(col('value').cast('string'), '$.number').alias('rand_number'),
)

# Same rows plus a boolean flag: rand_number cast to int and tested for evenness.
checkEvenDF = streamDF.withColumn('Is_Even', col('rand_number').cast('int') % 2 == 0)


In [154]:
from random import randint
# Random suffix shared by both query names — presumably so that re-running this
# cell does not reuse the name of a query started by a previous run.
randNum = str(randint(0, 10000))
q1name = f"queryNumber{randNum}"
q2name = f"queryCheckEven{randNum}"

# Both writers append to an in-memory sink every 5 seconds; the sink is later
# read back through spark.sql using the query name as the table name.
stream_writer1 = (
    streamDF.writeStream
    .queryName(q1name)
    .trigger(processingTime="5 seconds")
    .outputMode("append")
    .format("memory")
)
stream_writer2 = (
    checkEvenDF.writeStream
    .queryName(q2name)
    .trigger(processingTime="5 seconds")
    .outputMode("append")
    .format("memory")
)

# Start the raw-number query first, then the even/odd query.
query1 = stream_writer1.start()
query2 = stream_writer2.start()


In [155]:
# Live view of the two streaming queries: read each in-memory table by its query
# name, render both, wait, clear the cell output, repeat.
# Runs for at most 2000 refreshes; interrupting the kernel ends it early.
# NOTE(review): query1/query2 are not stopped in this cell, so they appear to keep
# running after the view ends — confirm whether query.stop() is called elsewhere.
for refresh_count in range(2000):
    try:
        print("Showing live view refreshed every 5 seconds")
        # Derived from the refresh count, so it does not include time spent querying.
        print(f"Seconds passed: {refresh_count * 5}")
        # Both SQL lookups are issued, in order, before anything is displayed.
        result1, result2 = (
            spark.sql(f"SELECT * from {running_query.name}")
            for running_query in (query1, query2)
        )
        for live_table in (result1, result2):
            display(live_table.toPandas())
        sleep(5)
        # wait=True defers clearing until the next output is ready.
        clear_output(wait=True)
    except KeyboardInterrupt:
        print("break")
        break
print("Live view ended...")

Showing live view refreshed every 5 seconds
Seconds passed: 20


Unnamed: 0,topic,offset,rand_number
0,RandomNumber,21,0
1,RandomNumber,22,8
2,RandomNumber,23,2
3,RandomNumber,24,1


Unnamed: 0,topic,offset,rand_number,Is_Even
0,RandomNumber,21,0,True
1,RandomNumber,22,8,True
2,RandomNumber,23,2,True
3,RandomNumber,24,1,False


break
Live view ended...
