In [1]:
!pip install kafka-python

Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl (246 kB)
     |████████████████████████████████| 246 kB 5.0 MB/s            
[?25hInstalling collected packages: kafka-python
Successfully installed kafka-python-2.0.2
You should consider upgrading via the '/home/kayser/.cache/pypoetry/virtualenvs/spark-ml-training-Z1-rjbZ7-py3.9/bin/python -m pip install --upgrade pip' command.[0m


# Ingesting data in Apache Kafka

In [6]:
from kafka import KafkaProducer
from time import sleep

import pandas as pd
import random
import decimal
import json
import os


In [7]:
# Kafka connection settings used by the cells below.
# Each value can be overridden through an environment variable of the same
# name; the defaults target a local single-broker setup.
KAFKA_HOST = os.environ.get("KAFKA_HOST", "localhost:9092")  # bootstrap server(s), "host:port[,host:port...]"
KAFKA_TOPIC = os.environ.get("KAFKA_TOPIC", "demo")  # topic written by the producer cell

In [9]:
# Publish 1000 synthetic CPU/memory samples to KAFKA_TOPIC as UTF-8 JSON.
producer = KafkaProducer(bootstrap_servers=KAFKA_HOST)

try:
    for i in range(1000):
        # One sample: sequential id plus random cpu/memory values in [0, 100],
        # rounded to two decimals.
        message = {
            "id": i,
            "cpu": round(random.uniform(0, 100), 2),
            "memory": round(random.uniform(0, 100), 2),
        }
        producer.send(KAFKA_TOPIC, json.dumps(message).encode("utf-8"))

        print(message)

    # send() only enqueues the record; flush once after the loop so the
    # client can batch instead of blocking on every single message.
    producer.flush()
finally:
    # Always release the producer's network resources, even if sending fails.
    producer.close()

{'id': 0, 'cpu': 81.75, 'memory': 81.12}
{'id': 1, 'cpu': 71.29, 'memory': 23.14}
{'id': 2, 'cpu': 28.48, 'memory': 17.97}
{'id': 3, 'cpu': 33.96, 'memory': 60.82}
{'id': 4, 'cpu': 15.02, 'memory': 41.28}
{'id': 5, 'cpu': 16.42, 'memory': 86.39}
{'id': 6, 'cpu': 71.33, 'memory': 49.86}
{'id': 7, 'cpu': 16.06, 'memory': 88.89}
{'id': 8, 'cpu': 84.49, 'memory': 28.3}
{'id': 9, 'cpu': 40.9, 'memory': 10.79}
{'id': 10, 'cpu': 25.58, 'memory': 9.04}
{'id': 11, 'cpu': 61.59, 'memory': 40.86}
{'id': 12, 'cpu': 75.12, 'memory': 9.19}
{'id': 13, 'cpu': 61.07, 'memory': 0.43}
{'id': 14, 'cpu': 3.13, 'memory': 47.56}
{'id': 15, 'cpu': 15.22, 'memory': 61.58}
{'id': 16, 'cpu': 14.28, 'memory': 24.01}
{'id': 17, 'cpu': 30.25, 'memory': 70.21}
{'id': 18, 'cpu': 30.72, 'memory': 28.39}
{'id': 19, 'cpu': 99.43, 'memory': 16.24}
{'id': 20, 'cpu': 57.71, 'memory': 34.05}
{'id': 21, 'cpu': 18.4, 'memory': 61.71}
{'id': 22, 'cpu': 92.6, 'memory': 71.62}
{'id': 23, 'cpu': 30.21, 'memory': 1.91}
{'id': 24, 

# Reading data from Apache Kafka with Apache Spark Structured Streaming

https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html

In [10]:
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DecimalType,
)
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DoubleType,
)
from pyspark.sql.functions import (
    split,
    regexp_replace,
    current_date,
    unix_timestamp,
    lit,
    current_timestamp,
)

from pyspark.sql.functions import col, from_json, struct, to_json
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

import pandas as pd
import pickle
import json
import time
import os


In [11]:
import pyspark  # local import: only needed to read the installed Spark version

# The "kafka" source/sink is not bundled with Spark; the connector must be
# put on the classpath and its version must match the Spark version.
# NOTE(review): assumes a Scala 2.12 build of Spark (the default for
# pip-installed PySpark 3.x) — adjust SCALA_VERSION if yours differs.
SCALA_VERSION = "2.12"
KAFKA_CONNECTOR = (
    f"org.apache.spark:spark-sql-kafka-0-10_{SCALA_VERSION}:{pyspark.__version__}"
)

# Local Spark session using every available core.
# spark.jars.packages is only honoured when the JVM is first started, so
# restart the kernel if a session without the connector already exists.
spark = (
    SparkSession.builder
    .appName("Spark Structured Streaming Application")
    .master("local[*]")
    .config("spark.jars.packages", KAFKA_CONNECTOR)
    .getOrCreate()
)

spark

22/06/08 20:26:56 WARN Utils: Your hostname, carloshkayser resolves to a loopback address: 127.0.1.1; using 10.32.45.215 instead (on interface ens160)
22/06/08 20:26:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/08 20:26:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/08 20:26:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Subscribe to 1 topic (streaming query)
# Uses the broker and topic configured at the top of the notebook rather than
# the documentation placeholders. With the default startingOffsets ("latest")
# only messages produced while the query is running are shown.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", KAFKA_TOPIC)
    .load()
)

# Cast the Kafka key/value columns to strings so they are readable on the console.
df = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

query = (
    df.writeStream
    .format("console")
    .start()
)

try:
    # Let the query run for up to 10 seconds. Unlike time.sleep, this
    # re-raises an error if the streaming query fails in the meantime.
    query.awaitTermination(10)
finally:
    # Stop the query even when it failed, so no stream is left running.
    query.stop()


In [None]:
# Subscribe to 1 topic, with headers
# TODO

In [None]:
# Subscribe to 1 topic, defaulting to the earliest and latest offsets
# TODO

## startingOffsets

Default: "latest" for streaming queries, "earliest" for batch queries.

The starting point when a query is started: either "earliest" (from the earliest offsets), "latest" (from the latest offsets only), or a JSON string specifying a starting offset for each TopicPartition. In the JSON, -2 as an offset refers to earliest and -1 to latest. Note: for batch queries, latest (either implicitly or by using -1 in the JSON) is not allowed. For streaming queries, this applies only when a new query is started; a resumed query always picks up from where it left off. Partitions newly discovered during a query start at earliest.

In [None]:
# Batch query over one topic, specifying explicit Kafka offsets

# Offsets are passed as JSON of the form {topic: {partition: offset}};
# -2 means "earliest" and -1 means "latest". Building the strings with
# json.dumps keeps them well-formed and keyed by the subscribed topic.
# NOTE(review): assumes KAFKA_TOPIC has exactly partitions 0 and 1 — adjust
# the partition maps to match the topic's real partitions.
starting_offsets = json.dumps({KAFKA_TOPIC: {"0": 23, "1": -2}})
ending_offsets = json.dumps({KAFKA_TOPIC: {"0": 50, "1": -1}})

# endingOffsets is a batch-only option, so this uses spark.read (batch)
# and displays the result directly instead of starting a streaming query.
df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", starting_offsets)
    .option("endingOffsets", ending_offsets)
    .load()
)

# Cast the Kafka value column to a string so the JSON payload is readable.
df = df.selectExpr("CAST(value AS STRING)")

df.show(truncate=False)