In [None]:
!pip install kafka-python

# Ingesting data into Apache Kafka

In [None]:
from kafka import KafkaProducer
from time import sleep

import pandas as pd
import random
import decimal
import json
import uuid
import os


In [None]:
# Kafka bootstrap address (host:port). Set the KAFKA_HOST environment variable
# to point the notebook at a different broker; the default is the address the
# notebook was originally written against.
KAFKA_HOST = os.environ.get("KAFKA_HOST", "192.168.49.2:30323")

# Single producer instance shared by the ingestion cells below.
producer = KafkaProducer(bootstrap_servers=KAFKA_HOST)

In [None]:
KAFKA_TOPIC = "demo-users"

# Publish ten synthetic user records as UTF-8 encoded JSON.
# Only the id is random (a UUID4); name and email are derived from the loop index.
for i in range(10):
    message = {
        "id": str(uuid.uuid4()),
        "name": f"name_{i}",
        "email": f"email_{i}@example.com",
    }
    payload = json.dumps(message).encode("utf-8")

    producer.send(KAFKA_TOPIC, payload)
    producer.flush()  # flushed after every record rather than once at the end

    print(message)

In [None]:
KAFKA_TOPIC = "demo-logs"

# Publish ten synthetic log records as UTF-8 encoded JSON.
# id is the loop index; cpu and memory are random floats in [0, 100] rounded to 2 places.
for i in range(10):
    message = {
        "id": i,
        "cpu": round(random.uniform(0, 100), 2),
        "memory": round(random.uniform(0, 100), 2),
    }
    payload = json.dumps(message).encode("utf-8")

    producer.send(KAFKA_TOPIC, payload)
    producer.flush()  # flushed after every record rather than once at the end

    print(message)

# Reading data from Apache Kafka with Apache Spark Structured Streaming

https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html

In [None]:
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DecimalType,
)
from pyspark.sql.types import (
    StructType,
    StructField,
    FloatType,
    StringType,
    LongType,
    IntegerType,
    DoubleType,
)
from pyspark.sql.functions import (
    split,
    regexp_replace,
    current_date,
    unix_timestamp,
    lit,
    current_timestamp,
)

from pyspark.sql.functions import col, from_json, struct, to_json
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql import SparkSession

import pandas as pd
import pickle
import json
import time
import os


In [None]:
# Extra JVM packages (Kafka streaming + SQL connectors, PostgreSQL JDBC driver)
# requested through PYSPARK_SUBMIT_ARGS; set before the session is created below.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages "
    "org.apache.spark:spark-streaming-kafka-0-10_2.12:3.2.0,"
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,"
    "org.postgresql:postgresql:42.1.1"
    " pyspark-shell"
)

# Local-mode session using all available cores ("local[*]").
builder = SparkSession.builder.appName("Spark Structured Streaming Application")
spark = builder.master("local[*]").getOrCreate()

# Last expression of the cell: renders the session summary.
spark

# readStream from Kafka

**Required configurations:**

- kafka.bootstrap.servers: A comma-separated list of host:port
- subscribe: A comma-separated list of topics (alternatively `assign` or `subscribePattern`; exactly one of the three must be set)

**Optional configurations:**

- startingTimestamp
- startingOffsetsByTimestamp
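- startingOffsets:
  - "latest" (the default for streaming queries)
  - "earliest" (the default for batch queries)
  - {"topicA":{"0":23,"1":-1}} for specifying a starting offset for each TopicPartition
  - in the JSON form, -2 refers to earliest and -1 to latest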
  

In [None]:
# Subscribe to the "demo" and "demo-logs" topics, then print only the "demo" records.
# NOTE(review): the producer cells in this notebook publish to "demo-users" and
# "demo-logs" — confirm that a "demo" topic actually exists on the broker.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", "demo, demo-logs")
    .option("startingOffsets", "earliest")
    .load()
)

# No CAST is applied to `value` in this cell; rows are printed as loaded.
demo_rows = df.where(col("topic") == "demo")
query = demo_rows.writeStream.format("console").start()

time.sleep(10)  # let the console sink run for 10 seconds

query.stop()


In [None]:
# Stream both the demo-users and demo-logs topics from the earliest offset
# and print each record's value (cast to a string) to the console.
kafka_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", "demo-users, demo-logs")
    .option("startingOffsets", "earliest")
    .load()
)

df = kafka_stream.selectExpr("CAST(value AS STRING)")

query = df.writeStream.format("console").start()

time.sleep(10)  # let the console sink run for 10 seconds

query.stop()

In [None]:
# Stream the demo-users and demo-logs topics from the earliest offset and
# persist each record's value (cast to a string) as JSON files.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", "demo-users, demo-logs")
    .option("startingOffsets", "earliest")
    .load()
)

df = df.selectExpr("CAST(value AS STRING)")

# Write the data stream to JSON files under demo-users-and-logs/.
query = (
    df.writeStream
    .format("json")
    .option("path", "demo-users-and-logs")
    .option("checkpointLocation", "checkpointLocation")
    .start()
)

try:
    # Bounded wait (seconds). An unbounded awaitTermination() never returns for
    # a healthy streaming query, so the cells after this one would never run.
    query.awaitTermination(10)
finally:
    # Always stop the query, even if the wait raised or the cell was interrupted.
    query.stop()


In [None]:
# Print the part file(s) matching part-00000-* from the demo-users-and-logs output directory.
!cat demo-users-and-logs/part-00000-*

In [None]:
# Subscribe to 1 topic, with headers
# TODO

In [None]:
# Subscribe to 1 topic, defaulting to the earliest and latest offsets
# TODO

## startingOffsets

Default: "latest" for streaming queries, "earliest" for batch queries.

The start point when a query is started: either "earliest" (from the earliest offsets), "latest" (only from the latest offsets), or a JSON string specifying a starting offset for each TopicPartition. In the JSON, -2 as an offset refers to earliest and -1 to latest. Note: for batch queries, latest (either implicitly or by using -1 in the JSON) is not allowed. For streaming queries, this option only applies when a new query is started; a resumed query always picks up from where it left off. Partitions newly discovered during a query start at earliest.

In [None]:
# Batch (non-streaming) read of KAFKA_TOPIC with explicit Kafka offsets.

# Offsets are passed as JSON of the form {topic: {partition: offset}}.
# -2 means "earliest" and -1 means "latest"; concrete offset numbers can be used
# instead. The topic key must match the subscribed topic.
# NOTE(review): assumes KAFKA_TOPIC has a single partition "0" — list every
# partition of the topic here if it has more.
starting_offsets = json.dumps({KAFKA_TOPIC: {"0": -2}})
ending_offsets = json.dumps({KAFKA_TOPIC: {"0": -1}})

df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", starting_offsets)
    .option("endingOffsets", ending_offsets)
    .load()
)

df = df.selectExpr("CAST(value AS STRING)")

# spark.read yields a static DataFrame, so it is displayed with show();
# writeStream can only be called on a streaming DataFrame.
df.show(truncate=False)

In [None]:
# Stream the message values of KAFKA_TOPIC to JSON files for ten seconds.

# NOTE(review): `values` was not defined by any earlier cell, so it is built
# here as a streaming read of KAFKA_TOPIC with the payload cast to a string —
# confirm this is the intended source.
values = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_HOST)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr("CAST(value AS STRING)")
)

query = (
    values.writeStream
    .format("json")
    .outputMode("append")
    # The file sink requires a checkpoint directory; a dedicated one is used so
    # it does not collide with the checkpoint of any other query.
    .option("checkpointLocation", "./topic-checkpoint")
    .start("./topic.json")
)

try:
    time.sleep(10)  # sleep 10 seconds
finally:
    # Stop the query even if the sleep is interrupted.
    query.stop()

In [None]:
# Add (or overwrite) a "timestamp" column populated by Spark's current_timestamp().
df = df.withColumn("timestamp", F.current_timestamp())

