### Streaming Data from Kafka

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType


In [5]:
# Build (or reuse) the SparkSession used by every cell below.
#
# The "kafka" streaming source is provided by the
# `org.apache.spark:spark-sql-kafka-0-10` artifact, and its version has to
# match the running Spark version, so it is derived from the installed PySpark
# rather than hard-coded.
# NOTE(review): the `_2.12` Scala suffix is carried over from the original
# coordinate -- confirm it matches the Scala build of your Spark distribution.
#
# `spark.jars.packages` only takes effect when a *new* session is created; if
# a session already exists, getOrCreate() returns it unchanged, so restart the
# kernel after editing this cell.
KAFKA_CONNECTOR_PACKAGE = (
    f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"
)

spark = (
    SparkSession.builder
    .appName("KafkaSparkConsumer")
    # Default checkpoint root for streaming queries started from this session.
    .config("spark.sql.streaming.checkpointLocation", "/tmp/kafka_checkpoint")
    .config("spark.jars.packages", KAFKA_CONNECTOR_PACKAGE)
    .getOrCreate()
)

25/03/09 00:09:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [6]:
# Schema used to parse the JSON payload of each Kafka message.
# Every field is declared as a string; fields are added in the order listed.
# NOTE(review): the sample console output in this notebook shows NULL for every
# column except `title` and `description` -- presumably the producer's JSON keys
# differ from the other names here; verify against the producer.
video_schema = StructType()
for field_name in (
    "videoId",
    "title",
    "description",
    "channelId",
    "publishedAt",
    "viewCount",
    "likeCount",
    "commentCount",
):
    # StructType.add appends the field to the schema in place.
    video_schema.add(field_name, StringType())

In [7]:
# Source options for the Kafka streaming reader: broker address, the topic to
# subscribe to, and the requested starting offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": "localhost:9092",
    "subscribe": "youtube_video_info",
    "startingOffsets": "earliest",
}

# Streaming DataFrame backed by the Kafka topic configured above.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

In [9]:
# Cast the Kafka `value` column to a string and parse it as JSON against
# `video_schema`; the parsed struct is aliased "data".
json_payload = col("value").cast("string")
video_struct = from_json(json_payload, video_schema).alias("data")

# Flatten the struct so each schema field becomes a top-level column.
parsed_df = kafka_df.select(video_struct).select("data.*")

parsed_df.printSchema()

root
 |-- videoId: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- viewCount: string (nullable = true)
 |-- likeCount: string (nullable = true)
 |-- commentCount: string (nullable = true)



In [13]:
# Start a streaming query that prints each micro-batch of parsed records to
# the console (append mode: only newly arrived rows are written).
query = (
    parsed_df
    .writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# awaitTermination(timeout) only *waits* up to `timeout` seconds and reports
# whether the query ended in that window; it does not stop the query. Stop it
# explicitly so the stream does not keep running in the background after this
# cell returns. The try/finally also guarantees the stop if the wait raises or
# the cell is interrupted.
try:
    query.awaitTermination(10)
finally:
    query.stop()

25/03/09 00:14:00 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/09 00:14:00 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+--------------------+--------------------+---------+-----------+---------+---------+------------+
|videoId|               title|         description|channelId|publishedAt|viewCount|likeCount|commentCount|
+-------+--------------------+--------------------+---------+-----------+---------+---------+------------+
|   NULL|Learn Python - Fu...|This course will ...|     NULL|       NULL|     NULL|     NULL|        NULL|
|   NULL|Math's Fundamenta...|Not everything th...|     NULL|       NULL|     NULL|     NULL|        NULL|
|   NULL|Stealing Baseball...|I always sucked a...|     NULL|       NULL|     NULL|     NULL|        NULL|
|   NULL|Help Protect the ...|We are excited to...|     NULL|       NULL|     NULL|     NULL|        NULL|
|   NULL|Pinky and Panda F...|Pinky and Panda F...|     NULL|       NULL|     NULL|     NULL|        NULL|
+-------+--------------------+-----------------

False