In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StringType
from pymongo import MongoClient

In [6]:
packages = [
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1',
    'org.apache.kafka:kafka-clients:3.2.0'
]

spark = SparkSession.builder\
   .master("local")\
   .appName("kafka-example")\
   .config("spark.jars.packages", ",".join(packages))\
   .getOrCreate()

kafka_params = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "video_watch_topic",
    "startingOffsets": "earliest"
}

In [7]:
# Read data from Kafka
raw_data = spark \
    .readStream \
    .format("kafka") \
    .options(**kafka_params) \
    .load()

In [8]:
# Parse JSON data
parsed_data = raw_data \
    .select(from_json(col("value").cast("string"), schema).alias("data")) \
    .select("data.*")

# Group by date and video_id to count occurrences
trending_videos = parsed_data \
    .groupBy(window(col("watched_at"), "1 day"), "video_id") \
    .count() \
    .orderBy(col("window"), col("count").desc())

# Connect to MongoDB
client = MongoClient("mongodb://localhost:27017/")
db = client["bigdata"]
collection = db["trending_videos"]

# Define a function to write top 10 videos to MongoDB
def write_to_mongo(rows):
    top_videos = [row.video_id for row in rows[:10]]  # Extract only the top 10 videos
    collection.insert_one({"top_videos": top_videos})

# Start streaming query to continuously update the results
query = trending_videos \
    .writeStream \
    .outputMode("complete") \
    .foreachBatch(write_to_mongo) \
    .start()

# Await termination
query.awaitTermination()

NameError: name 'schema' is not defined