In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, window
from pyspark.sql.types import StringType
from pymongo import MongoClient
from pyspark.sql.functions import from_unixtime
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

In [3]:
# Maven coordinates resolved at session start-up and handed to Spark through
# the `spark.jars.packages` setting as one comma-separated string.
packages = [
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1',
    'org.apache.kafka:kafka-clients:3.2.0'
]

# Build (or reuse) the session with a local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)



:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-031aefe9-4f77-4650-bf15-cd9af94b9325;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.1 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
	found org.apache.kafka#kafka-clients;3.2.0 in central


In [4]:
# Options passed to the Kafka source: broker address, topic subscription and
# the offset to start reading from.
kafka_params = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "VID_OPEN_TOPIC",
    "startingOffsets": "earliest"
}

# Open a streaming read against Kafka using the options above.
raw_data = (
    spark.readStream
    .format("kafka")
    .options(**kafka_params)
    .load()
)

In [5]:
# Schema of the JSON payload published to the Kafka topic.
# Field names must match the JSON keys: the sample message carries "email"
# (not "user_id"), and from_json leaves any field it cannot find as null.
# Fields are declared nullable because from_json does not enforce
# non-nullability on parsed data.
# NOTE(review): the sample payload stores `timestamp` as fractional epoch
# seconds (1715268711.870222). Confirm from_json can read that into
# TimestampType; if not, parse it as DOUBLE and cast it to a timestamp.
schema = StructType([
    StructField("email", StringType(), True),
    StructField("video_id", StringType(), True),
    StructField("timestamp", TimestampType(), True)
])

In [None]:
# Kafka delivers the message body in the binary `value` column; cast it to a
# string and parse it as JSON.
#
# The producer publishes `timestamp` as fractional epoch seconds
# (e.g. 1715268711.870222). from_json does not convert a floating-point JSON
# number into TimestampType, so the field is read as DOUBLE and cast
# afterwards (a numeric -> timestamp cast is interpreted as epoch seconds).
payload_ddl = "timestamp DOUBLE, video_id STRING"

# How long the console preview runs before the query is stopped.
PREVIEW_SECONDS = 60

parsed_data = (
    raw_data
    .select(from_json(col("value").cast("string"), payload_ddl).alias("data"))
    .select(
        col("data.timestamp").cast("timestamp").alias("timestamp"),
        col("data.video_id").alias("video_id"),
    )
    # Records that fail to parse come back with a null video_id; drop them so
    # they do not show up as a "video" in the ranking.
    .where(col("video_id").isNotNull())
    .withWatermark("timestamp", "10 minutes")
    .groupBy("video_id").count()
    # Sorting a stream is only allowed after an aggregation in complete mode.
    .orderBy(col("count").desc())
    .limit(5)
)

query = (
    parsed_data.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", False)
    .start()
)

# awaitTermination() without a timeout blocks the kernel until the query dies
# and always returns None, so no later cell could ever run. Wait for a bounded
# preview instead (returns True if the query ended within the timeout) and
# always stop the query afterwards, including on a kernel interrupt.
result = None
try:
    result = query.awaitTermination(PREVIEW_SECONDS)
finally:
    query.stop()

NameError: name 'result' is not defined

#### Sample value in `VID_OPEN_TOPIC` Kafka Topic
`{"timestamp": 1715268711.870222, "video_id": "QdBZY2fkU-0", "email": "allenA"}`

In [None]:
# Open a MongoDB client and resolve the database / collection handles that
# hold the trending-video documents.
client = MongoClient("mongodb://localhost:27017/")
db = client.get_database("bigdata")
collection = db.get_collection("trending_videos")

In [None]:
# Define a foreachBatch sink that writes the top 10 videos to MongoDB
def write_to_mongo(rows, batch_id=None):
    """Insert the video ids of the first 10 rows as one MongoDB document.

    Usable directly as a ``foreachBatch`` callback, which invokes the sink
    with two positional arguments: the micro-batch DataFrame and the batch id.

    Args:
        rows: Either a micro-batch DataFrame (anything exposing ``limit`` and
            ``collect``) or an already collected sequence of rows. Each row
            must expose a ``video_id`` attribute.
        batch_id: Micro-batch identifier supplied by ``foreachBatch``.
            Accepted so the callback signature matches; not stored.

    The ranking order is taken as-is from ``rows``.
    NOTE(review): assumes the incoming batch is already sorted by popularity.
    """
    if hasattr(rows, "collect"):
        # A DataFrame cannot be sliced like a list; bound it first so at most
        # 10 rows are brought back to the driver.
        rows = rows.limit(10).collect()
    top_videos = [row.video_id for row in rows[:10]]  # keep only the top 10
    collection.insert_one({"top_videos": top_videos})

# Start a streaming query that hands every micro-batch to `write_to_mongo`.
# The source is `parsed_data`, the aggregated and count-ordered stream built
# in the parsing cell.
# NOTE(review): this previously read from `trending_videos`, which is not
# defined anywhere in the notebook (NameError on a fresh kernel). Confirm
# `parsed_data` is the intended source.
query = (
    parsed_data
    .writeStream
    .outputMode("complete")
    .foreachBatch(write_to_mongo)
    .start()
)

# Block this cell until the query is stopped or fails
query.awaitTermination()