In [1]:
import json

from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, from_json, from_unixtime, window
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType,
)

In [4]:
# Maven coordinates handed to `spark.jars.packages`: the Kafka source for
# Spark SQL, the Kafka client library, and the MongoDB Spark connector.
packages = [
    'org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1',
    'org.apache.kafka:kafka-clients:3.2.0',
    'org.mongodb.spark:mongo-spark-connector_2.12:3.0.2',
]

# Create (or reuse) a local-mode session with the packages above attached.
spark = (
    SparkSession.builder
    .master("local")
    .appName("kafka-example")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)



:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
org.mongodb.spark#mongo-spark-connector_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8c8dfa59-cc79-4a52-bd3d-35a764d79e12;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.1 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2

### Reading from Mongo

In [9]:
# Options for the "mongo" data source: server URI plus the database and
# collection to read from.
mongo_params = dict(
    uri="mongodb://mongo:27017",
    database="bigdata",
    collection="trending_videos",
)

# Load the collection into a Spark DataFrame.
dataFrame = (
    spark.read
    .format("mongo")
    .options(**mongo_params)
    .load()
)

In [10]:
# Print the column names and types of the DataFrame loaded from MongoDB.
dataFrame.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- count: integer (nullable = true)
 |-- video_id: string (nullable = true)



### Reading from Kafka

In [3]:
# Options for the "kafka" data source. Because this is a batch read
# (`spark.read`, not `readStream`), the offsets bound the query: it starts at
# the earliest offset and stops at the latest one.
kafka_params = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "VID_OPEN_TOPIC",
    "startingOffsets": "earliest",
    "endingOffsets": "latest",
}

# Load the topic's records into a DataFrame.
raw_data = (
    spark.read
    .format("kafka")
    .options(**kafka_params)
    .load()
)

In [4]:
# Schema of the JSON payload carried in each Kafka record's value, e.g.
#   {"timestamp": 1715268711.870222, "video_id": "QdBZY2fkU-0", "email": "allenA"}
#
# - `timestamp` arrives as a fractional epoch number, so it is parsed as a
#   double. Declaring it as TimestampType would leave the field null, because
#   Spark's JSON parser does not read fractional numbers into timestamp
#   fields. Use `col("timestamp").cast("timestamp")` where a real timestamp
#   is needed.
# - `email` is the user field present in the payload; `user_id` is kept so
#   that code selecting the older field name continues to resolve.
# - Every field is nullable: `from_json` treats its schema as nullable
#   regardless of what is declared here, and yields null for missing fields.
schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("email", StringType(), True),
    StructField("video_id", StringType(), True),
    StructField("timestamp", DoubleType(), True),
])

In [5]:
# Rank videos by how many open events they received and keep the top 10.
#
# The Kafka source exposes the payload in the binary `value` column, so it is
# cast to a string and parsed as JSON using `schema`.
parsed_data = (
    raw_data
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    # Only the video id is needed for the ranking.
    .select("data.video_id")
    # Records that cannot be parsed, or that carry no video_id, come back as
    # null; drop them so they do not form a "null" group in the ranking.
    .where(col("video_id").isNotNull())
    .groupBy("video_id")
    .count()
    # Break ties on video_id so repeated runs return the same top 10.
    .orderBy(col("count").desc(), col("video_id").asc())
    .limit(10)
)

# Bring the (at most 10) rows to the driver as JSON strings.
top_videos = parsed_data.toJSON().collect()

                                                                                

#### Sample value in `VID_OPEN_TOPIC` Kafka Topic
`{"timestamp": 1715268711.870222, "video_id": "QdBZY2fkU-0", "email": "allenA"}`

In [6]:
# Decode each collected JSON string into a Python dict.
parsed_vids = [json.loads(video_json) for video_json in top_videos]

In [7]:
# Open a MongoDB client and take handles to the `bigdata` database and its
# `trending_videos` collection.
client = MongoClient("mongodb://mongo:27017/")
db = client.bigdata
collection = db.trending_videos

In [8]:
# Replace the stored ranking: clear the collection, then write the new
# documents.
collection.delete_many({})

# Insert everything in one bulk call instead of one round trip per document.
# `insert_many` raises on an empty list, so skip it when there is nothing to
# store (the collection is then simply left empty, as before).
if parsed_vids:
    collection.insert_many(parsed_vids)