In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType

# Create (or reuse) the Spark session shared by every cell in this notebook.
# - The streaming checkpoint location is set to a fixed directory under /tmp.
# - `spark.jars.packages` asks Spark to fetch the Kafka SQL connector at
#   start-up. NOTE(review): the connector coordinates (Scala 2.12 / 3.5.4)
#   presumably have to match the installed Spark build — confirm.
spark = (
    SparkSession.builder
    .appName("YouTubeTranscriptAnalysis")
    .config("spark.sql.streaming.checkpointLocation", "/tmp/kafka_checkpoint")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4")
    .getOrCreate()
)

### Step 1: Load Channel Data

In [None]:
# Schema of the JSON messages on the `youtube_channel_info` topic.
# Every field is parsed as a string and the counters are cast to 64-bit
# integers afterwards: this tolerates counters serialized as JSON strings
# and avoids 32-bit overflow on large view counts.
# NOTE(review): field names are taken from the columns selected below; confirm
# them (and the value types) against the producer that writes this topic.
channel_schema = StructType([
    StructField("channel_id", StringType(), True),
    StructField("subscriber_count", StringType(), True),
    StructField("view_count", StringType(), True),
    StructField("video_count", StringType(), True),
])

# Read the topic as a bounded *batch* (spark.read, not spark.readStream).
# A streaming DataFrame cannot be collected with toPandas(); Spark only allows
# streaming sources to be executed through writeStream.start(). The batch
# reader consumes everything between the earliest and the latest offsets that
# exist when the cell runs.
channel_df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "youtube_channel_info")
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

# Parse channel data: decode the Kafka value bytes as text, parse the JSON
# payload, then flatten it into one typed column per field.
# NOTE(review): if the topic holds several snapshots per channel, each one
# becomes its own row here — deduplicate upstream if that is not intended.
channel_parsed_df = (
    channel_df
    .selectExpr("CAST(value AS STRING) AS value")
    .select(from_json(col("value"), channel_schema).alias("data"))
    .select(
        col("data.channel_id").alias("channel_id"),
        col("data.subscriber_count").cast("long").alias("subscriber_count"),
        col("data.view_count").cast("long").alias("view_count"),
        col("data.video_count").cast("long").alias("video_count"),
    )
)

# Collect to the driver as a pandas DataFrame for scoring and plotting.
channel_pd = channel_parsed_df.toPandas()

### Step 2: Calculate Ranking Score

In [None]:
import matplotlib.pyplot as plt

# Views per subscriber. Rows whose subscriber count is zero (or missing) are
# masked to NaN before dividing so the ratio never becomes +/-inf; those rows
# then get an engagement rate of 0 instead of poisoning the score and the plot.
subscriber_counts = channel_pd["subscriber_count"]
safe_subscriber_counts = subscriber_counts.where(subscriber_counts > 0)

# Build the derived columns with .assign so the cell is idempotent on re-run.
# sponsorship_score is a fixed-weight sum: 0.5 * subscribers
# + 0.3 * engagement rate + 0.2 * video count.
# NOTE(review): the three terms are on very different scales, so the raw
# subscriber count dominates the score — consider normalizing each term.
# Rank channels: highest score first.
channel_pd = (
    channel_pd
    .assign(
        engagement_rate=(channel_pd["view_count"] / safe_subscriber_counts).fillna(0.0),
        sponsorship_score=lambda scored: (
            0.5 * scored["subscriber_count"]
            + 0.3 * scored["engagement_rate"]
            + 0.2 * scored["video_count"]
        ),
    )
    .sort_values("sponsorship_score", ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(channel_pd["channel_id"], channel_pd["sponsorship_score"])
# barh draws the first row at the bottom; flip the axis so the best-ranked
# channel (first row after the descending sort) appears at the top.
ax.invert_yaxis()
ax.set_xlabel("Sponsorship Score")
ax.set_ylabel("Channel ID")
ax.set_title("Top YouTube Channels for Sponsorships")
plt.show()