In [3]:
import os
import sys
import warnings

warnings.filterwarnings("ignore")

### Load Transcripts Data from Kafka Stream (PySpark)

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, udf
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType

In [5]:
spark = SparkSession.builder \
    .appName("YouTubeTranscriptAnalysis") \
    .config("spark.sql.streaming.checkpointLocation", "/tmp/kafka_checkpoint") \
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4") \
    .getOrCreate()

In [6]:
transcripts_schema = StructType([
    StructField("videoId", StringType(), True),
    StructField("transcript", ArrayType(StructType([
        StructField("text", StringType(), True),
        StructField("start", FloatType(), True),
        StructField("duration", FloatType(), True)
    ])), True)
])

In [7]:
kafka_transcripts_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "youtube_transcripts") \
    .option("startingOffsets", "earliest") \
    .load()

In [8]:
transcripts_parsed_df = kafka_transcripts_df \
    .selectExpr("CAST(value AS STRING)") \
    .select(from_json(col("value"), transcripts_schema).alias("data")) \
    .select("data.videoId", "data.transcript")

In [16]:
transcripts = transcripts_parsed_df \
    .writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("transcripts_table") \
    .start()

transcripts.awaitTermination(10)

25/03/14 00:14:56 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/03/14 00:14:56 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

False

In [18]:
transcripts_df = spark.sql("SELECT * FROM transcripts_table")

In [19]:
def concatenate_transcript(transcript):
    if transcript is not None:
        return " ".join([t["text"] for t in transcript if t["text"] is not None])
    return ""

concat_udf = udf(concatenate_transcript, StringType())

In [20]:
transcripts_df = transcripts_df.withColumn("full_transcript", concat_udf(col("transcript"))).select("videoId", "full_transcript")

transcripts_pd = transcripts_df.toPandas()

                                                                                

### Preprocess Transcript Text

In [21]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models
import numpy as np

nltk.download("stopwords")
nltk.download("wordnet")
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bharathvelamala/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/bharathvelamala/nltk_data...


In [22]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    if not text:
        return ""
    text = re.sub(r"\W", " ", text.lower()) 
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
    return " ".join(words)

transcripts_pd["cleaned_transcript"] = transcripts_pd["full_transcript"].apply(preprocess_text)

### Apply Topic Modeling (LDA)

In [23]:
from sklearn.decomposition import LatentDirichletAllocation

In [24]:
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words="english")
transcript_matrix = vectorizer.fit_transform(transcripts_pd["cleaned_transcript"])

In [25]:
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(transcript_matrix)

In [26]:
terms = vectorizer.get_feature_names_out()
topic_words = {}
for topic_idx, topic in enumerate(lda.components_):
    topic_words[topic_idx] = [terms[i] for i in topic.argsort()[:-10 - 1:-1]]

In [27]:
for topic, words in topic_words.items():
    print(f"Topic {topic}: {', '.join(words)}")

Topic 0: force, trick, trouble, occur, link, youtube, description, channel, showing, man
Topic 1: gödel, statement, hilbert, proof, turing, zero, halt, mathematics, question, true
Topic 2: steal, app, indicator, input, toy, sequence, boundary, base, game, knob
Topic 3: table, customer, query, column, join, execute, database, select, sql, value
Topic 4: python, function, file, print, program, list, value, variable, loop, type


### Visualize Topics

In [36]:
# import pyLDAvis.gensim

# pyLDAvis.enable_notebook()
# visualization = pyLDAvis.gensim.prepare(lda, transcript_matrix, vectorizer)
# pyLDAvis.display(visualization)

### Step 5: Track with MLflow

In [38]:
# import mlflow
# import mlflow.sklearn

# mlflow.set_experiment("YouTube Topic Modeling")

# with mlflow.start_run():
#     mlflow.log_param("n_topics", 5)
#     mlflow.sklearn.log_model(lda, "LDA_model")
#     mlflow.log_artifact("lda_visualization.html")