In [None]:
%%capture
!pip install pyspark pandas numpy scikit-learn pyLDAvis gensim mlflow

### Step 1: Load Transcripts Data from Kafka Stream (PySpark)

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, FloatType, IntegerType

# Initialize Spark
# The session is created (or reused) with a checkpoint directory configured and
# the Kafka SQL connector package requested via spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("YouTubeTranscriptAnalysis")
    .config("spark.sql.streaming.checkpointLocation", "/tmp/kafka_checkpoint")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.4")
    .getOrCreate()
)

# Define transcript schema
# One transcript segment: its text plus two nullable float fields
# ("start" and "duration"; presumably seconds -- confirm with the producer).
transcript_segment_schema = StructType([
    StructField("text", StringType(), True),
    StructField("start", FloatType(), True),
    StructField("duration", FloatType(), True),
])

# One Kafka message: a video id and the array of its transcript segments.
transcripts_schema = StructType([
    StructField("videoId", StringType(), True),
    StructField("transcript", ArrayType(transcript_segment_schema), True),
])

# Read the Kafka topic as a bounded batch.
# A streaming DataFrame (spark.readStream) cannot be collected with toPandas():
# Spark only lets streaming sources be consumed through writeStream.start().
# This notebook needs a finite pandas DataFrame, so take a snapshot of the topic
# from the earliest offset up to the latest offset available at read time.
kafka_transcripts_df = spark \
    .read \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "youtube_transcripts") \
    .option("startingOffsets", "earliest") \
    .option("endingOffsets", "latest") \
    .load()

# Parse JSON data
# 1. Cast the Kafka message value to a string column named "value".
kafka_value_json = kafka_transcripts_df.selectExpr("CAST(value AS STRING)")

# 2. Decode that string against the transcript schema into a struct column.
parsed_payload = kafka_value_json.select(
    from_json(col("value"), transcripts_schema).alias("data")
)

# 3. Lift the two struct fields to top-level columns.
transcripts_parsed_df = parsed_payload.select("data.videoId", "data.transcript")

# Flatten transcripts (convert list of text into a single document per video)
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def concatenate_transcript(transcript):
    """Join the text of every transcript segment into one document.

    Args:
        transcript: Iterable of segment records that support ``segment["text"]``
            lookup (for example dicts), or ``None`` when there is no transcript.

    Returns:
        The segment texts joined by single spaces, in their original order.
        Null segments and segments whose text is null are skipped. A missing
        or empty transcript yields an empty string.
    """
    if transcript is None:
        return ""
    # The array column is nullable at both levels: an element can be null, and
    # a non-null element can carry a null "text". Skip both instead of raising.
    return " ".join(
        segment["text"]
        for segment in transcript
        if segment is not None and segment["text"] is not None
    )

# Expose the Python helper to Spark as a UDF returning a string column
concat_udf = udf(concatenate_transcript, StringType())

# Add the flattened document for each row, then keep only the id and the text
transcripts_parsed_df = (
    transcripts_parsed_df
    .withColumn("full_transcript", concat_udf(col("transcript")))
    .select("videoId", "full_transcript")
)

# Convert to Pandas
transcripts_pd = transcripts_parsed_df.toPandas()


### Step 2: Preprocess Transcript Text

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim import corpora, models
import numpy as np

# Fetch the NLTK resources this notebook relies on
for nltk_resource in ("stopwords", "wordnet"):
    nltk.download(nltk_resource)
from nltk.stem import WordNetLemmatizer

# Module-level helpers shared by preprocess_text: the English stop-word list
# held as a set, and a single reusable lemmatizer instance.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def preprocess_text(text):
    """Normalise a transcript into a space-separated string of lemmas.

    The text is lower-cased, every non-word character is replaced by a space,
    and the result is split on whitespace. Tokens that are in ``stop_words`` or
    are two characters or shorter are dropped; the rest are lemmatized with the
    module-level ``lemmatizer``.

    Args:
        text: Raw transcript text. ``None``, an empty string, or any non-string
            value (such as the float NaN pandas uses for missing data) is
            treated as "no text".

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when there is no
        usable text.
    """
    # NaN is truthy, so a bare ``if not text`` would let it through and then
    # fail on ``.lower()``; check the type explicitly.
    if not isinstance(text, str) or not text:
        return ""
    text = re.sub(r"\W", " ", text.lower())  # Replace non-word characters with spaces
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words and len(word) > 2]
    return " ".join(words)

# Clean every transcript and store the result alongside the raw text
cleaned_transcripts = transcripts_pd["full_transcript"].apply(preprocess_text)
transcripts_pd["cleaned_transcript"] = cleaned_transcripts


### Step 3: Apply Topic Modeling (LDA)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Build the bag-of-words document-term matrix from the cleaned transcripts.
# Terms in more than 95% of documents or in fewer than 2 documents are ignored.
vectorizer = CountVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)
cleaned_docs = transcripts_pd["cleaned_transcript"]
transcript_matrix = vectorizer.fit_transform(cleaned_docs)

# Fit the LDA topic model (adjust the number of topics as needed)
lda = LatentDirichletAllocation(random_state=42, n_components=5)
lda.fit(transcript_matrix)

# Map each topic index to its 10 highest-weighted vocabulary terms
terms = vectorizer.get_feature_names_out()
topic_words = {
    topic_idx: [terms[i] for i in topic.argsort()[::-1][:10]]  # descending weight, first 10
    for topic_idx, topic in enumerate(lda.components_)
}

# Show one line per topic
for topic, words in topic_words.items():
    print(f"Topic {topic}: {', '.join(words)}")


### Step 4: Visualize Topics

In [None]:
import pyLDAvis

# pyLDAvis 3.4 renamed its scikit-learn adapter from ``pyLDAvis.sklearn`` to
# ``pyLDAvis.lda_model``. Try the current name first and fall back to the old
# one so the cell works with either release.
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

pyLDAvis.enable_notebook()
visualization = pyldavis_adapter.prepare(lda, transcript_matrix, vectorizer)
pyLDAvis.display(visualization)


### Step 5: Track with MLflow

In [None]:
import mlflow
import mlflow.sklearn

mlflow.set_experiment("YouTube Topic Modeling")

# log_artifact uploads an existing local file, and no earlier cell writes this
# one, so export the pyLDAvis view to disk before logging it.
visualization_path = "lda_visualization.html"
pyLDAvis.save_html(visualization, visualization_path)

with mlflow.start_run():
    # Read the topic count from the fitted model so the logged parameter cannot
    # drift from the value the model was actually trained with.
    mlflow.log_param("n_topics", lda.n_components)
    mlflow.sklearn.log_model(lda, "LDA_model")
    mlflow.log_artifact(visualization_path)
