In [3]:
import streamlit as st
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from transformers import pipeline
from textblob import TextBlob
import re
import nltk

In [10]:
# Ensure the NLTK resources used by the functions below are available locally:
#   - 'punkt' / 'punkt_tab': tokenizer models loaded by word_tokenize. Newer
#     NLTK releases look up 'punkt_tab' instead of the pickled 'punkt' models,
#     so both are fetched to keep the notebook working across versions.
#   - 'stopwords': stop-word lists (the English list is used below).
#   - 'wordnet': lexical database used by WordNetLemmatizer.
# Packages that are already present and up to date are not downloaded again.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oem\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oem\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Text summarization.
# The pipeline object is created lazily and shared by every summarize_text()
# call so the underlying model is loaded only once.
_SUMMARIZATION_PIPELINE = None


def _get_summarization_pipeline():
    """Return the shared summarization pipeline, building it on first use."""
    global _SUMMARIZATION_PIPELINE
    if _SUMMARIZATION_PIPELINE is None:
        _SUMMARIZATION_PIPELINE = pipeline("summarization")
    return _SUMMARIZATION_PIPELINE


def summarize_text(text, max_length=100000):
    """Summarize ``text`` with the default ``transformers`` summarization pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper bound on the generated summary length, forwarded to
            the pipeline.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or whitespace-only
        (the model is not invoked in that case).

    Notes:
        ``truncation=True`` is passed so that inputs longer than the model
        accepts are cut to fit instead of raising; only the leading part of a
        very long text is therefore summarized.
    """
    if not text or not text.strip():
        return ""

    summarizer = _get_summarization_pipeline()
    summary = summarizer(
        text,
        max_length=max_length,
        # Keep min_length feasible when a caller passes a small max_length.
        min_length=min(50, max_length),
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

In [6]:
import numpy as np
# Keyword extraction.
def extract_keywords(text, top_n=5):
    """Return the most frequent content words of ``text``.

    The text is tokenized, purely alphanumeric tokens are lowercased and
    lemmatized, and English stop words and one-character tokens are dropped.
    The remaining words are ranked by how often they occur.

    Args:
        text: The text to analyse.
        top_n: Maximum number of keywords to return (default 5).

    Returns:
        A list of at most ``top_n`` words, most frequent first. An empty list
        is returned when ``text`` is empty or holds no usable words (for
        example only stop words or punctuation).
    """
    if not text or not text.strip():
        return []

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Tokenize, then lowercase + lemmatize every purely alphanumeric token.
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word.lower()) for word in words if word.isalnum()]

    # Candidate keywords: not a stop word and longer than one character.
    keywords = [word for word in words if word not in stop_words and len(word) > 1]
    if not keywords:
        # CountVectorizer raises ValueError on an empty vocabulary.
        return []

    # Each keyword is passed as its own one-token "document", so summing the
    # count matrix over rows yields the frequency of every word.
    vectorizer = CountVectorizer()
    counter = vectorizer.fit_transform(keywords)

    # Word -> column index mapping.
    vocabulary = vectorizer.vocabulary_

    # Column sums = per-word frequencies.
    word_frequencies = np.array(counter.sum(axis=0)).flatten()

    # Map each word to its frequency.
    word_freq_dict = {word: word_frequencies[idx] for word, idx in vocabulary.items()}

    # Most frequent words first; keep the top_n.
    top_keywords = sorted(word_freq_dict, key=word_freq_dict.get, reverse=True)[:top_n]
    return top_keywords

In [7]:
def topic_modeling(text, n_topics=5, n_top_words=5):
    """Fit an LDA topic model on ``text`` and return the top words per topic.

    Args:
        text: The text to model. It is treated as one single document.
        n_topics: Number of LDA topics (default 5).
        n_top_words: Number of highest-weighted words to keep per topic
            (default 5).

    Returns:
        A list with one entry per topic; each entry is a list of up to
        ``n_top_words`` words ordered from highest to lowest topic weight.
        An empty list is returned when ``text`` is empty or no terms remain
        after English stop-word removal.
    """
    if not text or not text.strip():
        return []

    # The whole text is a single document, so every term has a document
    # frequency of exactly 1. Proportional max_df/min_df thresholds such as
    # max_df=0.95 would therefore prune every term and make fit_transform
    # raise "After pruning, no terms remain", so no df pruning is applied.
    vectorizer = CountVectorizer(stop_words='english')
    try:
        tf = vectorizer.fit_transform([text])
    except ValueError:
        # Raised when the vocabulary is empty (e.g. the text is only stop words).
        return []

    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=5,
        learning_method='online',
        random_state=42,
    )
    lda_model.fit(tf)

    feature_names = vectorizer.get_feature_names_out()
    # argsort is ascending; reverse it to list the heaviest words first.
    return [
        [feature_names[i] for i in topic.argsort()[::-1][:n_top_words]]
        for topic in lda_model.components_
    ]

In [8]:
def extract_video_id(url):
    """Extract the video id from a YouTube URL.

    Recognised forms:
        * ``...?v=<id>`` / ``...&v=<id>`` (watch URLs)
        * ``youtu.be/<id>`` (short links)
        * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``,
          ``/v/<id>`` (also on ``youtube-nocookie.com``)

    Args:
        url: The URL string to inspect.

    Returns:
        The video id, or ``None`` when ``url`` is empty/``None`` or matches
        none of the recognised forms.
    """
    if not url:
        return None

    # The id is captured as a run of URL-safe id characters, so trailing
    # query parameters, fragments ("#t=10") and slashes are never included.
    video_id_chars = r'([A-Za-z0-9_-]+)'
    patterns = (
        # 'v' must be a whole query parameter, not the tail of e.g. 'rev='.
        r'[?&]v=' + video_id_chars,
        r'youtu\.be/' + video_id_chars,
        r'youtube(?:-nocookie)?\.com/(?:embed|shorts|live|v)/' + video_id_chars,
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None