In [None]:
# Importing libraries

import pandas as pd

import nltk   # nltk is for natural language processing and computational linguistics
from nltk.corpus import stopwords   # corpus is a collection of authentic text or audio organized into datasets
from nltk.sentiment import SentimentIntensityAnalyzer    # To analyse sentiment
from sklearn.feature_extraction.text import CountVectorizer    # method to convert text to numerical data
from sklearn.decomposition import LatentDirichletAllocation    # explains a set of observations through unobserved groups, and each group explains why some parts of the data are similar

In [None]:
# Location of the scraped YouTube comments CSV.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload; confirm before running elsewhere.
path = '/content/youtube_comments_scrapped.csv'

# Load the CSV into a DataFrame and show it as the cell output
df = pd.read_csv(path)
df

Unnamed: 0.1,Unnamed: 0,0
0,0,"And, could you also put up some material on ho..."
1,1,"HI sir ,thanks for sharing your knowledge it r..."
2,2,"i love how you spelling it, if we could build ..."
3,3,I just love how you keep reassuring us in the ...
4,4,At 13:00 you had mentioned about weakness of L...
5,5,Can you please guide me on learning NLP with R...
6,6,"Sir, will we be able to do text analytics and ..."
7,7,I think anyone who is in nlp has eventually fo...
8,8,i come across with all these concepts in my or...
9,9,"Great Sir, Big fan of yours. Thanks a lot for..."


In [None]:
# NLTK resources required by the rest of the notebook
nltk.download('stopwords')      # English stop-word list, used to drop very common words
nltk.download('punkt')          # Punkt tokenizer models used by nltk.word_tokenize on older NLTK releases
nltk.download('punkt_tab')      # Punkt data in the format newer NLTK releases look up instead of 'punkt'
nltk.download('vader_lexicon')  # VADER lexicon backing SentimentIntensityAnalyzer (kept last so the cell still outputs its result)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Build the set of English stop words used later to filter out common, uninformative words

stop_words = set(stopwords.words('english'))

In [None]:
# Word Frequency Analysis

def word_frequency_analysis(comments, top_n=10):
    """
    Print the most frequent words across all comments, in descending order.

    All comments are joined and tokenized with ``nltk.word_tokenize``. Tokens
    are lower-cased so that e.g. "Thanks" and "thanks" are counted together,
    and tokens that are English stop words or contain no letter or digit
    (",", ".", "?", ")") are dropped before counting.

    Parameters
    ----------
    comments : iterable
        Raw comments. Missing values (None / NaN) are treated as empty text;
        other non-string values are converted with ``str``.
    top_n : int, default 10
        Number of most frequent words to print.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Convert comments to strings; missing values (e.g. float NaN) become empty strings
    texts = ['' if pd.isnull(comment) else str(comment) for comment in comments]

    # Tokenize the whole corpus at once
    tokens = nltk.word_tokenize(' '.join(texts))

    # Normalize case, then keep only real words that are not stop words.
    # Pure punctuation tokens would otherwise dominate the ranking.
    words = [
        token.lower()
        for token in tokens
        if any(ch.isalnum() for ch in token) and token.lower() not in stop_words
    ]

    # Calculate word frequencies
    word_freq = nltk.FreqDist(words)

    # Print the most common words
    print('Most common words:')
    for word, freq in word_freq.most_common(top_n):
        print(f'{word}: {freq}')

In [None]:
# Sentiment Analysis

def sentiment_analysis(comments):
    """
    Print the average compound sentiment score of the comments.

    Each comment is scored with ``SentimentIntensityAnalyzer.polarity_scores``
    and the mean of the 'compound' values is printed.

    Parameters
    ----------
    comments : iterable
        Raw comments. Missing values (None / NaN) are skipped so they neither
        crash the analyzer nor dilute the average; other non-string values
        are converted with ``str``.

    Returns
    -------
    None
        The result is printed, not returned. If there is nothing to score,
        a short notice is printed instead of raising ZeroDivisionError.
    """
    # Drop missing values and coerce the rest to text
    texts = [str(comment) for comment in comments if not pd.isnull(comment)]

    # Nothing to score: bail out before dividing by zero
    if not texts:
        print('No comments to analyse')
        return

    sid = SentimentIntensityAnalyzer()

    # Compound score of each comment
    compound_scores = [sid.polarity_scores(text)['compound'] for text in texts]

    # Average over all scored comments
    avg_sentiment = sum(compound_scores) / len(compound_scores)

    print(f'Average sentiment: {avg_sentiment}')

In [None]:
# Topic Modeling

def topic_modeling(comments, n_topics=5, n_top_words=10):
    """
    Print the top words of each topic found in the comments.

    The comments are turned into a term-count matrix with ``CountVectorizer``
    (English stop words removed, ``max_df=0.95``, ``min_df=2``) and a
    ``LatentDirichletAllocation`` model is fitted on it.

    Parameters
    ----------
    comments : iterable
        Raw comments. Missing values (None / NaN) are skipped; other
        non-string values are converted with ``str``.
    n_topics : int, default 5
        Number of LDA topics to fit.
    n_top_words : int, default 10
        Number of highest-weighted words to print per topic.

    Returns
    -------
    None
        The result is printed, not returned. If there is nothing to model,
        a short notice is printed instead.
    """
    # Drop missing values and coerce the rest to text
    texts = [str(comment) for comment in comments if not pd.isnull(comment)]

    # Nothing to model: bail out before the vectorizer is given an empty corpus
    if not texts:
        print('No comments to analyse')
        return

    # Build the document-term count matrix
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tf = tf_vectorizer.fit_transform(texts)

    # Fit the LDA model; fixed random_state keeps runs reproducible
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(tf)

    # Print the top words for each topic
    print('Top words per topic:')
    feature_names = tf_vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it to take the highest weights first
        top_indices = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in top_indices]
        print(f"Topic {topic_idx+1}: {' '.join(top_words)}")


In [None]:
# Column labels of the loaded DataFrame

df.columns

Index(['Unnamed: 0', '0'], dtype='object')

In [None]:
# Extract column '0' of the DataFrame as a plain Python list; this list is what the analysis functions are called with

comments = df['0'].tolist()

In [None]:
# Display the extracted list of comments

comments

["And, could you also put up some material on how Hidden Markov Models are used in NLP? have studied them way back in 2011 during my Master's degree in the pre Deep Learning era. But don't have much practical exposure to NLP? And does acoustic model for phonemes recognition come more under speech Recognition? Could you also provide a short description on that?",
 'HI sir ,thanks for sharing your knowledge it really helps me alot sometime, i have a question. \nif LSTM has problem , why cant we directly use bidirectional LSTM instead of LSTM , can we skip LSTM and directly apply Bidirectional LSTM ?',
 'i love how you spelling it, if we could build such a pyramid for other techs, life of the learners would be much easier as it s enough to keep in mind this intuitive/organic scale of complexity.',
 'I just love how you keep reassuring us in the video that you got us covered from bottom to top. This is super helpful. Thank you',
 'At 13:00 you had mentioned about weakness of LSTM. If we ta

In [None]:
# Word frequency analysis: print the most common non-stop-word tokens with their counts

word_frequency_analysis(comments)

Most common words:
,: 23
.: 23
?: 10
NLP: 8
LSTM: 8
need: 8
): 5
Thanks: 5
playlist: 5
using: 5


In [None]:
# Sentiment analysis: print the average compound sentiment score over the comments

sentiment_analysis(comments)

Average sentiment: 0.442875


In [None]:
# Topic modeling: print the top words of each LDA topic fitted on the comments

topic_modeling(comments)

Top words per topic:
Topic 1: thank just love able video helpful text thanks cover using
Topic 2: really using helpful video cover make hard language based models
Topic 3: need playlist sir learning tell deep machine course following knowledge
Topic 4: lstm question bidirectional need using use able sir different mentioned
Topic 5: nlp thanks used learning deep come mentioned models language based
