In [74]:
import csv
import pandas as pd
import numpy as np
import pickle
from collections import Counter
from googleapiclient.discovery import build
from textblob import TextBlob
from nltk import word_tokenize, pos_tag
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Download Comments (Do not run again unless you have a google API key)

Fetch YouTube Comments

In [None]:
youtube = build('youtube', 'v3', developerKey='KEY')

video_id = '78IJdhvY1zg'
comments = []

request = youtube.commentThreads().list(
    part='snippet',
    videoId=video_id,
    textFormat='plainText',
)

while request:
    response = request.execute()
    for item in response['items']:
        comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
        comments.append(comment)

    request = youtube.commentThreads().list_next(request, response)

for comment in comments:
    print(comment)


Pre-label Comments and Save to CSV

In [None]:
def analyze_sentiment(comment):
    analysis = TextBlob(comment)

    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

labeled_comments = []

for comment in comments:
    sentiment_label = analyze_sentiment(comment)
    labeled_comments.append({
        'comment': comment,
        'sentiment': sentiment_label
    })


csv_filename = 'labeled_comments.csv'

with open(csv_filename, 'w', newline='', encoding='utf-8') as csv_file:
    fieldnames = ['comment', 'sentiment']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(labeled_comments)


# Train Models

Load Data from Excel File

In [77]:
file_path = '/content/Data.xlsx'
data = pd.DataFrame(pd.read_excel(file_path))

for label, count in Counter(data['sentiment']).items():
    print(f"Label in data: {label}, Frequency: {count}")

Label in data: positive, Frequency: 631
Label in data: neutral, Frequency: 510
Label in data: negative, Frequency: 434


Select a Balanced Data

In [78]:
min_frequency = min(Counter(data['sentiment']).values())
data = pd.concat([data[data['sentiment'] == label].sample(min_frequency) for label in Counter(data['sentiment']).keys()])
print(len(data))

1302


Preprocess Data - Tokenization and POS Tagging

In [79]:
def preprocess_and_analyze(text):
    tokens = word_tokenize(text)
    pos_tags = [tag[1] for tag in pos_tag(tokens)]
    return tokens, pos_tags

data['tokens'], data['pos_tags'] = zip(*data['comment'].apply(preprocess_and_analyze).tolist())
data.to_csv('preprocessed_data.csv', index=False)

Sentiment Mapping

In [80]:
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
data['sentiment_numeric'] = data['sentiment'].map(sentiment_mapping)
sentiment_classes = ['negative', 'neutral', 'positive']

Train-Test Split

In [81]:
X_train, X_test, y_train, y_test = train_test_split(data['comment'], data['sentiment_numeric'], test_size=0.2, random_state=42,
                                                    stratify=data['sentiment_numeric'])

TF-IDF Vectorization

In [82]:
ngram_range = (1, 3)
num = data['comment'].str.split().explode().nunique()
print('Unique Words in Data: ', num)
tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range, min_df=5, max_features=num, stop_words='english')
tfidf_vectorizer_for_nb = TfidfVectorizer(ngram_range=ngram_range, min_df=5, max_features=num, stop_words='english')

Unique Words in Data:  2827


Print Number of Labels in Train & Test

In [83]:
for label_train, count_train in Counter(y_train).items():
    label_train = label_mapping[label_train]
    print(f'Label in train: {label_train}, Frequency: {count_train}')

for label_test, count_test in Counter(y_test).items():
    label_test = label_mapping[label_test]
    print(f'Label in test: {label_test}, Frequency: {count_test}')

Label in train: neutral, Frequency: 347
Label in train: negative, Frequency: 347
Label in train: positive, Frequency: 347
Label in test: positive, Frequency: 87
Label in test: negative, Frequency: 87
Label in test: neutral, Frequency: 87


Train-Test Split with Tokens and POS Taggins Information for NB

In [84]:
df = pd.DataFrame()
df['combined_text'] = data['comment'].astype(str) + data['tokens'].astype(str) + data['pos_tags'].astype(str)
matrix_tfidf = tfidf_vectorizer_for_nb.fit_transform(df['combined_text'])
data_tfidf = pd.DataFrame(matrix_tfidf.toarray(), columns=tfidf_vectorizer_for_nb.get_feature_names_out())

Naive Bayes Classification

In [85]:
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(
    data_tfidf, data['sentiment_numeric'], test_size=0.2, random_state=67, stratify=data['sentiment_numeric'])

nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_nb, y_train_nb)
y_pred = nb_classifier.predict(X_test_nb)
accuracy = accuracy_score(y_test_nb, y_pred)
nb_classification_rep = classification_report(y_test_nb, y_pred, target_names=sentiment_mapping.keys())

print(f'Accuracy: {accuracy:.2f}')
print(nb_classification_rep)

Accuracy: 0.78
              precision    recall  f1-score   support

    positive       0.82      0.83      0.82        87
     neutral       0.74      0.72      0.73        87
    negative       0.77      0.78      0.78        87

    accuracy                           0.78       261
   macro avg       0.78      0.78      0.78       261
weighted avg       0.78      0.78      0.78       261



SVM Classification with Grid Search

In [86]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

param_grid_svm = {
    'C': [0.1, 1, 10,100],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf']
}

svm_model = SVC(random_state=42)
svm_model_grid = GridSearchCV(estimator=svm_model, param_grid=param_grid_svm, verbose=10, cv=5, n_jobs=-1)
svm_model_grid.fit(X_train_tfidf, y_train)

results_df = pd.DataFrame(svm_model_grid.cv_results_)
print('Grid Search Results:')
print(results_df[['params', 'mean_test_score', 'rank_test_score']])

best_estimator_svm = svm_model_grid.best_estimator_
print('Best Estimator: ', best_estimator_svm)

y_pred = best_estimator_svm.predict(X_test_tfidf)

svm_report = classification_report(y_test, y_pred, target_names=sentiment_mapping.keys())

print('Classification Report:')
print(svm_report)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Grid Search Results:
                                              params  mean_test_score  \
0   {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}         0.688774   
1      {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}         0.642524   
2    {'C': 0.1, 'gamma': 'auto', 'kernel': 'linear'}         0.688774   
3       {'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}         0.460259   
4     {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}         0.783842   
5        {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}         0.851113   
6      {'C': 1, 'gamma': 'auto', 'kernel': 'linear'}         0.783842   
7         {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}         0.460259   
8    {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}         0.818458   
9       {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}         0.860710   
10    {'C': 10, 'gamma': 'auto', 'kernel': 'linear'}         0.818458   
11       {'C': 10, 'gamma': 'auto', 'kerne

LSTM Model

In [87]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

model_lstm = Sequential()
max_sequence_length = max(X_train_tfidf.shape[1], X_test_tfidf.shape[1])
model_lstm.add(Embedding(input_dim=X_train_tfidf.shape[1], output_dim=50, input_length=max_sequence_length))
model_lstm.add(Bidirectional(LSTM(50, return_sequences=True)))
model_lstm.add(Dropout(0.2))
model_lstm.add(Bidirectional(LSTM(50)))
model_lstm.add(Dense(3, activation='softmax'))

model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train_tfidf.toarray(), y_train, epochs=10, batch_size=16, validation_split = 0.1)

model_lstm.summary()

predictions = model_lstm.predict(X_test_tfidf.toarray())
predicted_labels = np.argmax(predictions, axis=1)
predicted_sentiments = [label_mapping[label] for label in predicted_labels]
lstm_tfidf_report = classification_report(y_test, predicted_labels, target_names=sentiment_mapping.keys())
print(lstm_tfidf_report)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 389, 50)           19450     
                                                                 
 bidirectional_12 (Bidirect  (None, 389, 100)          40400     
 ional)                                                          
                                                                 
 dropout_6 (Dropout)         (None, 389, 100)          0         
                                                                 
 bidirectional_13 (Bidirect  (None, 100)               60400     
 ional)                                                          
                                                                 
 dense_6 (Dense)             (None, 3)                 303       
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


LSTM Using Pre-trained Models

In [88]:
X_train = X_train.astype(str)
X_test = X_test.astype(str)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X_test)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

max_sequence_length = num
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

embedding_dim = 50
embedding_matrix = {}

with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_matrix[word] = coefs

vocab_size = len(tokenizer.word_index) + 1
embedding_matrix_for_model = np.zeros((vocab_size, embedding_dim))

for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_matrix.get(word)
    if embedding_vector is not None:
        embedding_matrix_for_model[i] = embedding_vector

model_pretrained = Sequential()
model_pretrained.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix_for_model], input_length=max_sequence_length, trainable=False))
model_pretrained.add(Bidirectional(LSTM(50, return_sequences=True)))
model_pretrained.add(Dropout(0.2))
model_pretrained.add(Bidirectional(LSTM(50)))
model_pretrained.add(Dense(3, activation='softmax'))

model_pretrained.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_pretrained.summary()
model_pretrained.fit(X_train_padded, y_train, epochs=10, batch_size=16, validation_split=0.1)

result = model_pretrained.predict(X_test_padded)
predicted_labels = np.argmax(result, axis=1)
predicted_sentiments = [label_mapping[label] for label in predicted_labels]
lstm_pretrained_report = classification_report(y_test, predicted_labels, target_names=sentiment_mapping.keys())
print(lstm_pretrained_report)


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 2827, 50)          102300    
                                                                 
 bidirectional_14 (Bidirect  (None, 2827, 100)         40400     
 ional)                                                          
                                                                 
 dropout_7 (Dropout)         (None, 2827, 100)         0         
                                                                 
 bidirectional_15 (Bidirect  (None, 100)               60400     
 ional)                                                          
                                                                 
 dense_7 (Dense)             (None, 3)                 303       
                                                                 
Total params: 203403 (794.54 KB)
Trainable params: 101

Save Models

In [89]:
def save_model(variable, filename):
    pickle.dump(variable, open(filename, "wb"))

save_model(nb_classifier, 'Naive_Bayes.pickle')
save_model(best_estimator_svm, 'SVM.pickle')
save_model(model_lstm, 'LSTM_TF-IDF.pickle')
save_model(model_pretrained, 'LSTM_Word2Vec.pickle')
save_model(tfidf_vectorizer_for_nb, 'TF-IDF_for_NB.pickle')
save_model(tfidf_vectorizer, 'TF-IDF.pickle')
save_model(tokenizer, 'Word2Vec.pickle')

# Load Model and Predict

Load Models

In [90]:
def load_model(filename):
    variable = pickle.load(open(filename, "rb"))
    return variable

nb_classifier = load_model('Naive_Bayes.pickle')
best_estimator_svm = load_model('SVM.pickle')
moedel_lstm = load_model('LSTM_TF-IDF.pickle')
model_pretrained = load_model('LSTM_Word2Vec.pickle')
tfidf_vectorizer_for_nb = load_model('TF-IDF_for_NB.pickle')
tfidf_vectorizer = load_model('TF-IDF.pickle')
tokenizer = load_model('Word2Vec.pickle')

Predict Sentiment for a Comment using All Models

In [92]:
def predict_sentiment(comment):

    tokens, pos_tags = preprocess_and_analyze(comment)

    comment_vectorized = tfidf_vectorizer.transform([comment])
    svm_prediction = best_estimator_svm.predict(comment_vectorized)

    comment_combined = tfidf_vectorizer_for_nb.transform([' '.join(map(str, [comment] + tokens + pos_tags))])
    comment_vectorized_combined = pd.DataFrame(comment_combined.toarray(), columns=tfidf_vectorizer_for_nb.get_feature_names_out())
    nb_prediction = nb_classifier.predict(comment_vectorized_combined)

    comment_sequence = tokenizer.texts_to_sequences([comment])
    comment_padded = pad_sequences(comment_sequence, maxlen=2827, padding='post')

    lstm_predictions = model_lstm.predict(comment_vectorized)
    lstm_pretrained_prediction = model_pretrained.predict(comment_padded)

    svm_predicted_class_index = np.argmax(svm_prediction)
    nb_predicted_class_index = np.argmax(nb_prediction)
    lstm_predicted_class_index = np.argmax(lstm_predictions)
    lstm_pretrained_prediction_class_index = np.argmax(lstm_pretrained_prediction)

    svm_sentiment = sentiment_classes[svm_predicted_class_index]
    nb_sentiment = sentiment_classes[nb_predicted_class_index]
    lstm_sentiment = sentiment_classes[lstm_predicted_class_index]
    lstm_pretrained_sentiment = sentiment_classes[lstm_pretrained_prediction_class_index]

    return svm_sentiment, nb_sentiment, lstm_sentiment, lstm_pretrained_sentiment, tokens, pos_tags


def predict_from_command_line():
    comment = input('Enter your comment: ')
    if comment:
        svm_sentiment, nb_sentiment, lstm_sentiment, lstm_pretrained_sentiment, tokens, pos_tags= predict_sentiment(comment)
        print(f'Tokens: {tokens}')
        print(f'POS Tags: {pos_tags}')
        print(f'SVM Model Prediction: {svm_sentiment}')
        print(f'Naive Bayes Model Prediction: {nb_sentiment}')
        print(f'LSTM Model Prediction: {lstm_sentiment}')
        print(f'LSTM Pretrained Model Prediction: {lstm_pretrained_sentiment}')
    else:
        print('Please enter a comment.')

predict_from_command_line()

Enter your comment: I love the movie
Tokens: ['I', 'love', 'the', 'movie']
POS Tags: ['PRP', 'VBP', 'DT', 'NN']
SVM Model Prediction: negative
Naive Bayes Model Prediction: negative
LSTM Model Prediction: positive
LSTM Pretrained Model Prediction: positive
