In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import numpy as np
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import seaborn as sns
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Fetch the NLTK 'punkt' package; presumably needed by the word_tokenize calls in later cells.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Bhavya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# --- TED-Ed corpus: read the metadata and build a TF-IDF representation ---
ted_data = pd.read_csv("./Datasets/TED-ED_youtube_metadata.csv")

# Vocabulary capped at 5000 terms, English stop words ignored.
ted_tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary / IDF weights and vectorise every transcript in one pass.
ted_tfidf_matrix = ted_tfidf_vectorizer.fit_transform(ted_data['transcript'])

# Dense frame: one row per transcript, one column per vocabulary term.
ted_tfidf_df = pd.DataFrame(
    data=ted_tfidf_matrix.toarray(),
    columns=ted_tfidf_vectorizer.get_feature_names_out(),
)



In [3]:
# Load the dataset for trending YouTube videos
yt_data = pd.read_csv("./Datasets/GB_youtube_filtered_dataset.csv")

# Drop rows without a transcript, then renumber the remaining rows 0..n-1.
# yt_tfidf_df below is created with a fresh 0..n-1 index; without this reset the
# later pd.concat([...], axis=1) with yt_data aligns on different index labels
# and pairs TF-IDF rows with the wrong videos (or with NaN rows).
yt_data = yt_data.dropna(subset=['transcript']).reset_index(drop=True)

# Initialize the TF-IDF vectorizer (vocabulary capped at 5000 terms, English stop words ignored)
yt_tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit the vectorizer and transform the transcript data
yt_tfidf_matrix = yt_tfidf_vectorizer.fit_transform(yt_data['transcript'])

# Convert the TF-IDF matrix to a DataFrame (row i corresponds to yt_data row i)
yt_tfidf_df = pd.DataFrame(yt_tfidf_matrix.toarray(), columns=yt_tfidf_vectorizer.get_feature_names_out())


In [4]:
# Mean TF-IDF weight over the non-zero entries only, so that the many zero
# entries of a row do not drag the mean towards 0.
def tf_idf_mean(row):
    """Return the mean of the non-zero entries of ``row``, or 0 if there are none.

    ``row`` is filtered with a boolean mask (``row[row != 0]``), so it must
    support that kind of indexing (e.g. a pandas Series or a NumPy array).
    """
    present = row[row != 0]
    return present.mean() if len(present) > 0 else 0

# Apply the function row-wise to create a new column.
# Any existing 'tf_idf_mean' column is excluded first, so re-running this cell
# does not fold the previous result back into the new mean.
yt_tfidf_df['tf_idf_mean'] = (
    yt_tfidf_df.drop(columns=['tf_idf_mean'], errors='ignore').apply(tf_idf_mean, axis=1)
)
ted_tfidf_df['tf_idf_mean'] = (
    ted_tfidf_df.drop(columns=['tf_idf_mean'], errors='ignore').apply(tf_idf_mean, axis=1)
)


In [5]:
# Entertainment subset of the trending videos: rows whose categoryId is 24,
# renumbered from 0.
yt_entertainment_data = yt_data.loc[yt_data['categoryId'] == 24].reset_index(drop=True)

# Remove the columns that are not needed downstream.
yt_entertainment_data = yt_entertainment_data.drop(columns=['video_id', 'categoryId', 'dislikes'])

In [None]:
# Compare the distribution of the mean TF-IDF score between the two corpora.
fig, ax = plt.subplots(figsize=(10, 6))
ted_tfidf_df['tf_idf_mean'].plot(kind='density', label='TED', ax=ax)
yt_tfidf_df['tf_idf_mean'].plot(kind='density', label='YouTube Trending', ax=ax)
ax.set_title('TF-IDF Density Plot')
ax.set_xlabel('TF-IDF Score')
ax.set_ylabel('Density')
ax.set_xlim(0, 0.2)
ax.set_ylim(0)  # only the lower bound is pinned
ax.legend()
plt.show()


In [None]:
# Word2Vec approach and its analysis (TED transcripts)

# Columns are added to this frame step by step.
ted_word2vec_df = pd.DataFrame()

# Lower-case and tokenise every transcript; str() guards against non-string cells.
ted_word2vec_df['tokenized_transcript'] = ted_data['transcript'].apply(
    lambda text: word_tokenize(str(text).lower())
)

# Fit 100-dimensional word vectors on the tokenised transcripts.
model = Word2Vec(
    sentences=ted_word2vec_df['tokenized_transcript'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Average the word embeddings for each transcript
def average_word_vectors(tokens, model, vocabulary, vector_size):
    """Return the element-wise mean of the embeddings of ``tokens``.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of one document.
    model : object exposing ``model.wv[word]``
        Source of the word vectors.
    vocabulary : container of str
        Words that have an embedding; tokens outside it are skipped.
    vector_size : int
        Length of the zero vector returned when nothing can be averaged.

    Returns
    -------
    numpy.ndarray
        The mean vector, or ``np.zeros(vector_size)`` when ``tokens`` is empty
        or none of the tokens is in ``vocabulary``.
    """
    if len(tokens) < 1:
        return np.zeros(vector_size)
    # The notebook passes a plain list (index_to_key); `in` on a list is a
    # linear scan per token, so build a hash-based lookup once per document.
    if not isinstance(vocabulary, (set, frozenset, dict)):
        vocabulary = set(vocabulary)
    vectors = [model.wv[word] for word in tokens if word in vocabulary]
    if len(vectors) < 1:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# One averaged word vector per transcript.
# key_to_index is a dict over the same vocabulary as index_to_key, so the
# per-token membership test is a hash lookup instead of a list scan.
ted_word2vec_df['embedding'] = ted_word2vec_df['tokenized_transcript'].apply(
    lambda tokens: average_word_vectors(tokens, model, model.wv.key_to_index, model.vector_size)
)

# Collapse every document vector to one scalar: the mean over its components.
# (The former bare `ted_word2vec_df['embedding'].mean()` was removed: its result was discarded.)
ted_word2vec_df['embedding_mean'] = ted_word2vec_df['embedding'].apply(lambda x: pd.Series(x).mean())


In [None]:
# Correlation matrix visualisation

# Combine only the columns that are needed. Concatenating the whole TF-IDF frame
# would add thousands of term columns, and a term such as 'title' would collide
# with the metadata column of the same name.
ted_merged_df = pd.concat(
    [
        ted_data[['title', 'transcript']],
        ted_word2vec_df[['embedding_mean']],
        ted_tfidf_df[['tf_idf_mean']],
        ted_data[['like_count', 'view_count', 'comment_count']],
    ],
    axis=1,
)

ted_merged_df['like_view_ratio'] = ted_merged_df['like_count'] / ted_merged_df['view_count']
ted_merged_df['comment_view_ratio'] = ted_merged_df['comment_count'] / ted_merged_df['view_count']
ted_merged_df['comment_like_ratio'] = ted_merged_df['comment_count'] / ted_merged_df['like_count']

# A zero denominator yields +/-inf; map it to NaN so .corr() skips that row
# instead of letting the infinity poison the coefficient.
ratio_columns = ['like_view_ratio', 'comment_view_ratio', 'comment_like_ratio']
ted_merged_df[ratio_columns] = ted_merged_df[ratio_columns].replace([np.inf, -np.inf], np.nan)

# Compute correlation matrix
correlation_matrix = ted_merged_df[['embedding_mean', 'tf_idf_mean', 'like_count', 'view_count', 'comment_count','like_view_ratio','comment_view_ratio','comment_like_ratio']].corr()
print(correlation_matrix)
# Plot correlation matrix using heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Between Metrics')
plt.show()


In [None]:
# Word2Vec approach and its analysis (trending YouTube transcripts)

# Columns are added to this frame step by step.
yt_word2vec_df = pd.DataFrame()

# Lower-case and tokenise every transcript; str() guards against non-string cells.
yt_word2vec_df['tokenized_transcript'] = yt_data['transcript'].apply(
    lambda text: word_tokenize(str(text).lower())
)

# Fit 100-dimensional word vectors on the tokenised transcripts.
model = Word2Vec(
    sentences=yt_word2vec_df['tokenized_transcript'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Average the word embeddings for each transcript
def average_word_vectors(tokens, model, vocabulary, vector_size):
    """Return the element-wise mean of the embeddings of ``tokens``.

    Parameters
    ----------
    tokens : sequence of str
        Tokens of one document.
    model : object exposing ``model.wv[word]``
        Source of the word vectors.
    vocabulary : container of str
        Words that have an embedding; tokens outside it are skipped.
    vector_size : int
        Length of the zero vector returned when nothing can be averaged.

    Returns
    -------
    numpy.ndarray
        The mean vector, or ``np.zeros(vector_size)`` when ``tokens`` is empty
        or none of the tokens is in ``vocabulary``.
    """
    if len(tokens) < 1:
        return np.zeros(vector_size)
    # The notebook passes a plain list (index_to_key); `in` on a list is a
    # linear scan per token, so build a hash-based lookup once per document.
    if not isinstance(vocabulary, (set, frozenset, dict)):
        vocabulary = set(vocabulary)
    vectors = [model.wv[word] for word in tokens if word in vocabulary]
    if len(vectors) < 1:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# One averaged word vector per transcript.
# key_to_index is a dict over the same vocabulary as index_to_key, so the
# per-token membership test is a hash lookup instead of a list scan.
yt_word2vec_df['embedding'] = yt_word2vec_df['tokenized_transcript'].apply(
    lambda tokens: average_word_vectors(tokens, model, model.wv.key_to_index, model.vector_size)
)

# Collapse every document vector to one scalar: the mean over its components.
# (The former bare `yt_word2vec_df['embedding'].mean()` was removed: its result was discarded.)
yt_word2vec_df['embedding_mean'] = yt_word2vec_df['embedding'].apply(lambda x: pd.Series(x).mean())

In [None]:
# Correlation matrix visualisation

# The three frames describe the same videos in the same row order, but their
# index labels can differ: yt_data may have gaps after dropna, while
# yt_tfidf_df is numbered 0..n-1. Align by position, and take only the columns
# that are needed so TF-IDF term columns cannot collide with metadata columns.
yt_meta = yt_data.reset_index(drop=True)
yt_merged_df = pd.concat(
    [
        yt_meta[['title', 'transcript']],
        yt_word2vec_df[['embedding_mean']].reset_index(drop=True),
        yt_tfidf_df[['tf_idf_mean']].reset_index(drop=True),
        yt_meta[['like_count', 'view_count', 'comment_count']],
    ],
    axis=1,
)

yt_merged_df['like_view_ratio'] = yt_merged_df['like_count'] / yt_merged_df['view_count']
yt_merged_df['comment_view_ratio'] = yt_merged_df['comment_count'] / yt_merged_df['view_count']
yt_merged_df['comment_like_ratio'] = yt_merged_df['comment_count'] / yt_merged_df['like_count']

# A zero denominator yields +/-inf; map it to NaN so .corr() skips that row
# instead of letting the infinity poison the coefficient.
ratio_columns = ['like_view_ratio', 'comment_view_ratio', 'comment_like_ratio']
yt_merged_df[ratio_columns] = yt_merged_df[ratio_columns].replace([np.inf, -np.inf], np.nan)

# Compute correlation matrix
correlation_matrix = yt_merged_df[['embedding_mean', 'tf_idf_mean', 'like_count', 'view_count', 'comment_count','like_view_ratio','comment_view_ratio','comment_like_ratio']].corr()
print(correlation_matrix)
# Plot correlation matrix using heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Correlation Between Metrics')
plt.show()

# Combining dataset

In [6]:
# Build the labelled corpus: entertainment vs. informative videos.

# Additional informative sources
yt_educational_data = pd.read_csv("./Datasets/educational_youtube_metadata.csv")
vox_data = pd.read_csv("./Datasets/vox_youtube_metadata.csv")

# Entertainment: a labelled copy, so yt_entertainment_data itself stays unlabelled.
entertainment_data = yt_entertainment_data.assign(label='entertainment')

# Informative: TED + educational + Vox, restricted to the columns all three share.
informative_sources = [ted_data, yt_educational_data, vox_data]
informational_data = pd.concat(informative_sources, join='inner', ignore_index=True)
informational_data['label'] = 'informative'

# Entertainment rows come first, informative rows after; the index is renumbered.
combined_data = pd.concat([entertainment_data, informational_data], ignore_index=True)



In [7]:
# Preview the transcript column of the combined dataset (displayed as the cell output).
combined_data['transcript']

0        i have been gone off youtube for over one mont...
1        hi sisters james charles here and welcome back...
2        my [Music] [Music] [Music] [Applause] [Music] ...
3        i think it's the heat i think it needs to go i...
4        what's going on live fam i hope everybody's do...
                               ...                        
13812    The United States' national debt is $12.5 tril...
13813    what have we made progress in that people don'...
13814    a mass extinction is just defined as a moment ...
13815    "Obamacare" "The obamacare sign-up deadline ge...
13816    There's a problem in journalism. We call some ...
Name: transcript, Length: 13817, dtype: object

In [8]:
# Preprocessing of the data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import string
import contractions

# Download NLTK stopwords
nltk.download('stopwords')

# Built once instead of on every call: preprocess_text runs for every
# transcript, and the stop-word list does not change between calls.
ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Define preprocessing function
def preprocess_text(text):
    """Normalise one transcript for the text models.

    Steps, in order: lower-case, expand contractions (``contractions.fix``),
    delete every ``string.punctuation`` character, drop English stop words.

    Parameters
    ----------
    text : str
        Raw transcript.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Lowercase, then expand contractions
    text = contractions.fix(text.lower())
    # Removing punctuation
    text = text.translate(PUNCTUATION_TABLE)
    # Removing stopwords (compared case-insensitively)
    return ' '.join(word for word in text.split() if word.lower() not in ENGLISH_STOP_WORDS)

# Apply preprocessing function to 'transcript' column
combined_data['transcript'] = combined_data['transcript'].apply(preprocess_text)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Bhavya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Tokenise the cleaned transcripts and turn the string labels into 0/1 targets.

def _to_tokens(text):
    """Lower-case ``text`` (coerced to str) and split it into word tokens."""
    return word_tokenize(str(text).lower())

combined_data['tokenized_transcript'] = combined_data['transcript'].apply(_to_tokens)

# informative -> 1, entertainment -> 0
LABEL_TO_ID = {'informative': 1, 'entertainment': 0}
combined_data['label_encoded'] = combined_data['label'].map(LABEL_TO_ID)

# Data Analysis

## Density Plot of Likes, Views and Comments

In [None]:
def _plot_engagement_density(column, divisor, xlabel, title, x_max):
    """Overlay the KDE of ``column / divisor`` for the informative and entertainment sets.

    The x axis is limited to ``[0, x_max]``; the y axis starts at 0.
    """
    plt.figure(figsize=(10, 6))
    sns.kdeplot(informational_data[column] / divisor, label='Informative', fill=True)
    sns.kdeplot(entertainment_data[column] / divisor, label='Non-Informative', fill=True)

    plt.xlabel(xlabel)
    plt.ylabel('Density')
    plt.title(title)

    plt.xlim(0, x_max)
    plt.ylim(0,)

    plt.tight_layout()
    plt.legend()
    plt.show()


# (column, unit divisor, x-axis label, title, upper x limit) for Views, Likes, Comments
for plot_spec in [
    ('view_count', 10**6, 'Views in millions', 'Density plot of Views', 15),
    ('like_count', 10**3, 'Likes in thousands', 'Density plot of Likes', 600),
    ('comment_count', 10**3, 'Comments in thousands', 'Density plot of Comments', 40),
]:
    _plot_engagement_density(*plot_spec)



In [None]:
# Pairwise correlation of likes, views and comments over the whole dataset
metric_columns = ['like_count', 'view_count', 'comment_count']
correlation_matrix = combined_data[metric_columns].corr()
print(correlation_matrix)

# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, vmin=-1, vmax=1, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation between metrics in the dataset')
plt.show()

In [None]:
# Correlation of likes, views and comments, computed separately for each class.
# Informative first, then entertainment; correlation_matrix ends up holding the
# entertainment matrix.
for class_frame, heatmap_title in (
    (informational_data, 'Correlation between metrics for informative content'),
    (entertainment_data, 'Correlation between metrics for non-informative content'),
):
    correlation_matrix = class_frame[['like_count', 'view_count', 'comment_count']].corr()
    print(correlation_matrix)

    # Heatmap on a fixed [-1, 1] colour scale
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, vmin=-1, vmax=1, cmap='coolwarm', annot=True, fmt=".2f")
    plt.title(heatmap_title)
    plt.show()

## Like-View Ratio

In [None]:
# Like-to-view ratio for every video in the combined dataset.
lvr = combined_data['like_count']/combined_data['view_count']
# A zero view count gives NaN (0/0) or +/-inf (x/0). dropna() alone keeps the
# infinities, which would turn the mean/std into inf/NaN and break the KDE,
# so map them to NaN first and drop both.
lvr = lvr.replace([np.inf, -np.inf], np.nan).dropna()

mean_lvr = np.mean(lvr)
median_lvr = np.median(lvr)
std_dev_lvr = np.std(lvr)
min_lvr = np.min(lvr)
max_lvr = np.max(lvr)

print("Descriptive Statistics for Like-to-View Ratio (LVR):")
print("Mean LVR:", mean_lvr)
print("Median LVR:", median_lvr)
print("Standard Deviation of LVR:", std_dev_lvr)
print("Minimum LVR:", min_lvr)
print("Maximum LVR:", max_lvr)

# Density plot
plt.figure(figsize=(10, 6))
sns.kdeplot(lvr, fill=True)
plt.title('Density Plot of Like-to-View Ratio (LVR)')
plt.xlabel('Like to View Ratio')
plt.ylabel('Density')
plt.xlim(0,0.25)
plt.ylim(0,)
plt.show()

In [None]:
# Like-to-view ratio per class; the raw ratio is also stored on each frame as 'lvr'.
informational_data_lvr = informational_data['like_count'] / informational_data ['view_count']
entertainment_data_lvr = entertainment_data['like_count'] / entertainment_data ['view_count']
informational_data['lvr'] = informational_data['like_count'] / informational_data ['view_count']
entertainment_data['lvr'] = entertainment_data['like_count'] / entertainment_data ['view_count']

# A zero view count gives NaN (0/0) or +/-inf (x/0). dropna() alone keeps the
# infinities, which would turn the mean into inf and break the KDE, so map
# them to NaN first and drop both.
informational_data_lvr = informational_data_lvr.replace([np.inf, -np.inf], np.nan).dropna()
entertainment_data_lvr = entertainment_data_lvr.replace([np.inf, -np.inf], np.nan).dropna()

informational_mean_lvr = np.mean(informational_data_lvr)
informational_median_lvr = np.median(informational_data_lvr)
print(f"Informational:\n Mean: {informational_mean_lvr} Median {informational_median_lvr}")
entertainment_mean_lvr = np.mean(entertainment_data_lvr)
entertainment_median_lvr = np.median(entertainment_data_lvr)
print(f"Entertainment:\n Mean: {entertainment_mean_lvr} Median {entertainment_median_lvr}")

plt.figure(figsize=(10, 6))
sns.kdeplot(informational_data_lvr, label='Informative', fill=True,)
sns.kdeplot(entertainment_data_lvr, label='Non-Informative', fill=True,)

plt.xlabel('Like to View Ratio')
plt.ylabel('Density')
plt.title('Density plot of Like to View Ratio')

plt.xlim(0,0.25)
plt.ylim(0,)

plt.tight_layout() 
plt.legend()
plt.show()


## TF-IDF Visualisation

In [None]:
# TF-IDF over the combined (already preprocessed) transcripts:
# vocabulary capped at 5000 terms, English stop words ignored.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary / IDF weights and vectorise every transcript in one pass.
tfidf_matrix = tfidf_vectorizer.fit_transform(combined_data['transcript'])

# Dense frame: one row per transcript, one column per vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)


In [None]:
# TF-IDF visualisations for informational and entertainment data

# Average over the vocabulary columns only. Derived columns ('tf_idf_mean' from a
# previous run of this cell, 'top_10_mean' from a later cell) must not enter the
# mean, otherwise re-running the cell changes the result.
feature_columns = list(tfidf_vectorizer.get_feature_names_out())
tfidf_df['tf_idf_mean'] = tfidf_df[feature_columns].apply(tf_idf_mean, axis=1)

# The row mean does not depend on the other rows, so the per-class frames are
# sliced from the result instead of recomputing it. In combined_data the
# entertainment rows come first, followed by the informational rows.
n_entertainment = len(entertainment_data)
class_columns = feature_columns + ['tf_idf_mean']
entertainment_tfidf_df = tfidf_df.iloc[:n_entertainment][class_columns].copy()
informational_tfidf_df = tfidf_df.iloc[n_entertainment:][class_columns].copy()

# Plot the density plot
plt.figure(figsize=(10,6))
sns.kdeplot(informational_tfidf_df['tf_idf_mean'], label='Informative', fill=True)
sns.kdeplot(entertainment_tfidf_df['tf_idf_mean'], label='Non-Informative', fill=True)

plt.title('TF-IDF Density Plot')
plt.xlabel('TF-IDF Score')
plt.ylabel('Density')
plt.xlim(0, 0.2)
plt.legend()
plt.show()

In [None]:
# Correlation matrix visualisation
combined_data_tfidf = pd.concat([tfidf_df,combined_data],axis=1)
# Take an explicit copy of the needed columns: the ratio columns below are added
# to this frame, and writing to a slice of combined_data_tfidf can trigger
# pandas' SettingWithCopyWarning.
correlation_matrix = combined_data_tfidf[['title','transcript', 'like_count', 'view_count', 'comment_count','tf_idf_mean','label_encoded']].copy()
correlation_matrix['like_view_ratio'] = correlation_matrix['like_count'] / correlation_matrix['view_count']
correlation_matrix['comment_view_ratio'] = correlation_matrix['comment_count'] / correlation_matrix['view_count']
correlation_matrix['comment_like_ratio'] = correlation_matrix['comment_count'] / correlation_matrix['like_count']

# A zero denominator yields +/-inf; map it to NaN so .corr() skips that row
# instead of letting the infinity poison the coefficient.
ratio_columns = ['like_view_ratio', 'comment_view_ratio', 'comment_like_ratio']
correlation_matrix[ratio_columns] = correlation_matrix[ratio_columns].replace([np.inf, -np.inf], np.nan)

# Compute correlation matrix
correlation_matrix = correlation_matrix[['tf_idf_mean','like_view_ratio','label_encoded']].corr()
print(correlation_matrix)
# Plot correlation matrix using heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('TF-IDF Correlation Between Metrics')
plt.show()


## Top 10 TF-IDF Visualisation

In [None]:
# Dense TF-IDF matrix: one row per document
tfidf_array = tfidf_matrix.toarray()

# Ascending sort within each row, so the largest weights end up in the last columns
sorted_tfidf = np.sort(tfidf_array, axis=1)

# The ten largest weights of every document ...
top_10_values = sorted_tfidf[:, -10:]

# ... and their mean, one value per document
top_10_mean_values = top_10_values.mean(axis=1)

# Stored next to the term columns
tfidf_df['top_10_mean'] = top_10_mean_values


In [None]:
# Top-10 TF-IDF mean per class: entertainment rows come first in tfidf_df,
# informational rows after.
n_entertainment_rows = len(entertainment_data)
top10_entertainment_tfidf_df = tfidf_df.iloc[:n_entertainment_rows].copy()
top10_informational_tfidf_df = tfidf_df.iloc[n_entertainment_rows:].copy()

# Overlay both class densities on one set of axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.kdeplot(top10_entertainment_tfidf_df['top_10_mean'], label='Entertainment', fill=True, ax=ax)
sns.kdeplot(top10_informational_tfidf_df['top_10_mean'], label='Informational', fill=True, ax=ax)
ax.set_title('Top 10 TF-IDF Density Plot')
ax.set_xlabel('Mean of top 10 TF-IDF Scores')
ax.set_ylabel('Density')
ax.set_xlim(0)  # only the lower bounds are pinned
ax.set_ylim(0)
ax.legend()
plt.show()

In [None]:
# Correlation matrix visualisation
combined_data_tfidf = pd.concat([tfidf_df,combined_data],axis=1)
# Take an explicit copy of the needed columns: the ratio columns below are added
# to this frame, and writing to a slice of combined_data_tfidf can trigger
# pandas' SettingWithCopyWarning.
correlation_matrix = combined_data_tfidf[['title','transcript', 'like_count', 'view_count', 'comment_count','top_10_mean','label_encoded']].copy()
correlation_matrix['like_view_ratio'] = correlation_matrix['like_count'] / correlation_matrix['view_count']
correlation_matrix['comment_view_ratio'] = correlation_matrix['comment_count'] / correlation_matrix['view_count']
correlation_matrix['comment_like_ratio'] = correlation_matrix['comment_count'] / correlation_matrix['like_count']

# A zero denominator yields +/-inf; map it to NaN so .corr() skips that row
# instead of letting the infinity poison the coefficient.
ratio_columns = ['like_view_ratio', 'comment_view_ratio', 'comment_like_ratio']
correlation_matrix[ratio_columns] = correlation_matrix[ratio_columns].replace([np.inf, -np.inf], np.nan)

# Compute correlation matrix
correlation_matrix = correlation_matrix[['top_10_mean','like_view_ratio','label_encoded']].corr()
print(correlation_matrix)
# Plot correlation matrix using heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Top 10 TF-IDF Correlation Between Metrics')
plt.show()

In [None]:
from wordcloud import WordCloud

# ENTERTAINMENT

# Vocabulary terms: every column except the derived 'tf_idf_mean'
feature_names = [column for column in entertainment_tfidf_df.columns if column != 'tf_idf_mean']

# Total TF-IDF weight of each term over all entertainment documents
entertainment_total_tfidf_scores = {
    word: entertainment_tfidf_df[word].sum() for word in feature_names
}

# Terms ranked by total weight, highest first
sorted_tfidf_scores = dict(
    sorted(entertainment_total_tfidf_scores.items(), key=lambda pair: pair[1], reverse=True)
)
# The 100 highest-scoring terms (the dict above is already in rank order)
top_100_words = dict(list(sorted_tfidf_scores.items())[:100])

# Word cloud in which the size of a word follows its total TF-IDF weight
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate_from_frequencies(top_100_words)

# Render the cloud without axes; the caption is printed above the figure
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
print('\nWord Cloud of Top 100 words based on TF-IDF Scores for Entertainment Category')
plt.axis('off')
plt.show()




In [None]:
# INFORMATIONAL

# Vocabulary terms: every column except the derived 'tf_idf_mean'
feature_names = [column for column in informational_tfidf_df.columns if column != 'tf_idf_mean']

# Total TF-IDF weight of each term over all informational documents
info_total_tfidf_scores = {
    word: informational_tfidf_df[word].sum() for word in feature_names
}

# Terms ranked by total weight, highest first
sorted_tfidf_scores = dict(
    sorted(info_total_tfidf_scores.items(), key=lambda pair: pair[1], reverse=True)
)
# The 100 highest-scoring terms (the dict above is already in rank order)
top_100_words = dict(list(sorted_tfidf_scores.items())[:100])

# Word cloud in which the size of a word follows its total TF-IDF weight
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate_from_frequencies(top_100_words)

# Render the cloud without axes; the caption is printed above the figure
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
print('\nWord Cloud of Top 100 words based on TF-IDF Scores for Informational Category')
plt.axis('off')
plt.show()

## DOC2VEC Visualisation


Run the next section before running this section

In [None]:
# Doc2Vec model read from ./models/doc2vec_model
model = Doc2Vec.load("./models/doc2vec_model")

# Work on a copy so the embedding columns are not added to combined_data
combined_data_doc2vec = combined_data.copy()


def infer_embeddings(tokenized_transcript, model):
    """Return the vector that ``model.infer_vector`` produces for one tokenised transcript."""
    document_vector = model.infer_vector(tokenized_transcript)
    return document_vector


# One inferred document vector per transcript
combined_data_doc2vec['embeddings'] = combined_data_doc2vec['tokenized_transcript'].apply(
    lambda tokens: infer_embeddings(tokens, model)
)

# Scalar summary of each vector: the mean over its components
combined_data_doc2vec['embedding_mean'] = combined_data_doc2vec['embeddings'].apply(
    lambda vector: pd.Series(vector).mean()
)



In [None]:
# Correlation matrix visualisation

# Take an explicit copy of the needed columns: the ratio columns below are added
# to this frame, and writing to a slice of combined_data_doc2vec can trigger
# pandas' SettingWithCopyWarning.
correlation_matrix = combined_data_doc2vec[['title','transcript','embedding_mean', 'like_count', 'view_count', 'comment_count','label_encoded']].copy()
correlation_matrix['like_view_ratio'] = correlation_matrix['like_count'] / correlation_matrix['view_count']
correlation_matrix['comment_view_ratio'] = correlation_matrix['comment_count'] / correlation_matrix['view_count']
correlation_matrix['comment_like_ratio'] = correlation_matrix['comment_count'] / correlation_matrix['like_count']

# A zero denominator yields +/-inf; map it to NaN so .corr() skips that row
# instead of letting the infinity poison the coefficient.
ratio_columns = ['like_view_ratio', 'comment_view_ratio', 'comment_like_ratio']
correlation_matrix[ratio_columns] = correlation_matrix[ratio_columns].replace([np.inf, -np.inf], np.nan)

# Compute correlation matrix
correlation_matrix = correlation_matrix[['embedding_mean','like_view_ratio','label_encoded']].corr()
print(correlation_matrix)
# Plot correlation matrix using heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", vmin=-1, vmax=1)
plt.title('Doc2Vec Correlation Between Metrics')
plt.show()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Doc2Vec visualisation per class: entertainment rows come first in
# combined_data_doc2vec, informational rows after.
n_entertainment_docs = len(entertainment_data)
entertainment_doc2vec = combined_data_doc2vec.iloc[:n_entertainment_docs].copy()
informational_doc2vec = combined_data_doc2vec.iloc[n_entertainment_docs:].copy()

# Min-max scaling is fitted on the informational values only; the entertainment
# values are transformed with those same bounds, so they are not guaranteed to
# fall inside [0, 1].
scaler = MinMaxScaler()
informational_column = informational_doc2vec['embedding_mean'].to_numpy().reshape(-1, 1)
entertainment_column = entertainment_doc2vec['embedding_mean'].to_numpy().reshape(-1, 1)
informational_doc2vec_scaled = scaler.fit_transform(informational_column)
entertainment_doc2vec_scaled = scaler.transform(entertainment_column)

# Overlay both class densities
plt.figure(figsize=(10,6))
sns.kdeplot(informational_doc2vec_scaled.flatten(), label='Informative', fill=True)
sns.kdeplot(entertainment_doc2vec_scaled.flatten(), label='Non-Informative', fill=True)

plt.title('Doc2Vec Density Plot')
plt.xlabel('Mean of Doc2Vec Embeddings ')
plt.ylabel('Density')
plt.xlim(0,)
plt.legend()
plt.show()





# CLASSIFICATION MODEL

In [None]:
# # Tokenize transcripts
# combined_data['tokenized_transcript'] = combined_data['transcript'].apply(lambda x: word_tokenize(str(x).lower()))

# # Encode labels ('informative' and 'non-informative')
# combined_data['label_encoded'] = combined_data['label'].map({'informative': 1, 'entertainment': 0})

In [10]:
from pathlib import Path

# Split data into features and labels
X = combined_data['tokenized_transcript']
y = combined_data['label_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tag documents with unique identifiers (consumed only when the model has to be trained)
tagged_data = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(X_train)]

# Reuse the saved Doc2Vec model when there is one. On a fresh checkout the file
# does not exist yet, so train it here (on the training split only) and save
# it, instead of failing in Doc2Vec.load.
doc2vec_model_path = Path("./models/doc2vec_model")
if doc2vec_model_path.exists():
    model = Doc2Vec.load("./models/doc2vec_model")
else:
    model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=20)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    doc2vec_model_path.parent.mkdir(parents=True, exist_ok=True)
    model.save("./models/doc2vec_model")

# Get document embeddings for training and testing sets
X_train_embeddings = [model.infer_vector(doc) for doc in X_train]
X_test_embeddings = [model.infer_vector(doc) for doc in X_test]



## Exploring classifiers for the model

In [15]:
from sklearn.metrics import confusion_matrix

In [17]:
# Logistic regression on the Doc2Vec embeddings
clf_logreg = LogisticRegression().fit(X_train_embeddings, y_train)

# Labels predicted for the held-out set
y_pred_lr = clf_logreg.predict(X_test_embeddings)

# Accuracy, per-class report and confusion matrix for the held-out set
accuracy = accuracy_score(y_test, y_pred_lr)
conf_matrix = confusion_matrix(y_test, y_pred_lr)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_lr, digits=5))
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9529667149059334
              precision    recall  f1-score   support

           0    0.96373   0.95917   0.96145      1690
           1    0.93623   0.94320   0.93970      1074

    accuracy                        0.95297      2764
   macro avg    0.94998   0.95119   0.95058      2764
weighted avg    0.95305   0.95297   0.95300      2764

Confusion Matrix:
[[1621   69]
 [  61 1013]]


In [18]:
# Train SVM model.
# probability=True is required because the TESTING section calls
# clf_svm.predict_proba, which an SVC fitted with the default probability=False
# does not provide. random_state fixes the shuffling used for the probability
# estimates.
clf_svm = SVC(probability=True, random_state=42)
clf_svm.fit(X_train_embeddings, y_train)
# Predict labels for testing set using SVM
y_pred_svm = clf_svm.predict(X_test_embeddings)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred_svm)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_svm, digits=5))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_svm)
print("Confusion Matrix:")
print(conf_matrix)


Accuracy: 0.9678002894356006
              precision    recall  f1-score   support

           0    0.97395   0.97337   0.97366      1690
           1    0.95814   0.95903   0.95859      1074

    accuracy                        0.96780      2764
   macro avg    0.96604   0.96620   0.96612      2764
weighted avg    0.96781   0.96780   0.96780      2764

Confusion Matrix:
[[1645   45]
 [  44 1030]]


In [19]:
# Train Random Forest model.
# random_state is fixed (same seed as train_test_split) so that the forest, and
# therefore the reported metrics, do not change from run to run.
clf_rf = RandomForestClassifier(random_state=42)
clf_rf.fit(X_train_embeddings, y_train)
# Predict labels for testing set using Random Forest
y_pred_rf = clf_rf.predict(X_test_embeddings)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_rf, digits=5))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9569464544138929
              precision    recall  f1-score   support

           0    0.96288   0.96686   0.96487      1690
           1    0.94752   0.94134   0.94442      1074

    accuracy                        0.95695      2764
   macro avg    0.95520   0.95410   0.95464      2764
weighted avg    0.95691   0.95695   0.95692      2764

Confusion Matrix:
[[1634   56]
 [  63 1011]]


In [20]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the Doc2Vec embeddings
clf_gnb = GaussianNB().fit(X_train_embeddings, y_train)

# Labels predicted for the held-out set using Gaussian NB
y_pred_nb = clf_gnb.predict(X_test_embeddings)

# Accuracy, per-class report and confusion matrix for the held-out set
accuracy = accuracy_score(y_test, y_pred_nb)
conf_matrix = confusion_matrix(y_test, y_pred_nb)

print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_nb, digits=5))
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.821273516642547
              precision    recall  f1-score   support

           0    0.95098   0.74615   0.83621      1690
           1    0.70167   0.93948   0.80334      1074

    accuracy                        0.82127      2764
   macro avg    0.82632   0.84282   0.81978      2764
weighted avg    0.85411   0.82127   0.82344      2764

Confusion Matrix:
[[1261  429]
 [  65 1009]]


In [None]:
from pathlib import Path

# Save the Doc2Vec model. The target directory is created first, because saving
# into a directory that does not exist fails.
Path("./models").mkdir(parents=True, exist_ok=True)
model.save("./models/doc2vec_model")

# LIKE PREDICTION MODEL

In [None]:
# Dataset for the like-to-view-ratio model: starts as a copy of the combined dataset
lvr_data = combined_data.copy()
# Drop rows where view_count or like_count is 0 or NaN
lvr_data = lvr_data.dropna(subset=['view_count', 'like_count'])
# Keep a row only when BOTH counts are non-zero. The previous `|` kept rows in
# which just one of them was non-zero, e.g. view_count == 0 with likes > 0,
# which makes like_count / view_count infinite in the regression target.
lvr_data = lvr_data[(lvr_data['view_count'] != 0) & (lvr_data['like_count'] != 0)]
lvr_data = lvr_data.reset_index(drop=True)




## DOC2VEC Model

In [None]:
# Features: tokenised transcripts; target: like-to-view ratio
X = lvr_data['tokenized_transcript']
y = lvr_data['like_count'] / lvr_data['view_count']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Doc2Vec model read from ./models/doc2vec_model
model = Doc2Vec.load("./models/doc2vec_model")

# One inferred document vector per transcript, training set first, then test set
X_train_embeddings = list(map(model.infer_vector, X_train))
X_test_embeddings = list(map(model.infer_vector, X_test))




### EXPLORING CLASSIFIERS

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Ordinary least squares on the Doc2Vec embeddings
clf = LinearRegression().fit(X_train_embeddings, y_train)

# Ratios predicted for the held-out set
y_pred = clf.predict(X_test_embeddings)

# Error metrics on the held-out set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Linear Regression using DOC2VEC")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)





In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (default settings) on the Doc2Vec embeddings
ridge_model = Ridge().fit(X_train_embeddings, y_train)

# Engagement ratios predicted for the held-out set
y_pred = ridge_model.predict(X_test_embeddings)

# Error metrics on the held-out set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Ridge Regression using DOC2VEC")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)




## USING TF-IDF

In [None]:
# # TF-IDF
# # Initialize the TF-IDF vectorizer
# lvr_tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# # Fit the vectorizer and transform the transcript data
# lvr_tfidf_matrix = lvr_tfidf_vectorizer.fit_transform(lvr_data['transcript'])

# # Convert the TF-IDF matrix to a DataFrame
# lvr_tfidf_df = pd.DataFrame(lvr_tfidf_matrix.toarray(), columns=lvr_tfidf_vectorizer.get_feature_names_out())

# lvr_tfidf_df['tf_idf_mean'] = lvr_tfidf_df.apply(tf_idf_mean, axis=1)

In [None]:
# Features: preprocessed transcript text; target: like-to-view ratio
X = lvr_data['transcript']
y = lvr_data['like_count'] / lvr_data['view_count']

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# TF-IDF features: the vocabulary and IDF weights are learned on the training
# texts only and then applied unchanged to the test texts.
# max_features can be adjusted as needed.
lvr_tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = lvr_tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = lvr_tfidf_vectorizer.transform(X_test)

### EXPLORING CLASSIFIERS

In [None]:
# Ordinary least squares on the TF-IDF features
clf = LinearRegression().fit(X_train_tfidf, y_train)

# Ratios predicted for the held-out set
y_pred = clf.predict(X_test_tfidf)

# Error metrics on the held-out set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Linear Regression using TF-IDF")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression (default settings) on the TF-IDF features
ridge_model = Ridge().fit(X_train_tfidf, y_train)

# Engagement ratios predicted for the held-out set
y_pred = ridge_model.predict(X_test_tfidf)

# Error metrics on the held-out set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Ridge Regression using TF-IDF")
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)


# TESTING

In [None]:
# Load the new transcript and clean it exactly like the training transcripts
# (preprocess_text, then lower-cased word_tokenize). Tokenising the raw text
# would keep punctuation, contractions and stop words that were removed from
# the training transcripts.
with open("./transcript_to_test.txt", "r") as file:
    new_transcript = preprocess_text(file.read())
tokenized_new_transcript = word_tokenize(new_transcript.lower())

# Load the Doc2Vec model
model = Doc2Vec.load("./models/doc2vec_model")

# Infer Doc2Vec embedding for the new transcript
embedding = model.infer_vector(tokenized_new_transcript)

# Predict the label using the trained SVM classification model
predicted_label = clf_svm.predict([embedding])[0]  

# Map the predicted label back to its original form
predicted_label_original = {1: 'informative', 0: 'non-informative'}.get(predicted_label)

print("Predicted Label:", predicted_label_original)

# Class probabilities exist only when the SVC was fitted with probability=True;
# otherwise predict_proba raises, so report the signed decision score instead.
if getattr(clf_svm, "probability", False):
    y_probabilities = clf_svm.predict_proba([embedding])[0]

    print("Predicted Probabilities for each class for the testing set:")
    print(y_probabilities)
else:
    decision_score = clf_svm.decision_function([embedding])[0]

    print("clf_svm was fitted without probability=True; decision score (positive favours class 1):")
    print(decision_score)

