In [None]:
# @title Creating, Training & Saving BERT to Google Drive
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder

# Make Google Drive available under /content/drive so the workbook and the
# model directory can be read and written.
from google.colab import drive
drive.mount('/content/drive')

# Read the first labelled training segment (adjust the path if the workbook moves).
df = pd.read_excel('/content/drive/MyDrive/AI and Data Science/Segment1.xlsx')

# Map the values of the 'Sentiment_3' column to integer class ids.
# Fitting and transforming are done as two explicit steps on the same column.
label_encoder = LabelEncoder()
label_encoder.fit(df['Sentiment_3'])
df['EncodedLabels'] = label_encoder.transform(df['Sentiment_3'])

# Tokenizer matching the 'bert-base-uncased' checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class
class EmotionDataset(Dataset):
    """Torch dataset that tokenizes one text per item.

    `texts` and `labels` are indexed positionally with `.iloc`, so they are
    expected to be pandas objects (e.g. Series) of equal length.

    Args:
        texts: Texts to classify.
        labels: Integer class ids aligned with `texts`.
        tokenizer: Object exposing `encode_plus` (e.g. a BertTokenizer).
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One sample per text.
        return len(self.texts)

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors."""
        sample_text = self.texts.iloc[index]
        sample_label = self.labels.iloc[index]

        encoded = self.tokenizer.encode_plus(
            sample_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            add_special_tokens=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so that each field is 1-D per sample.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

# Prepare dataset and dataloader using all data (no train/validation split here).
dataset = EmotionDataset(df['Text'], df['EncodedLabels'], tokenizer)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Model: one output unit per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} complete.")

# Save the model and tokenizer to your Google Drive
save_dir = '/content/drive/MyDrive/AI and Data Science/Model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Persist the fitted label encoder next to the model. The prediction cell loads
# 'label_encoder.pkl' from this directory to map class ids back to label names;
# without this file that cell cannot run on a fresh kernel.
import joblib
joblib.dump(label_encoder, '/content/drive/MyDrive/AI and Data Science/Model/label_encoder.pkl')

print(f"Model and tokenizer saved to {save_dir}")

In [None]:
# @title Loading, Training & Saving Pre-Trained BERT to Google Drive
from transformers import BertConfig, BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
import pandas as pd

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load the next training segment and encode its labels.
df = pd.read_excel('/content/drive/MyDrive/AI and Data Science/Segment6.xlsx')
label_encoder = LabelEncoder()
df['EncodedLabels'] = label_encoder.fit_transform(df['Sentiment_3'])

# Load tokenizer and model configuration from the previously saved checkpoint.
model_path = '/content/drive/MyDrive/AI and Data Science/Model'
tokenizer = BertTokenizer.from_pretrained(model_path)

# Compare the label count stored with the checkpoint against this segment's labels.
config = BertConfig.from_pretrained(model_path)
original_num_labels = config.num_labels
current_num_labels = len(label_encoder.classes_)

# Load the model, adjusting num_labels if necessary.
if original_num_labels != current_num_labels:
    config.num_labels = current_num_labels  # Update config to new number of labels
    # Constructing the model from the config alone would start from random
    # weights and throw away all previous fine-tuning. Loading the checkpoint
    # with ignore_mismatched_sizes=True keeps the encoder weights and only
    # re-initialises the classification head whose shape changed.
    model = BertForSequenceClassification.from_pretrained(
        model_path,
        config=config,
        ignore_mismatched_sizes=True,
    )
else:
    # NOTE(review): an equal label *count* does not prove the label *set* (and
    # hence the id mapping) is the same as in the earlier segment - confirm.
    model = BertForSequenceClassification.from_pretrained(model_path, config=config)

# Define Dataset class
class TextDataset(Dataset):
    """Torch dataset that tokenizes one text per item.

    `texts` and `labels` are indexed with plain `[]`, so they should be
    position-indexable sequences such as lists.

    Args:
        texts: Texts to classify.
        labels: Integer class ids aligned with `texts`.
        tokenizer: Object exposing `encode_plus` (e.g. a BertTokenizer).
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors."""
        text = self.texts[index]
        label = self.labels[index]
        # Use the tokenizer handed to the constructor, not whatever global
        # `tokenizer` happens to exist in the notebook at call time.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Prepare data loader
dataset = TextDataset(df['Text'].tolist(), df['EncodedLabels'].tolist(), tokenizer)
loader = DataLoader(dataset, batch_size=16, shuffle=True)

# Setup training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
model.train()
for epoch in range(5):
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} complete.")

# Save the fine-tuned model back to Google Drive
model.save_pretrained('/content/drive/MyDrive/AI and Data Science/Model')
tokenizer.save_pretrained('/content/drive/MyDrive/AI and Data Science/Model')

# The label encoder was refitted in this cell, so store it alongside the model
# it was trained with. The prediction cell loads 'label_encoder.pkl' from this
# directory to turn predicted class ids back into label names.
import joblib
joblib.dump(label_encoder, '/content/drive/MyDrive/AI and Data Science/Model/label_encoder.pkl')

In [None]:
# @title Loading the Validation Dataset
# Read the validation workbook and attach integer class ids in one step.
# This reuses the label encoder already present in the kernel, so it assumes the
# validation labels are among the classes that encoder was fitted on.
validation_path = '/content/drive/MyDrive/AI and Data Science/Validation.xlsx'
val_df = pd.read_excel(validation_path).assign(
    EncodedLabels=lambda frame: label_encoder.transform(frame['Sentiment_3'])
)

# Define the dataset class for the validation dataset
class ValidationDataset(Dataset):
    """Torch dataset that tokenizes one validation text per item.

    `texts` and `labels` are indexed with plain `[]`, so they should be
    position-indexable sequences such as lists.

    Args:
        texts: Texts to classify.
        labels: Integer class ids aligned with `texts`.
        tokenizer: Object exposing `encode_plus` (e.g. a BertTokenizer).
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Return a dict with 'input_ids', 'attention_mask' and 'labels' tensors."""
        text = self.texts[index]
        label = self.labels[index]
        # Use the tokenizer passed to the constructor instead of relying on a
        # global `tokenizer` variable being defined in the notebook.
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Build the validation loader from plain Python lists; order is kept
# (shuffle=False) so predictions line up with the rows of val_df.
val_texts = val_df['Text'].tolist()
val_targets = val_df['EncodedLabels'].tolist()
val_dataset = ValidationDataset(val_texts, val_targets, tokenizer)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

In [None]:
# @title Evaluating the Model and Print Results
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

def evaluate_model(model, data_loader, device):
    """Run the model over `data_loader` and print accuracy plus a per-class report.

    Args:
        model: Classifier whose output exposes `.logits`.
        data_loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'labels' tensors.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        None. Results are printed.

    Note:
        Class names are read from the notebook-level `label_encoder`.
    """
    model.eval()  # Set model to evaluation mode
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            preds = logits.argmax(dim=1)

            # Labels are only compared on the host, so they never need to be
            # moved to the device.
            true_labels.extend(batch['labels'].tolist())
            predicted_labels.extend(preds.tolist())

    # target_names must be strings, one per encoded class id.
    class_names = [str(name) for name in label_encoder.classes_]

    accuracy = accuracy_score(true_labels, predicted_labels)
    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when some class never occurs in this data; otherwise the length
    # mismatch makes classification_report fail.
    report = classification_report(
        true_labels,
        predicted_labels,
        labels=list(range(len(class_names))),
        target_names=class_names,
        zero_division=0,
    )

    print(f'Validation Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)

# Evaluate the current `model` on the validation loader; the function prints
# the accuracy and the classification report.
evaluate_model(model, val_loader, device)

In [None]:
# @title Youtube API & Video IDs
# YouTube Data API credentials.
# SECURITY: the API key must not be stored in the notebook. It is read from the
# YOUTUBE_API_KEY environment variable, falling back to an interactive prompt
# whose input is not echoed. Any key that was ever saved in this notebook
# should be treated as exposed and rotated.
import os
from getpass import getpass

dev = os.environ.get("YOUTUBE_API_KEY") or getpass("YouTube Data API key: ")

# IDs of the YouTube videos whose comments will be fetched; add more as needed.
video_ids = [
    'e3KCOFCI4js',
    '1_qod_2ZIxM',
]

In [None]:
# @title Function to Fetch Comments
import googleapiclient.discovery
import pandas as pd
from datetime import datetime

# Which Google API to talk to, and its version.
api_service_name = "youtube"
api_version = "v3"

# Client used by the comment-fetching function; authenticated with the
# developer key defined earlier in the notebook.
youtube = googleapiclient.discovery.build(
    serviceName=api_service_name,
    version=api_version,
    developerKey=dev,
)


def _published_year(timestamp):
    """Return the four-digit year of a 'publishedAt' timestamp, with or without fractional seconds."""
    for fmt in ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%dT%H:%M:%S.%fZ'):
        try:
            return datetime.strptime(timestamp, fmt).strftime('%Y')
        except ValueError:
            continue
    raise ValueError(f"Unrecognised publishedAt timestamp: {timestamp!r}")


def _comment_row(item):
    """Flatten one commentThreads item into the row layout used by getcomments."""
    comment = item['snippet']['topLevelComment']['snippet']
    return [
        comment['authorDisplayName'],
        _published_year(comment['publishedAt']),  # Using only the year
        comment['likeCount'],
        comment['textOriginal'],
        comment['videoId'],
        item['snippet']['isPublic'],
    ]


def getcomments(video):
    """Fetch every top-level comment of one video.

    Uses the notebook-level `youtube` client and follows `nextPageToken`
    until the API stops returning one.

    Args:
        video: YouTube video ID.

    Returns:
        pandas.DataFrame with columns 'Author', 'Published Date' (year as a
        string), 'Likes', 'Text', 'Video ID' and 'Public'.

    Raises:
        ValueError: If a comment's 'publishedAt' value is in neither of the
            two supported timestamp layouts.
    """
    comments = []
    page_token = None

    while True:
        params = {'part': "snippet", 'videoId': video, 'maxResults': 100}
        if page_token:
            # Only follow-up requests carry a page token.
            params['pageToken'] = page_token

        response = youtube.commentThreads().list(**params).execute()
        comments.extend(_comment_row(item) for item in response.get('items', []))

        page_token = response.get('nextPageToken')
        if not page_token:
            break

    df2 = pd.DataFrame(comments, columns=['Author', 'Published Date', 'Likes', 'Text', 'Video ID', 'Public'])
    return df2

In [None]:
# @title Calling Function to Fetch Comments
# Column layout shared by every per-video frame returned by getcomments.
comment_columns = ['Author', 'Published Date', 'Likes', 'Text', 'Video ID', 'Public']

# Fetch the comments of every video first, then concatenate once. Growing a
# DataFrame with pd.concat inside the loop copies all rows on every iteration,
# and starting from an empty all-object frame can degrade the column dtypes.
comment_frames = [getcomments(video_id) for video_id in video_ids]

if comment_frames:
    df = pd.concat(comment_frames, ignore_index=True)
else:
    # No video IDs configured: keep an empty frame with the expected columns.
    df = pd.DataFrame(columns=comment_columns)

# Displaying the DataFrame Created
df

In [None]:
# @title Saving & Downloading CSV File (Can be used whenever needed)
from google.colab import files

# Write the combined comments to a CSV file in the Colab working directory.
csv_filename = "YouTubeComments.csv"
df.to_csv(csv_filename, index=False)
print(f"Combined Comments saved to: {csv_filename}")

# Trigger a browser download of that file.
files.download(csv_filename)

# Tell the user which file was sent to their computer.
print(f"CSV file has been downloaded to your computer with the filename: {csv_filename}")

In [None]:
# @title Prediction Using Pretrained BERT
# Import necessary libraries
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import joblib

# Make Google Drive available so the saved model directory can be read.
from google.colab import drive
drive.mount('/content/drive')

# Directory holding the fine-tuned model, its tokenizer and the label encoder.
model_path = '/content/drive/MyDrive/AI and Data Science/Model'

# Restore the three artefacts needed for inference.
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
# NOTE(review): joblib.load unpickles the file - only load encoders you saved yourself.
label_encoder = joblib.load('/content/drive/MyDrive/AI and Data Science/Model/label_encoder.pkl')

# Prefer the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def sentiment_analysis(text, model, tokenizer, label_encoder, device, max_length=512):
    """Classify a single text and report the winning class with its probability.

    Args:
        text: Text to classify.
        model: Classifier whose output exposes `.logits`.
        tokenizer: Object exposing `encode_plus`.
        label_encoder: Object exposing `inverse_transform` to map a class id
            back to its label.
        device: Device the encoded tensors are moved to.
        max_length: Length the text is padded / truncated to.

    Returns:
        Tuple `(predicted_label, confidence)` where `confidence` is the softmax
        probability of the predicted class.
    """
    # Tokenize with the same settings used by the dataset classes in this notebook.
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(text, **tokenizer_options)
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Forward pass in evaluation mode without tracking gradients.
    model.eval()
    with torch.no_grad():
        model_output = model(ids, attention_mask=mask)

    # Turn logits into class probabilities and pick the most likely class.
    class_probs = torch.nn.functional.softmax(model_output.logits, dim=1)
    best_index = torch.argmax(class_probs, dim=1).cpu().numpy()[0]
    best_label = label_encoder.inverse_transform([best_index])[0]
    best_prob = class_probs.cpu().numpy()[0][best_index]

    return best_label, best_prob

# `df` is expected to exist already and to contain a 'Text' column.
# If it does not, load it first, for example:
# df = pd.read_excel('/content/drive/MyDrive/AI and Data Science/YourDataFrame.xlsx')

# Classify every text and keep (text, predicted label, confidence) triples.
results = [
    (text, *sentiment_analysis(text, model, tokenizer, label_encoder, device))
    for text in df['Text']
]

# Collect the predictions in a DataFrame.
result_df = pd.DataFrame(results, columns=['Text', 'Predicted Sentiment', 'Confidence'])

# Show the predictions as the cell output.
result_df

In [None]:
# @title Preprocessing
!pip install contractions -q
!pip install autocorrect -q

import pandas as pd
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
from autocorrect import Speller

# Download NLTK resources if not already downloaded
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence tokenizer from 'punkt_tab';
# without it word_tokenize raises a LookupError there. On older releases this
# extra download is harmless.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Lowercasing
df['CText'] = df['Text'].apply(lambda x: x.lower())

# Handling Contractions
df['CText'] = df['CText'].apply(lambda x: contractions.fix(x))

# Removing URLs (the same pattern also strips every character that is not a
# letter or whitespace)
df['CText'] = df['CText'].apply(lambda x: re.sub(r'http\S+|www\S+|[^a-zA-Z\s]', '', x))

# Removing Special Characters and Emojis
df['CText'] = df['CText'].apply(lambda x: re.sub(r'[^\w\s]|_+', '', x))

# Removing Numbers
df['CText'] = df['CText'].apply(lambda x: re.sub(r'\d+', '', x))

# Whitespace Removal
df['CText'] = df['CText'].apply(lambda x: ' '.join(x.split()))

# Tokenization
df['Tokens'] = df['CText'].apply(lambda x: word_tokenize(x))

# Removing Punctuation
df['Tokens'] = df['Tokens'].apply(lambda x: [word for word in x if word not in string.punctuation])

# Removing Stopwords
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Lemmatization
lemmatizer = WordNetLemmatizer()
df['Tokens'] = df['Tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Finalizing text by joining tokens back into a string
df['CText'] = df['Tokens'].apply(lambda x: ' '.join(x))

# Dropping unnecessary columns. errors='ignore' keeps the cell re-runnable:
# on a second run 'Author' and 'Public' are already gone and a plain drop
# would raise a KeyError.
df = df.drop(columns=['Author', 'Public', 'Tokens'], errors='ignore')

# Initialize the spell checker with English language
#spell = Speller(lang='en')

# Apply spell-checking to each word in the Text
#df['CText'] = df['CText'].apply(lambda x: ' '.join([spell(word) for word in x.split()]))

# Set display options to show larger text
pd.set_option('display.max_colwidth', None)

# Display the modified DataFrame
df

In [None]:
# @title TextBlob Function
from textblob import TextBlob

def get_sentiment_score(comment):
    """Label a comment as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    Args:
        comment: Text to score.

    Returns:
        'Positive' for polarity above zero, 'Negative' for polarity below zero,
        'Neutral' otherwise. If scoring raises, the error is printed and
        'Error' is returned instead.
    """
    try:
        polarity = TextBlob(comment).sentiment.polarity
        # Sign of the polarity decides the category.
        return 'Positive' if polarity > 0 else ('Negative' if polarity < 0 else 'Neutral')
    except Exception as e:
        # Best effort: report the problem and mark the row rather than abort.
        print(f"Error processing comment: {e}")
        return 'Error'

In [None]:
# @title TB Sentiment Analysis
# Label every cleaned comment with its TextBlob sentiment category and store
# the result in the 'TBSentiment2' column.
df['TBSentiment2'] = df['CText'].map(get_sentiment_score)

df

In [None]:
# @title TB Visualization of Sentiment Analysis
import matplotlib.pyplot as plt

# Number of comments per TextBlob sentiment category.
sentiment_counts = df['TBSentiment2'].value_counts()

# Draw the share of each category as a pie chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=100)
ax.set_title('Sentiment Distribution', fontsize=20)
ax.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle.
plt.show()

In [None]:
# @title TB Sentiment Analysis with Like Counts
# Add the 'TBSentiment' column: TextBlob category of each cleaned comment.
df['TBSentiment'] = df['CText'].apply(get_sentiment_score)

# Sum the like counts per sentiment category in one grouped pass instead of
# iterating over the rows. Like counts that are missing or not numeric are
# counted as 0 so a single bad value cannot break the totals.
like_counts = pd.to_numeric(df['Likes'], errors='coerce').fillna(0)
likes_by_sentiment = like_counts.groupby(df['TBSentiment']).sum()

# Categories that do not occur (and the 'Error' category) contribute nothing.
positive_likes = int(likes_by_sentiment.get('Positive', 0))
negative_likes = int(likes_by_sentiment.get('Negative', 0))
neutral_likes = int(likes_by_sentiment.get('Neutral', 0))

# Print the total like counts for each sentiment category
print("Total Positive, Negative and Neutral Sentiments (Including Likes on the Comments)")
print("-"*80)
print("\n")
print(" "*25,f"Positive: {positive_likes}")
print(" "*25,f"Negative: {negative_likes}")
print(" "*25,f"Neutral: {neutral_likes}")
print("\n")
df

In [None]:
# @title TB Sentiment Visualisation with Likes
import matplotlib.pyplot as plt

# Like totals per category and the matching slice labels.
counts = [positive_likes, negative_likes, neutral_likes]
labels = ['Positive', 'Negative', 'Neutral']

# Draw the pie chart on an explicit 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
patches, texts, autotexts = ax.pie(counts, labels=labels, autopct='%1.2f%%', startangle=100)

# Prefix each percentage label with the absolute like count of its slice.
for count, autotext in zip(counts, autotexts):
    autotext.set_text(f"{count}\n{autotext.get_text()}")

ax.set_title('Sentiment Distribution', fontsize=20)

# Equal aspect ratio so the pie is drawn as a circle.
ax.axis('equal')

plt.show()

In [None]:
# @title Lexicon Sentiment Analysis
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Analyzer shared by all rows.
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER 'compound' score of one text."""
    return sia.polarity_scores(text)['compound']


def _sentiment_from_score(score):
    """Map a compound score to 'Positive' (> 0), 'Neutral' (== 0) or 'Negative' (anything else)."""
    if score > 0:
        return 'Positive'
    if score == 0:
        return 'Neutral'
    return 'Negative'


# Score each cleaned comment, then turn the score into a category.
df['LIntensity'] = df['CText'].apply(_compound_score)
df['LSentiment'] = df['LIntensity'].apply(_sentiment_from_score)

df

In [None]:
# @title Lexicon Results Visualisation
# Number of comments per lexicon-based sentiment category.
sentiment_counts = df['LSentiment'].value_counts()

# Draw the share of each category as a pie chart on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=100)
ax.set_title('Sentiment Distribution', fontsize=20)
ax.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle.
plt.show()