In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import pandas as pd
import ast
import re
import nltk
from nltk.corpus import stopwords

In [None]:
# Read the raw comment export into a DataFrame
COMMENTS_CSV = 'kurzgesagt_comments_original.csv'
comments_df = pd.read_csv(COMMENTS_CSV)

In [None]:
import torch
# Display whether PyTorch reports a usable CUDA device (shown as the cell output)
torch.cuda.is_available()

**Here we're trying to clean and tokenize the data, then get our sentiment scores for each video**

Train
____

*Getting the tokenizer*

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
import torch

# Pretrained sentiment checkpoint; the classifier and its tokenizer are both
# fetched under the same checkpoint name.
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenize one sample comment into PyTorch tensors (truncated to 512 tokens)
inputs = tokenizer(
    "I love this!",
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)

In [None]:
# Get predictions
# Inference only: run the forward pass under no_grad so no autograd graph is
# built (same pattern the predict() helper at the end of the notebook uses).
with torch.no_grad():
    outputs = model(**inputs)
# Softmax over the last dimension of the logits turns them into probabilities.
predictions = softmax(outputs.logits, dim=-1)

In [None]:
# Show the softmax output computed in the previous cell
predictions

Clean the comments of punctuation and links

In [None]:
import ast

# Boilerplate texts (sponsor reads, shop / Discord / calendar promos) that
# should be dropped from every video's comment list when they appear verbatim.
remove_string = ["offset carbon footprint wren \u200b first 200 people sign kurzgesagt pay first month subscription video sponsored wren thanks lot support",
                "'✨ WORLDWIDE SHIPPING AVAILABLE ✨ The 12,024 Human Era Calendar has landed! https://shop.kgs.link/12-024\nJoin us on an exploration of how different cosmic conditions could shape unique worlds and civilizations.\nStocks are highly limited, so don’t miss your chance to own a truly special piece of kurzgesagt.'",
                "':sparkles: WORLDWIDE SHIPPING AVAILABLE :sparkles: The 12,024 Human Era Calendar has landed! Join us on an exploration of how different cosmic conditions could shape unique worlds and civilizations. Stocks are highly limited, so don’t miss your chance to own a truly special piece of kurzgesagt.'",
                "Join us over on Discord to discuss and share your thoughts:",
                "Go ‘beyond the nutshell’ at and dive deeper into these topics and more with a free 30-day trial! This video was sponsored by Brilliant. Thanks a lot for the support!",
                "You want to learn more about science? Check out our sciency products on the kurzgesagt shop – all designed with love and produced with care. Getting something from the kurzgesagt shop is the best way to support us and to keep our videos free for everyone. ►► (Worldwide Shipping Available)",
                "Head over to our shop to get exclusive kurzgesagt merch and sciency products designed with love. Getting something from the kurzgesagt shop is the best way to support us and to keep our videos free for everyone. ►► (Worldwide Shipping Available)",]

# Loop through all rows in the dataframe
for index, row in comments_df.iterrows():
    comment_str = row['comments']
    try:
        # Deserialize the stringified list of comments for the current row
        comment_list = ast.literal_eval(comment_str)
    except (ValueError, SyntaxError, TypeError):
        # Not a parseable Python literal (e.g. NaN or malformed text): leave the row as is
        continue
    if not isinstance(comment_list, list):
        # Only list-valued cells are filtered; anything else is left untouched
        continue

    # Build a new list without the boilerplate entries. The earlier version
    # reused `remove_string` as its loop variable and removed items from the
    # list it was iterating, which deleted every other comment rather than the
    # unwanted texts.
    kept_comments = [entry for entry in comment_list if entry not in remove_string]

    # Serialize the list back and update the dataframe
    comments_df.at[index, 'comments'] = str(kept_comments)

In [None]:
# Clean the text data: URLs and most punctuation are removed, emojis are kept
import re

# Function to clean the text data
# Compiled once instead of on every call.
_URL_PATTERN = re.compile(r'http\S+')
# Matches every character that is NOT a word character, whitespace, single or
# double quote, comma, or inside one of the listed emoji/symbol code-point ranges.
_DISALLOWED_CHARS_PATTERN = re.compile(r'[^\w\s\'\"\,\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]')


def clean_text(text):
    """Normalize one piece of comment text for sentiment scoring.

    Steps, in order: strip ``http...`` URLs, replace newlines with spaces,
    lowercase, then drop every character that is not a word character,
    whitespace, quote, comma, or in the emoji/symbol ranges of the pattern above.

    Args:
        text: The text to clean. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string (``''`` for missing input).
    """
    # Missing values would otherwise make re.sub raise TypeError mid-.apply()
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove URLs
    text = _URL_PATTERN.sub('', text)
    # Remove newlines
    text = text.replace('\n', ' ')
    text = text.lower()
    # Remove punctuation (apostrophes, double quotes, commas and emojis are kept)
    return _DISALLOWED_CHARS_PATTERN.sub('', text)


# Run the cleaner over every row's raw comment text
raw_comments = comments_df['comments']
comments_df['cleaned_comments'] = raw_comments.apply(clean_text)

# Preview raw and cleaned text side by side
comments_df[['comments', 'cleaned_comments']].head()


In [None]:
# Function to get sentiment scores as a list
def get_sentiment_score(comment):
    """Return the sentiment model's softmax probabilities for ``comment``.

    Uses the notebook-level ``tokenizer`` and ``model``. The input is
    truncated to at most 512 tokens.

    Args:
        comment: Text to score (whatever the tokenizer accepts).

    Returns:
        A list of floats: the softmax of the logits, averaged over dim 0.
    """
    inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: without no_grad every call builds an autograd graph,
    # which wastes memory and time when this is applied to a whole column.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = softmax(outputs.logits, dim=-1)
    # dim 0 of the logits is the batch axis, not tokens: for a single input
    # string this mean just drops that axis.
    return prediction.mean(dim=0).tolist()

# Column names for the three scores, in the order the score list is indexed below
sentiment_columns = ['negative', 'neutral', 'positive']  # Adjust these names based on the model's sentiment labels

# Score every cleaned comment; each cell temporarily holds the list of scores
comments_df['sentiment_scores'] = comments_df['cleaned_comments'].apply(get_sentiment_score)

# Fan the score lists out into one column per sentiment
for position, column_name in enumerate(sentiment_columns):
    comments_df[column_name] = comments_df['sentiment_scores'].apply(
        lambda scores, position=position: scores[position]
    )

# The temporary list column is no longer needed
del comments_df['sentiment_scores']

# Mean of each sentiment score per video_id
video_sentiment_scores = comments_df.groupby('video_id')[sentiment_columns].mean()

# Function to determine the label based on the highest mean score
def get_video_sentiment_label(row):
    """Return 'Positive', 'Neutral' or 'Negative' for the largest score in ``row``.

    ``row`` must support lookup by the keys 'negative', 'neutral' and
    'positive'. On ties, 'Positive' is preferred over 'Neutral', which is
    preferred over 'Negative'.
    """
    top_score = max(row['negative'], row['neutral'], row['positive'])
    # Checked in priority order; the first column equal to the maximum wins
    for column, label in (('positive', 'Positive'), ('neutral', 'Neutral')):
        if row[column] == top_score:
            return label
    # Neither of the above matched, so 'negative' holds the highest score
    return 'Negative'

# Label each video from its mean scores (one call per row)
row_labels = video_sentiment_scores.apply(get_video_sentiment_label, axis=1)
video_sentiment_scores['label'] = row_labels

# Turn the video_id index back into a regular column
video_sentiment_df = video_sentiment_scores.reset_index()

# One row per video: mean scores plus the overall label
print(video_sentiment_df)


End
___

Single Comment Test
___

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax

# Load the model and tokenizer
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Example of tokenizing a single comment
comment = "'This theory was by far the most ideal and believable theory in the universe. I enjoyed this video very much. \n\nThank you for bringing extensive and complex theories to us in easier and understandable ways. You are one of the channels I adore.\nKeep up with the fantastic videos, and I truly appreciate the commitment of your team. ❤'"
inputs = tokenizer(comment, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Get predictions. This is inference only, so skip autograd bookkeeping
# (same pattern as predict() at the end of the notebook).
with torch.no_grad():
    outputs = model(**inputs)

# Apply softmax to get probabilities
probabilities = softmax(outputs.logits, dim=1)

# Get the highest probability index
predicted_class_index = torch.argmax(probabilities, dim=1).item()

# Convert index to label using the mapping stored in the model config
predicted_label = model.config.id2label[predicted_class_index]

# Print the results
print(f"Comment: '{comment}'")
print(f"Predicted probabilities: {probabilities.tolist()[0]}")
print(f"Predicted label: {predicted_label}")


End
_____

Visualize the results
___

In [None]:
import matplotlib.pyplot as plt

# Number of videos per sentiment label
sentiment_counts = video_sentiment_df['label'].value_counts()

# Pie chart of the label shares, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Sentiment Distribution Across All Videos')
plt.show()

In [None]:
# Peek at the first four rows of the comments table
comments_df.head(4)

**Model training**

Importing the files for RoBERTa model training

In [None]:
# Write the per-video scores and labels to CSV (row index omitted)
video_sentiment_df.to_csv('kurzgesagt_video_sentiment_labels.csv', index=False)

In [None]:
# Write the comments table in its current state to CSV (row index omitted)
comments_df.to_csv('kurzgesagt_comments_cleaned.csv', index=False)

In [None]:
# Read the saved comments CSV back into its own DataFrame
cleaned_comments_df = pd.read_csv('kurzgesagt_comments_cleaned.csv')

In [None]:
# Reload the per-video sentiment labels from CSV (rebinds video_sentiment_df)
video_sentiment_df = pd.read_csv('kurzgesagt_video_sentiment_labels.csv')

End
___

In [None]:
# Attach the per-video rows to each comment row via video_id (inner join).
# Columns present in both frames get pandas' default `_x` / `_y` suffixes.
training_sentiment = cleaned_comments_df.merge(video_sentiment_df, how='inner', on='video_id')

In [None]:
# Drop the `_y` score columns, i.e. the copies that came from the right-hand
# (per-video) frame of the merge. errors='ignore' keeps this cell re-runnable:
# a second run no longer raises KeyError because the columns are already gone.
training_sentiment = training_sentiment.drop(
    columns=['negative_y', 'neutral_y', 'positive_y'],
    errors='ignore',
)

In [None]:
# List of video IDs you want to focus on
selected_video_ids = ['4_aOIA-vyBo', 'MUWUHf-rzks', 'NtQkz0aRDe8', 'EhAemz1v7dQ', 'PKMQzkIiB0Y', 'GDSf2h9_39I',
'GoJsr4IwCm4', 'GqA42M4RtxE', 'Hug0rfFC_L8', 'IayvE_jFgrc', 'J0ldO87Pprc', 'JQVmkDUkZT4', 'Kr57ax0OWMk', 'LNv4y3wPQA0',
'LBudghsdByQ', 'yiw6_JakZFc', 'ouAccsTzlGU', 'n3Xv_g3g-mA']

# Filter the DataFrame to only include comments from those videos.
# .copy() makes the result an independent frame: a later cell assigns into its
# 'label' column, which on a bare boolean-mask slice triggers pandas'
# SettingWithCopyWarning.
is_selected_video = training_sentiment['video_id'].isin(selected_video_ids)
filtered_comments_df = training_sentiment[is_selected_video].copy()

# Proceed to split this filtered DataFrame into training, validation, and test sets as before


In [None]:
# Integer class id assigned to each sentiment label string
label_dict = {"Negative": 0, "Neutral": 1, "Positive": 2}

In [None]:
from sklearn.model_selection import train_test_split

# Map string labels to integer ids.
# The frame is rebuilt with .assign() instead of writing through .loc on what
# may be a slice of `training_sentiment` (SettingWithCopyWarning). Values that
# are not keys of label_dict (e.g. already-mapped ints on a re-run) pass
# through unchanged, and the explicit int64 cast fails loudly if any
# non-integer label is left over.
filtered_comments_df = filtered_comments_df.assign(
    label=filtered_comments_df['label']
    .map(lambda value: label_dict.get(value, value))
    .astype('int64')
)

# 70% train / 30% held out, stratified so label proportions are preserved
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    filtered_comments_df['cleaned_comments'], filtered_comments_df['label'],
    test_size=0.3, random_state=42, stratify=filtered_comments_df['label']
)

# Split the held-out part in half into validation and test sets; stratify
# again so the label distribution is similar in all sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=0.5, random_state=42, stratify=temp_labels
)


In [None]:
# Number of rows per label value in the filtered frame
print(filtered_comments_df['label'].value_counts())


In [None]:
# Show the label-string -> integer-id mapping
print(label_dict)

In [None]:
# Distinct label values currently present in the filtered frame
print(filtered_comments_df['label'].unique()) 

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

class YouTubeCommentsDataset(Dataset):
    """Torch ``Dataset`` that tokenizes one comment per item on access.

    Args:
        comments: Iterable of comment texts.
        labels: Iterable of integer class ids, aligned with ``comments``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            returns a mapping with "input_ids" and "attention_mask" tensors.
        max_token_len: Every item is padded/truncated to this many tokens.

    Raises:
        ValueError: If ``comments`` and ``labels`` differ in length.
    """

    def __init__(self, comments, labels, tokenizer, max_token_len=512):
        # Materialize as plain lists so __getitem__ always indexes by
        # position. A pandas Series with a shuffled index (e.g. straight out
        # of train_test_split) would otherwise be looked up by index label.
        self.comments = list(comments)
        self.labels = list(labels)
        if len(self.comments) != len(self.labels):
            raise ValueError(
                f"comments and labels differ in length: {len(self.comments)} != {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        """Number of comments in the dataset."""
        return len(self.comments)

    def __getitem__(self, item_index):
        """Tokenize the comment at ``item_index`` and return model-ready tensors."""
        comment = self.comments[item_index]
        label = self.labels[item_index]
        # Call the tokenizer directly; `encode_plus` is documented as
        # deprecated in favour of `__call__` and takes the same arguments.
        encoding = self.tokenizer(
            comment,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # flatten() collapses the tokenizer's output to 1-D per item
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }
    



In [None]:
# Sanity check: every label value must be an int before building the datasets
labels_are_ints = filtered_comments_df['label'].apply(lambda x: isinstance(x, int))
assert labels_are_ints.all(), "Not all labels are integers."


In [None]:
# Example mapping: adjust according to your data


# Splitting the dataset
from sklearn.model_selection import train_test_split



# Tokenizer for the checkpoint that gets fine-tuned below
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# One dataset per split; each Series is converted to a plain list first
train_dataset, val_dataset, test_dataset = (
    YouTubeCommentsDataset(split_texts.tolist(), split_labels.tolist(), tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Batches of 8 for every split; only the training loader shuffles
loader_batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=loader_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)


Step 3: Model Fine-Tuning

In [None]:
from transformers import AutoModelForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the pretrained sentiment checkpoint, with one output per entry
# of label_dict, and place it on the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
    num_labels=len(label_dict),
)
model = model.to(device)

# NOTE(review): transformers' AdamW is deprecated in favour of
# torch.optim.AdamW (which has no `correct_bias` argument) — confirm the
# installed transformers version still exports it.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# Linear schedule without warmup, spanning every optimizer step of the run
epochs = 10  # Define the number of epochs here
total_steps = epochs * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

def evaluate(model, val_loader):
    """Return the accuracy of ``model`` over all batches of ``val_loader``.

    Each batch must be a mapping with 'input_ids' and 'labels' tensors; an
    'attention_mask' tensor is used when present. The model is switched to
    eval mode for the pass and back to train mode before returning. Tensors
    are moved to the notebook-level ``device``.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        val_loader: Iterable of batches as described above.

    Returns:
        Accuracy as computed by ``accuracy_score``.
    """
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            # Pass the attention mask so padding positions are ignored, as in
            # the training and test loops (which call model(**batch)). The
            # earlier version fed input_ids only, so validation attended to
            # the padding tokens.
            attention_mask = batch.get('attention_mask')
            if attention_mask is not None:
                attention_mask = attention_mask.to(device)
            outputs = model(inputs, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    model.train()
    return accuracy_score(val_labels, val_preds)

best_val_acc = 0
early_stopping_counter = 0
early_stopping_limit = 4  # Stop once this many consecutive epochs bring no improvement
model_save_path = './model_save'  # Directory that receives the best checkpoint

# Training loop with per-epoch validation, best-checkpoint saving and early stopping
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Move every tensor of the batch onto the training device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Clear gradients from the previous step, then forward/backward
        model.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        # Parameter update followed by the learning-rate schedule step
        optimizer.step()
        scheduler.step()

    # Mean loss over the epoch's batches
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs} | Average Training Loss: {avg_train_loss}')

    # Validation accuracy decides whether this epoch becomes the new best
    val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}, Validation Accuracy: {val_acc}")

    improved = val_acc > best_val_acc
    if improved:
        best_val_acc = val_acc
        early_stopping_counter = 0
        # Persist the model (weights + config) ...
        model.save_pretrained(model_save_path)
        # ... and the optimizer/scheduler state next to it
        training_state = {
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
        }
        torch.save(training_state, f'{model_save_path}/optimizer_scheduler_states.bin')
    else:
        early_stopping_counter += 1

    if early_stopping_counter >= early_stopping_limit:
        print("Early stopping triggered")
        break


In [None]:
# Debug check: compare the label ids seen during evaluation with label_dict.
# `true_labels` and the list-valued `predictions` are only created by the
# Step 4 evaluation cell further down. On a fresh top-to-bottom run they do not
# exist yet (`predictions` still holds the tensor from the demo cell near the
# top), so report that instead of raising NameError.
_eval_true_labels = globals().get('true_labels')
_eval_predictions = globals().get('predictions')
if isinstance(_eval_true_labels, list) and isinstance(_eval_predictions, list):
    print(set(_eval_true_labels))
    print(set(_eval_predictions))
else:
    print("true_labels / predictions not available yet - run the Step 4 evaluation cell first.")
print(label_dict.keys())

In [None]:
# Print the label tensor of every batch in the test loader
for test_batch in test_loader:
    print(test_batch['labels'])


Step 4: Evaluation

In [None]:
from sklearn.metrics import classification_report

# Switch off dropout etc. for evaluation
model.eval()
predictions, true_labels = [], []

for batch in test_loader:
    # Move every tensor of the batch onto the evaluation device
    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    # Predicted class id = index of the largest logit
    predictions.extend(torch.argmax(logits, dim=-1).tolist())
    true_labels.extend(batch['labels'].tolist())

# Pass `labels` explicitly so target_names always lines up with the label ids.
# Without it the report infers the classes from the data and raises when a
# class is missing from both the test labels and the predictions.
# zero_division=0 reports 0 for such a class instead of warning.
print(classification_report(
    true_labels,
    predictions,
    labels=list(label_dict.values()),
    target_names=list(label_dict.keys()),
    zero_division=0,
))


Saving Model and model embeddings

In [None]:
from transformers import AutoModelForSequenceClassification

# Reload the checkpoint written by the training loop
model_load_path = './model_save'
model = AutoModelForSequenceClassification.from_pretrained(model_load_path)

# map_location lets a state file saved on a GPU machine load on a CPU-only
# one (and vice versa) by remapping tensors onto the current device.
optimizer_state = torch.load(
    f'{model_load_path}/optimizer_scheduler_states.bin',
    map_location=device,
)
optimizer.load_state_dict(optimizer_state['optimizer_state_dict'])
scheduler.load_state_dict(optimizer_state['scheduler_state_dict'])
# NOTE(review): `optimizer` was constructed from the previous model object's
# parameters and is not re-created here; rebuild it from the reloaded model
# before resuming training.

# Make sure to move the model to the correct device again
model.to(device)

Load our model

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer
# It is fetched by the original checkpoint name, not read from ./model_save.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

def predict(text, model, tokenizer):
    """Classify ``text`` with ``model`` and return the predicted class index.

    The text is tokenized (truncated to 512 tokens), moved to the
    notebook-level ``device``, and run through the model in eval mode without
    gradient tracking.

    Args:
        text: The text to classify.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The index of the largest logit, as a Python int.
    """
    model.eval()
    encoded = tokenizer(text, return_tensors="pt", max_length=512, padding=True, truncation=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
    return torch.argmax(logits, dim=-1).item()

# Try the helper on one sample sentence
text = "i'll check out those links though"
prediction = predict(text=text, model=model, tokenizer=tokenizer)
print(f'Predicted class index: {prediction}')
# The index can be turned into a label string via 'label_dict' if needed
