**Preprocessing with the dataset reduced to 10% of its real size, and with 10% of the emotion words found being masked. This preprocessing is done only to run the grid search on the model and find the best learning rate. With the reduced dataset, more epochs can be trained without using too many resources.**

In [None]:
!pip install nltk gdown datasets transformers torch
import nltk
from nltk.corpus import wordnet as wn
import gdown
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
)
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm import tqdm
import re
import random
import os

# Download the NRC emotion lexicon from Google Drive to a local file
# so that get_emotion_words can parse it.
nrc_url = 'https://drive.google.com/uc?id=1WkxJrL5ECa4NLFnXlfVmFyP6VlVnrv5K'
nrc_output = 'nrc_emotion_lexicon.txt'
gdown.download(nrc_url, nrc_output, quiet=False)

# Function used for extracting emotion words
def get_emotion_words(file_path):
    """Collect the lexicon words that carry at least one emotion flag.

    Every line is split on tabs: the first field is the word and each field
    from the third onward is read as an integer flag. A word is kept as soon
    as one of its flags equals 1. Blank lines and rows whose flags cannot be
    parsed as integers (for example a header row) are skipped.

    Args:
        file_path: Path to the tab-separated lexicon file.

    Returns:
        Alphabetically sorted list of the distinct flagged words, so that
        repeated runs see the words in the same order.
    """
    emotion_words = set()
    # Explicit encoding keeps parsing independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            word, emotion_values = parts[0], parts[2:]
            try:
                flagged = any(int(value) == 1 for value in emotion_values)
            except ValueError:
                # Non-numeric flag column: not a data row.
                continue
            if flagged:
                emotion_words.add(word)
    return sorted(emotion_words)


emotion_words_list = get_emotion_words(nrc_output)  # Getting the list of emotion words


print("Sample emotion words:", emotion_words_list[:10]) # The first 10 words for debugging
print(f"Total number of emotion words: {len(emotion_words_list)}")

# Fetch the WordNet corpora; expand_emotion_words queries WordNet through `wn`
nltk.download('wordnet')
nltk.download('omw-1.4')

# Expanding the emotion words list
def expand_emotion_words(words):
    """Grow a word list with WordNet lemmas and their derivational relatives.

    For each input word, every synset WordNet returns contributes the names
    of its lemmas and of each lemma's derivationally related forms.
    Underscores in WordNet names are replaced by spaces.

    Args:
        words: Iterable of seed words.

    Returns:
        Sorted list of the unique entries, seed words included.
    """
    def readable(lemma):
        # WordNet joins multi-word names with underscores.
        return lemma.name().replace('_', ' ')

    collected = set()
    for term in words:
        collected.add(term)
        lemmas = [lemma for synset in wn.synsets(term) for lemma in synset.lemmas()]
        for lemma in lemmas:
            collected.add(readable(lemma))
            collected.update(
                readable(relative)
                for relative in lemma.derivationally_related_forms()
            )

    return sorted(collected)

# Generate the expanded list (lexicon words plus their WordNet lemmas and
# derivationally related forms)
expanded_emotion_words_list = expand_emotion_words(emotion_words_list)
print("Expanded Emotion Words Sample:", expanded_emotion_words_list[:20])
print(f"Total number of expanded words: {len(expanded_emotion_words_list)}")


dataset = load_dataset("go_emotions", "simplified")  # Load GoEmotions ("simplified" configuration)

# Reducing the dataset's size
def reduce_dataset(dataset, fraction=0.9, seed=None):
    """Return a random subset holding ``fraction`` of the rows of ``dataset``.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        fraction: Share of the rows to keep, between 0 and 1 inclusive. The
            row count is rounded down.
        seed: Optional integer seed. The same seed always selects the same
            rows; ``None`` draws from torch's global random state.

    Returns:
        Whatever ``dataset.select`` returns for the sampled indices.

    Raises:
        ValueError: If ``fraction`` lies outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be within [0, 1], got {fraction}")

    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    total = len(dataset)
    keep = int(total * fraction)
    indices = torch.randperm(total, generator=generator)[:keep]
    # Hand plain Python ints to select() rather than tensor elements.
    return dataset.select(indices.tolist())

# Keep 10% of the training split only; validation and test stay at full size
dataset["train"] = reduce_dataset(dataset["train"], fraction=0.1)
import random
# Module-level name read by mask_emotion_words
emotion_words = expanded_emotion_words_list


# Preprocess the text to mask emotion-related words with 10% probability

def _emotion_lookup(vocabulary):
    """Return ``vocabulary`` as a set, reusing the last conversion if unchanged."""
    if isinstance(vocabulary, (set, frozenset)):
        return vocabulary
    cached = getattr(_emotion_lookup, "_cached", None)
    # Rebuild when a different collection is passed or its length changed.
    if cached is None or cached[0] is not vocabulary or cached[1] != len(vocabulary):
        cached = (vocabulary, len(vocabulary), frozenset(vocabulary))
        _emotion_lookup._cached = cached
    return cached[2]


def mask_emotion_words(example, mask_ratio=0.10, vocabulary=None, mask_token="[MASK]"):
    """Randomly replace emotion words in ``example["text"]`` with a mask token.

    The text is split on whitespace and re-joined with single spaces. A token
    is a candidate when its lower-cased form is in the vocabulary; each
    candidate is replaced independently with probability ``mask_ratio``.
    Tokens with attached punctuation (e.g. ``"happy,"``) are compared as-is
    and therefore only match if the vocabulary holds that exact string.

    Args:
        example: Mapping with a ``"text"`` string; it is modified in place.
        mask_ratio: Probability of masking each candidate token.
        vocabulary: Collection of emotion words. Defaults to the module-level
            ``emotion_words``. Lists are converted to a set once and cached so
            that membership tests do not scan the whole list for every token.
        mask_token: Replacement string, e.g. the tokenizer's own mask token.

    Returns:
        The same ``example`` object with its ``"text"`` field rewritten.
    """
    lookup = _emotion_lookup(emotion_words if vocabulary is None else vocabulary)

    words = example["text"].split()
    for i, word in enumerate(words):
        # The random draw happens only for vocabulary hits, as before.
        if word.lower() in lookup and random.random() < mask_ratio:
            words[i] = mask_token

    example["text"] = ' '.join(words)
    return example


# NOTE(review): map runs on every split of `dataset`, so validation and test
# text is masked as well, and no random seed is set — confirm this is intended.
dataset = dataset.map(mask_emotion_words) # Apply the function to the dataset






Downloading...
From: https://drive.google.com/uc?id=1WkxJrL5ECa4NLFnXlfVmFyP6VlVnrv5K
To: /content/nrc_emotion_lexicon.txt
100%|██████████| 2.72M/2.72M [00:00<00:00, 92.0MB/s]


Sample emotion words: ['rabble', 'covet', 'doll', 'ratify', 'tiling', 'gasp', 'peace', 'crushed', 'neglecting', 'needle']
Total number of emotion words: 6453


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Expanded Emotion Words Sample: ['1', '13th', '144', 'A-one', 'ACE', 'ALT', 'AS', 'Adam', 'Adam Smith', 'Advent', 'Aesculapius', 'Aga', 'Age of Reason', 'Agha', 'Agriculture', 'Agriculture Department', 'Al Gore', 'Albert Gore Jr.', 'Alexander Pope', 'Allied']
Total number of expanded words: 26359


Map:   0%|          | 0/4341 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

**The pre-trained model roberta-base is fine-tuned here on the GoEmotions dataset. Using the reduced dataset, we applied a grid search to find the best learning rate for this model.**

In [None]:
# Initializing Roberta tokenizer
# NOTE(review): the masking step writes the literal "[MASK]" into the text;
# presumably RoBERTa expects tokenizer.mask_token instead — confirm.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the dataset
def tokenize_function(example):
    """Encode the ``text`` field with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Tokenize `dataset`; batched=True hands tokenize_function a batch of examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Keep a single label per example
def format_labels(example):
    """Collapse a list-valued ``labels`` field to its first entry.

    Values that are not lists are left untouched. The example is updated in
    place and returned.
    """
    current = example["labels"]
    if isinstance(current, list):
        current = current[0]
    example["labels"] = current
    return example

tokenized_datasets = tokenized_datasets.map(format_labels)

# Format dataset for PyTorch
data_collator = DataCollatorWithPadding(tokenizer)
columns = ["input_ids", "attention_mask", "labels"]
# Expose only the model inputs and the labels, as torch tensors
tokenized_datasets.set_format(type="torch", columns=columns)

# Only the training loader shuffles; validation and test keep dataset order
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16, collate_fn=data_collator)

# Train and evaluate with grid search for learning rates
def train_and_evaluate_model(learning_rates=(4e-5, 3e-5, 2e-5), num_epochs=2):
    """Grid-search the learning rate for roberta-base and return the best model.

    For every learning rate a fresh ``roberta-base`` classifier with 28 labels
    is trained for ``num_epochs`` epochs on ``train_dataloader``. After each
    epoch the mean cross-entropy loss over ``val_dataloader`` is computed, and
    the weights are written to ``best_model_lr_<lr>.pt`` whenever that loss
    beats the best value seen so far across all learning rates.

    Relies on the module-level ``train_dataloader``, ``val_dataloader`` and
    ``device``.

    Args:
        learning_rates: Iterable of learning rates to try, in order.
        num_epochs: Passes over the training data per learning rate. The
            linear schedule is sized from this same value, so the learning
            rate reaches zero only at the end of the last epoch.

    Returns:
        A model on ``device`` loaded with the best checkpoint's weights.

    Raises:
        ValueError: If ``learning_rates`` is empty or ``num_epochs`` < 1.
        RuntimeError: If no validation loss could be recorded as the best
            one (for example when every loss is NaN).
    """
    learning_rates = list(learning_rates)
    if not learning_rates:
        raise ValueError("learning_rates must contain at least one value")
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    model_name = "roberta-base"
    best_model_path = None
    best_val_loss = float("inf")
    best_lr = None
    best_epoch = None
    loss_fn = torch.nn.CrossEntropyLoss()

    for lr in learning_rates:
        print(f"\nTraining with learning rate: {lr}")
        model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=28)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=lr)
        # The schedule must cover every epoch that is run; sizing it for fewer
        # epochs leaves the remaining ones training with a learning rate of 0.
        num_training_steps = len(train_dataloader) * num_epochs
        lr_scheduler = get_scheduler(
            "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                loss = loss_fn(outputs.logits, batch["labels"])
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

            # Save the model if validation loss improves
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                    loss = loss_fn(outputs.logits, batch["labels"])
                    val_loss += loss.item()
            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Validation Loss: {avg_val_loss}")

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_model_path = f"best_model_lr_{lr}.pt"
                best_lr = lr
                best_epoch = epoch + 1
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved best model with validation loss: {best_val_loss}")

    if best_model_path is None:
        raise RuntimeError("no checkpoint was saved; validation loss never improved")

    # Load the best model for evaluation
    print(f"\nBest learning rate: {best_lr}")
    print(f"Best epoch: {best_epoch}")
    print(f"Loading best model from {best_model_path}")
    best_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=28)
    # The checkpoint is a plain state dict: load tensors only, onto the active device.
    best_model.load_state_dict(
        torch.load(best_model_path, map_location=device, weights_only=True)
    )
    best_model.to(device)

    return best_model

# Training and evaluation
# `device` is read as a global by train_and_evaluate_model and evaluate_model,
# so it has to be bound before either of them is called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = train_and_evaluate_model()

# Evaluation
def evaluate_model(model, dataloader):
    """Print a per-class classification report for ``model`` on ``dataloader``.

    Predictions are the arg-max over the logits of each batch. Class names
    are read from the ``labels`` feature of the module-level ``dataset``;
    the module-level ``device`` decides where batches are moved.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        dataloader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
    """
    model.eval()
    y_true = []
    y_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            y_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            y_true.extend(batch["labels"].cpu().tolist())

    label_names = dataset["train"].features["labels"].feature.names
    report = classification_report(
        y_true,
        y_preds,
        # Pin the label ids so the names stay aligned even when a class is
        # missing from both the references and the predictions.
        labels=list(range(len(label_names))),
        target_names=label_names,
        # Report 0 for undefined precision/recall instead of warning per metric.
        zero_division=0,
    )
    print(report)

# Validate the best model (the same validation loader was used to pick the checkpoint)
print("\nValidation Results:")
evaluate_model(best_model, val_dataloader)

# Test the best model on the test loader
print("\nTest Results:")
evaluate_model(best_model, test_dataloader)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/4341 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/4341 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]


Training with learning rate: 4e-05


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 272/272 [01:25<00:00,  3.17it/s]


Epoch 1, Loss: 2.4825498864931217
Validation Loss: 1.995811963782591
Saved best model with validation loss: 1.995811963782591


100%|██████████| 272/272 [01:34<00:00,  2.89it/s]


Epoch 2, Loss: 2.0143602446598163
Validation Loss: 1.995811963782591

Training with learning rate: 3e-05


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 272/272 [01:33<00:00,  2.91it/s]


Epoch 1, Loss: 2.474024429479066
Validation Loss: 2.027176155588206


100%|██████████| 272/272 [01:33<00:00,  2.90it/s]


Epoch 2, Loss: 2.045586587751613
Validation Loss: 2.027176155588206

Training with learning rate: 2e-05


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 272/272 [01:33<00:00,  2.90it/s]


Epoch 1, Loss: 2.6369684185175335
Validation Loss: 2.2760358635117024


100%|██████████| 272/272 [01:33<00:00,  2.90it/s]


Epoch 2, Loss: 2.3355705738067627
Validation Loss: 2.2760358635117024

Best learning rate: 4e-05
Best epoch: 1
Loading best model from best_model_lr_4e-05.pt


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  best_model.load_state_dict(torch.load(best_model_path))



Validation Results:


100%|██████████| 340/340 [00:36<00:00,  9.32it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

    admiration       0.59      0.69      0.63       488
     amusement       0.54      0.75      0.63       297
         anger       1.00      0.01      0.02       192
     annoyance       0.20      0.20      0.20       247
      approval       0.26      0.04      0.07       355
        caring       0.00      0.00      0.00       138
     confusion       0.50      0.01      0.03       136
     curiosity       0.40      0.58      0.47       205
        desire       0.00      0.00      0.00        64
disappointment       0.17      0.02      0.03       129
   disapproval       0.18      0.02      0.04       246
       disgust       0.00      0.00      0.00        74
 embarrassment       0.00      0.00      0.00        28
    excitement       0.00      0.00      0.00        78
          fear       0.00      0.00      0.00        74
     gratitude       0.59      0.93      0.72       297
         grief       0.00      0.00      0.00  

100%|██████████| 340/340 [00:36<00:00,  9.26it/s]

                precision    recall  f1-score   support

    admiration       0.55      0.66      0.60       504
     amusement       0.46      0.75      0.57       252
         anger       0.67      0.02      0.04       197
     annoyance       0.19      0.18      0.18       286
      approval       0.37      0.05      0.08       318
        caring       0.00      0.00      0.00       114
     confusion       0.50      0.02      0.04       139
     curiosity       0.40      0.63      0.49       233
        desire       0.00      0.00      0.00        74
disappointment       0.05      0.01      0.01       127
   disapproval       0.15      0.02      0.04       220
       disgust       0.00      0.00      0.00        84
 embarrassment       0.00      0.00      0.00        30
    excitement       0.00      0.00      0.00        84
          fear       0.00      0.00      0.00        74
     gratitude       0.56      0.94      0.70       288
         grief       0.00      0.00      0.00  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**The pre-trained model EmotionBERT is fine-tuned here on the GoEmotions dataset. The grid search was used to find the best learning rate, and it also suggested that a maximum of 2 epochs should be enough when we train our final model on the complete dataset.**

In [None]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
)
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm import tqdm
import re
import random
# Initializing the EmotionBERT tokenizer; this binds the module-level
# `tokenizer` that tokenize_function reads
tokenizer = AutoTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")

# Tokenizing the dataset
def tokenize_function(example):
    """Encode the ``text`` field with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Tokenize `dataset`; batched=True hands tokenize_function a batch of examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Keep a single label per example
def format_labels(example):
    """Collapse a list-valued ``labels`` field to its first entry.

    Values that are not lists are left untouched. The example is updated in
    place and returned.
    """
    current = example["labels"]
    if isinstance(current, list):
        current = current[0]
    example["labels"] = current
    return example

tokenized_datasets = tokenized_datasets.map(format_labels)

# Format dataset for PyTorch
data_collator = DataCollatorWithPadding(tokenizer)
columns = ["input_ids", "attention_mask", "labels"]
# Expose only the model inputs and the labels, as torch tensors
tokenized_datasets.set_format(type="torch", columns=columns)

# Only the training loader shuffles; validation and test keep dataset order
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16, collate_fn=data_collator)

# Train and evaluate with grid search for learning rates
def train_and_evaluate_model(learning_rates=(4e-5, 3e-5, 5e-5), num_epochs=3):
    """Grid-search the learning rate for EmotionBERT and return the best model.

    For every learning rate a fresh
    ``monologg/bert-base-cased-goemotions-original`` classifier with 28 labels
    is trained for ``num_epochs`` epochs on ``train_dataloader``. After each
    epoch the mean cross-entropy loss over ``val_dataloader`` is computed, and
    the weights are written to ``best_model_lr_<lr>.pt`` whenever that loss
    beats the best value seen so far across all learning rates.

    Relies on the module-level ``train_dataloader``, ``val_dataloader`` and
    ``device``.

    Args:
        learning_rates: Iterable of learning rates to try, in order.
        num_epochs: Passes over the training data per learning rate. The
            linear schedule is sized from this same value, so the loop and
            the schedule cannot drift apart.

    Returns:
        A model on ``device`` loaded with the best checkpoint's weights.

    Raises:
        ValueError: If ``learning_rates`` is empty or ``num_epochs`` < 1.
        RuntimeError: If no validation loss could be recorded as the best
            one (for example when every loss is NaN).
    """
    learning_rates = list(learning_rates)
    if not learning_rates:
        raise ValueError("learning_rates must contain at least one value")
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    model_name = "monologg/bert-base-cased-goemotions-original"
    best_model_path = None
    best_val_loss = float("inf")
    best_lr = None
    best_epoch = None
    loss_fn = torch.nn.CrossEntropyLoss()

    for lr in learning_rates:
        print(f"\nTraining with learning rate: {lr}")
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=28)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=lr)
        # One scheduler step is taken per batch, for every epoch that is run.
        num_training_steps = len(train_dataloader) * num_epochs
        lr_scheduler = get_scheduler(
            "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                loss = loss_fn(outputs.logits, batch["labels"])
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

            # Save model if validation loss improves
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                    loss = loss_fn(outputs.logits, batch["labels"])
                    val_loss += loss.item()
            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Validation Loss: {avg_val_loss}")

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_model_path = f"best_model_lr_{lr}.pt"
                best_lr = lr
                best_epoch = epoch + 1
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved best model with validation loss: {best_val_loss}")

    if best_model_path is None:
        raise RuntimeError("no checkpoint was saved; validation loss never improved")

    # Loading the best model for evaluation
    print(f"\nBest learning rate: {best_lr}")
    print(f"Best epoch: {best_epoch}")
    print(f"Loading best model from {best_model_path}")
    best_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=28)
    # The checkpoint is a plain state dict: load tensors only, onto the active device.
    best_model.load_state_dict(
        torch.load(best_model_path, map_location=device, weights_only=True)
    )
    best_model.to(device)

    return best_model

# Training and evaluation
# `device` is read as a global by train_and_evaluate_model and evaluate_model,
# so it has to be bound before either of them is called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = train_and_evaluate_model()

# Evaluation function
def evaluate_model(model, dataloader):
    """Print a per-class classification report for ``model`` on ``dataloader``.

    Predictions are the arg-max over the logits of each batch. Class names
    are read from the ``labels`` feature of the module-level ``dataset``;
    the module-level ``device`` decides where batches are moved.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        dataloader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
    """
    model.eval()
    y_true = []
    y_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            y_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            y_true.extend(batch["labels"].cpu().tolist())

    label_names = dataset["train"].features["labels"].feature.names
    report = classification_report(
        y_true,
        y_preds,
        # Pin the label ids so the names stay aligned even when a class is
        # missing from both the references and the predictions.
        labels=list(range(len(label_names))),
        target_names=label_names,
        # Report 0 for undefined precision/recall instead of warning per metric.
        zero_division=0,
    )
    print(report)

# Validate the best model (the same validation loader was used to pick the checkpoint)
print("\nValidation Results:")
evaluate_model(best_model, val_dataloader)

# Test the best model on the test loader
print("\nTest Results:")
evaluate_model(best_model, test_dataloader)


Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]


Training with learning rate: 4e-05


100%|██████████| 272/272 [01:30<00:00,  2.99it/s]


Epoch 1, Loss: 0.44193402257395276
Validation Loss: 2.370078487255994
Saved best model with validation loss: 2.370078487255994


100%|██████████| 272/272 [01:33<00:00,  2.92it/s]


Epoch 2, Loss: 0.15037737352437996
Validation Loss: 2.562018847640823


100%|██████████| 272/272 [01:32<00:00,  2.94it/s]


Epoch 3, Loss: 0.06504334643164732
Validation Loss: 2.617738169081071

Training with learning rate: 3e-05


100%|██████████| 272/272 [01:32<00:00,  2.94it/s]


Epoch 1, Loss: 0.4370824028729283
Validation Loss: 2.5111686264767368


100%|██████████| 272/272 [01:32<00:00,  2.94it/s]


Epoch 2, Loss: 0.1553464956985622
Validation Loss: 2.5251706801793157


100%|██████████| 272/272 [01:32<00:00,  2.94it/s]


Epoch 3, Loss: 0.07217324881297399
Validation Loss: 2.5957713693380358

Training with learning rate: 5e-05


100%|██████████| 272/272 [01:32<00:00,  2.93it/s]


Epoch 1, Loss: 0.49074412123231653
Validation Loss: 2.376532168598736


100%|██████████| 272/272 [01:32<00:00,  2.94it/s]


Epoch 2, Loss: 0.1517786326705177
Validation Loss: 2.6085807677577524


100%|██████████| 272/272 [01:32<00:00,  2.94it/s]


Epoch 3, Loss: 0.048670549351406994
Validation Loss: 2.6845497385543937

Best learning rate: 4e-05
Best epoch: 1
Loading best model from best_model_lr_4e-05.pt


  best_model.load_state_dict(torch.load(best_model_path))



Validation Results:


100%|██████████| 340/340 [00:41<00:00,  8.27it/s]


                precision    recall  f1-score   support

    admiration       0.63      0.68      0.65       488
     amusement       0.77      0.76      0.76       297
         anger       0.40      0.47      0.44       192
     annoyance       0.23      0.22      0.23       247
      approval       0.36      0.30      0.33       355
        caring       0.48      0.33      0.39       138
     confusion       0.37      0.41      0.39       136
     curiosity       0.38      0.72      0.50       205
        desire       0.41      0.52      0.46        64
disappointment       0.22      0.32      0.26       129
   disapproval       0.36      0.23      0.28       246
       disgust       0.38      0.39      0.39        74
 embarrassment       0.34      0.46      0.39        28
    excitement       0.26      0.19      0.22        78
          fear       0.47      0.61      0.53        74
     gratitude       0.87      0.81      0.84       297
         grief       1.00      0.10      0.18  

100%|██████████| 340/340 [00:41<00:00,  8.27it/s]

                precision    recall  f1-score   support

    admiration       0.62      0.67      0.64       504
     amusement       0.75      0.83      0.79       252
         anger       0.46      0.47      0.46       197
     annoyance       0.31      0.24      0.27       286
      approval       0.34      0.32      0.33       318
        caring       0.43      0.32      0.37       114
     confusion       0.39      0.42      0.41       139
     curiosity       0.36      0.67      0.46       233
        desire       0.38      0.34      0.36        74
disappointment       0.23      0.33      0.27       127
   disapproval       0.39      0.28      0.32       220
       disgust       0.42      0.42      0.42        84
 embarrassment       0.33      0.47      0.38        30
    excitement       0.36      0.33      0.35        84
          fear       0.45      0.69      0.55        74
     gratitude       0.85      0.82      0.83       288
         grief       0.00      0.00      0.00  




**Preprocessing with 90% of the dataset (the maximum we could use given the Colab limit) and a 10% emotion-masking rate.**

In [None]:
!pip install nltk gdown datasets transformers torch
import nltk
from nltk.corpus import wordnet as wn
import gdown
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_scheduler
)
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm import tqdm
import re
import random
import os

# Download the NRC emotion lexicon from Google Drive to a local file
# so that get_emotion_words can parse it.
nrc_url = 'https://drive.google.com/uc?id=1WkxJrL5ECa4NLFnXlfVmFyP6VlVnrv5K'
nrc_output = 'nrc_emotion_lexicon.txt'
gdown.download(nrc_url, nrc_output, quiet=False)

# Extracting the emotion words
def get_emotion_words(file_path):
    """Collect the lexicon words that carry at least one emotion flag.

    Every line is split on tabs: the first field is the word and each field
    from the third onward is read as an integer flag. A word is kept as soon
    as one of its flags equals 1. Blank lines and rows whose flags cannot be
    parsed as integers (for example a header row) are skipped.

    Args:
        file_path: Path to the tab-separated lexicon file.

    Returns:
        Alphabetically sorted list of the distinct flagged words, so that
        repeated runs see the words in the same order.
    """
    emotion_words = set()
    # Explicit encoding keeps parsing independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            word, emotion_values = parts[0], parts[2:]
            try:
                flagged = any(int(value) == 1 for value in emotion_values)
            except ValueError:
                # Non-numeric flag column: not a data row.
                continue
            if flagged:
                emotion_words.add(word)
    return sorted(emotion_words)

# Get the list of emotion words
emotion_words_list = get_emotion_words(nrc_output)

# Print the first 10 words for debugging
print("Sample emotion words:", emotion_words_list[:10])
print(f"Total number of emotion words: {len(emotion_words_list)}")

# Fetch the WordNet corpora; expand_emotion_words queries WordNet through `wn`
nltk.download('wordnet')
nltk.download('omw-1.4')

# Expand the emotion words list
def expand_emotion_words(words):
    """Grow a word list with WordNet lemmas and their derivational relatives.

    For each input word, every synset WordNet returns contributes the names
    of its lemmas and of each lemma's derivationally related forms.
    Underscores in WordNet names are replaced by spaces.

    Args:
        words: Iterable of seed words.

    Returns:
        Sorted list of the unique entries, seed words included.
    """
    def readable(lemma):
        # WordNet joins multi-word names with underscores.
        return lemma.name().replace('_', ' ')

    collected = set()
    for term in words:
        collected.add(term)
        lemmas = [lemma for synset in wn.synsets(term) for lemma in synset.lemmas()]
        for lemma in lemmas:
            collected.add(readable(lemma))
            collected.update(
                readable(relative)
                for relative in lemma.derivationally_related_forms()
            )

    return sorted(collected)

# Generating the expanded list (lexicon words plus their WordNet lemmas and
# derivationally related forms)
expanded_emotion_words_list = expand_emotion_words(emotion_words_list)
print("Expanded Emotion Words Sample:", expanded_emotion_words_list[:20])
print(f"Total number of expanded words: {len(expanded_emotion_words_list)}")

# Loading GoEmotions ("simplified" configuration)
dataset = load_dataset("go_emotions", "simplified")

# Reduce dataset size
def reduce_dataset(dataset, fraction=0.9, seed=None):
    """Return a random subset holding ``fraction`` of the rows of ``dataset``.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices)``.
        fraction: Share of the rows to keep, between 0 and 1 inclusive. The
            row count is rounded down.
        seed: Optional integer seed. The same seed always selects the same
            rows; ``None`` draws from torch's global random state.

    Returns:
        Whatever ``dataset.select`` returns for the sampled indices.

    Raises:
        ValueError: If ``fraction`` lies outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be within [0, 1], got {fraction}")

    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)

    total = len(dataset)
    keep = int(total * fraction)
    indices = torch.randperm(total, generator=generator)[:keep]
    # Hand plain Python ints to select() rather than tensor elements.
    return dataset.select(indices.tolist())

# Keep 90% of the training split only; validation and test stay at full size
dataset["train"] = reduce_dataset(dataset["train"], fraction=0.9)
import random
# Module-level name read by mask_emotion_words
emotion_words = expanded_emotion_words_list

# Mask emotion-related words with 10% probability
def _emotion_lookup(vocabulary):
    """Return ``vocabulary`` as a set, reusing the last conversion if unchanged."""
    if isinstance(vocabulary, (set, frozenset)):
        return vocabulary
    cached = getattr(_emotion_lookup, "_cached", None)
    # Rebuild when a different collection is passed or its length changed.
    if cached is None or cached[0] is not vocabulary or cached[1] != len(vocabulary):
        cached = (vocabulary, len(vocabulary), frozenset(vocabulary))
        _emotion_lookup._cached = cached
    return cached[2]


def mask_emotion_words(example, mask_ratio=0.10, vocabulary=None, mask_token="[MASK]"):
    """Randomly replace emotion words in ``example["text"]`` with a mask token.

    The text is split on whitespace and re-joined with single spaces. A token
    is a candidate when its lower-cased form is in the vocabulary; each
    candidate is replaced independently with probability ``mask_ratio``.
    Tokens with attached punctuation (e.g. ``"happy,"``) are compared as-is
    and therefore only match if the vocabulary holds that exact string.

    Args:
        example: Mapping with a ``"text"`` string; it is modified in place.
        mask_ratio: Probability of masking each candidate token.
        vocabulary: Collection of emotion words. Defaults to the module-level
            ``emotion_words``. Lists are converted to a set once and cached so
            that membership tests do not scan the whole list for every token.
        mask_token: Replacement string, e.g. the tokenizer's own mask token.

    Returns:
        The same ``example`` object with its ``"text"`` field rewritten.
    """
    lookup = _emotion_lookup(emotion_words if vocabulary is None else vocabulary)

    words = example["text"].split()
    for i, word in enumerate(words):
        # The random draw happens only for vocabulary hits, as before.
        if word.lower() in lookup and random.random() < mask_ratio:
            words[i] = mask_token

    example["text"] = ' '.join(words)
    return example

# Apply the function to the dataset
# NOTE(review): map runs on every split of `dataset`, so validation and test
# text is masked as well, and no random seed is set — confirm this is intended.
dataset = dataset.map(mask_emotion_words)



Downloading...
From: https://drive.google.com/uc?id=1WkxJrL5ECa4NLFnXlfVmFyP6VlVnrv5K
To: /content/nrc_emotion_lexicon.txt
100%|██████████| 2.72M/2.72M [00:00<00:00, 205MB/s]
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Sample emotion words: ['bran', 'improve', 'doomsday', 'snob', 'incongruous', 'academic', 'peck', 'sense', 'impress', 'erotic']
Total number of emotion words: 6453
Expanded Emotion Words Sample: ['1', '13th', '144', 'A-one', 'ACE', 'ALT', 'AS', 'Adam', 'Adam Smith', 'Advent', 'Aesculapius', 'Aga', 'Age of Reason', 'Agha', 'Agriculture', 'Agriculture Department', 'Al Gore', 'Albert Gore Jr.', 'Alexander Pope', 'Allied']
Total number of expanded words: 26359


Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

**The pre-trained EmotionBERT model, fine-tuned on the GoEmotions dataset, is trained here on 90% of the dataset using the best learning rate found by the grid search.**

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
)
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from tqdm import tqdm
import re
import gdown
import nltk
from nltk.corpus import wordnet as wn
import random



# Initializing EmotionBERT tokenizer; this binds the module-level
# `tokenizer` that tokenize_function reads
tokenizer = AutoTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")

# Tokenize the dataset
def tokenize_function(example):
    """Encode the ``text`` field with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)

# Tokenize `dataset`; batched=True hands tokenize_function a batch of examples per call
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Keep a single label per example
def format_labels(example):
    """Collapse a list-valued ``labels`` field to its first entry.

    Values that are not lists are left untouched. The example is updated in
    place and returned.
    """
    current = example["labels"]
    if isinstance(current, list):
        current = current[0]
    example["labels"] = current
    return example

tokenized_datasets = tokenized_datasets.map(format_labels)

# Format dataset for PyTorch
data_collator = DataCollatorWithPadding(tokenizer)
columns = ["input_ids", "attention_mask", "labels"]
# Expose only the model inputs and the labels, as torch tensors
tokenized_datasets.set_format(type="torch", columns=columns)

# Only the training loader shuffles; validation and test keep dataset order
train_dataloader = DataLoader(tokenized_datasets["train"], batch_size=16, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(tokenized_datasets["validation"], batch_size=16, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_datasets["test"], batch_size=16, collate_fn=data_collator)



def train_and_evaluate_model(learning_rates=(4e-5,), num_epochs=2):
    """Fine-tune EmotionBERT for each learning rate and return the best model.

    For every learning rate a fresh
    ``monologg/bert-base-cased-goemotions-original`` classifier with 28 labels
    is trained for ``num_epochs`` epochs on ``train_dataloader``. After each
    epoch the mean cross-entropy loss over ``val_dataloader`` is computed, and
    the weights are written to ``best_model_lr_<lr>.pt`` whenever that loss
    beats the best value seen so far across all learning rates.

    Relies on the module-level ``train_dataloader``, ``val_dataloader`` and
    ``device``.

    Args:
        learning_rates: Iterable of learning rates to try, in order.
        num_epochs: Passes over the training data per learning rate. The
            linear schedule is sized from this same value; previously the
            schedule assumed 3 epochs while only 2 were run.

    Returns:
        A model on ``device`` loaded with the best checkpoint's weights.

    Raises:
        ValueError: If ``learning_rates`` is empty or ``num_epochs`` < 1.
        RuntimeError: If no validation loss could be recorded as the best
            one (for example when every loss is NaN).
    """
    learning_rates = list(learning_rates)
    if not learning_rates:
        raise ValueError("learning_rates must contain at least one value")
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    model_name = "monologg/bert-base-cased-goemotions-original"
    best_model_path = None
    best_val_loss = float("inf")
    best_lr = None
    best_epoch = None
    loss_fn = torch.nn.CrossEntropyLoss()

    for lr in learning_rates:
        print(f"\nTraining with learning rate: {lr}")
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=28)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=lr)
        # One scheduler step is taken per batch, for every epoch that is run.
        num_training_steps = len(train_dataloader) * num_epochs
        lr_scheduler = get_scheduler(
            "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                loss = loss_fn(outputs.logits, batch["labels"])
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

            # Save model if validation loss improves
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                    loss = loss_fn(outputs.logits, batch["labels"])
                    val_loss += loss.item()
            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Validation Loss: {avg_val_loss}")

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_model_path = f"best_model_lr_{lr}.pt"
                best_lr = lr
                best_epoch = epoch + 1
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved best model with validation loss: {best_val_loss}")

    if best_model_path is None:
        raise RuntimeError("no checkpoint was saved; validation loss never improved")

    # Load the best model for evaluation
    print(f"\nBest learning rate: {best_lr}")
    print(f"Best epoch: {best_epoch}")
    print(f"Loading best model from {best_model_path}")
    best_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=28)
    # The checkpoint is a plain state dict: load tensors only, onto the active device.
    best_model.load_state_dict(
        torch.load(best_model_path, map_location=device, weights_only=True)
    )
    best_model.to(device)

    return best_model

# Training and evaluation
# `device` is read as a global by train_and_evaluate_model and evaluate_model,
# so it has to be bound before either of them is called.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
best_model = train_and_evaluate_model()

# Evaluation function
def evaluate_model(model, dataloader):
    """Print a per-class classification report for ``model`` on ``dataloader``.

    Predictions are the arg-max over the logits of each batch. Class names
    are read from the ``labels`` feature of the module-level ``dataset``;
    the module-level ``device`` decides where batches are moved.

    Args:
        model: Sequence-classification model whose output exposes ``logits``.
        dataloader: Iterable of batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
    """
    model.eval()
    y_true = []
    y_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            y_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            y_true.extend(batch["labels"].cpu().tolist())

    label_names = dataset["train"].features["labels"].feature.names
    report = classification_report(
        y_true,
        y_preds,
        # Pin the label ids so the names stay aligned even when a class is
        # missing from both the references and the predictions.
        labels=list(range(len(label_names))),
        target_names=label_names,
        # Report 0 for undefined precision/recall instead of warning per metric.
        zero_division=0,
    )
    print(report)

# Report the best model's metrics on the validation split, then on the test split
for heading, loader in (
    ("\nValidation Results:", val_dataloader),
    ("\nTest Results:", test_dataloader),
):
    print(heading)
    evaluate_model(best_model, loader)


Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

Map:   0%|          | 0/5426 [00:00<?, ? examples/s]

Map:   0%|          | 0/5427 [00:00<?, ? examples/s]


Training with learning rate: 4e-05


pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

100%|██████████| 2442/2442 [13:43<00:00,  2.96it/s]


Epoch 1, Loss: 0.4696796557474744
Validation Loss: 2.1674724130507776
Saved best model with validation loss: 2.1674724130507776


100%|██████████| 2442/2442 [13:47<00:00,  2.95it/s]


Epoch 2, Loss: 0.2177550647341792
Validation Loss: 2.5503090114168385

Best learning rate: 4e-05
Best epoch: 1
Loading best model from best_model_lr_4e-05.pt


  best_model.load_state_dict(torch.load(best_model_path))



Validation Results:


100%|██████████| 340/340 [00:40<00:00,  8.29it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

    admiration       0.66      0.69      0.68       488
     amusement       0.77      0.81      0.79       297
         anger       0.49      0.44      0.46       192
     annoyance       0.24      0.33      0.28       247
      approval       0.32      0.32      0.32       355
        caring       0.38      0.38      0.38       138
     confusion       0.44      0.28      0.34       136
     curiosity       0.41      0.55      0.47       205
        desire       0.43      0.31      0.36        64
disappointment       0.37      0.17      0.23       129
   disapproval       0.35      0.35      0.35       246
       disgust       0.48      0.42      0.45        74
 embarrassment       0.46      0.46      0.46        28
    excitement       0.31      0.19      0.24        78
          fear       0.66      0.50      0.57        74
     gratitude       0.82      0.80      0.81       297
         grief       0.33      0.30      0.32  

100%|██████████| 340/340 [00:41<00:00,  8.27it/s]

                precision    recall  f1-score   support

    admiration       0.63      0.64      0.64       504
     amusement       0.73      0.83      0.78       252
         anger       0.49      0.42      0.45       197
     annoyance       0.27      0.32      0.30       286
      approval       0.32      0.32      0.32       318
        caring       0.35      0.37      0.36       114
     confusion       0.39      0.30      0.34       139
     curiosity       0.41      0.56      0.48       233
        desire       0.42      0.28      0.34        74
disappointment       0.33      0.17      0.23       127
   disapproval       0.34      0.42      0.37       220
       disgust       0.41      0.37      0.39        84
 embarrassment       0.48      0.43      0.46        30
    excitement       0.49      0.36      0.41        84
          fear       0.55      0.47      0.51        74
     gratitude       0.82      0.80      0.81       288
         grief       0.20      0.17      0.18  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**RoBERTa base fine-tuned on the full dataset, using the best learning rate found with the grid search**

In [None]:
# Load the pretrained "roberta-base" tokenizer; it is used below both to
# encode the dataset text and to build the padding data collator
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenization callback applied to the dataset
def tokenize_function(example):
    """Encode the ``"text"`` field with the notebook-level ``tokenizer``.

    Inputs are truncated and padded to a fixed length of 128 tokens.

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encode_options)

# Apply tokenize_function across the dataset; batched=True hands the callback
# groups of examples per call instead of one example at a time
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Reduce each example's label entry to a single value
def format_labels(example):
    """Replace a list-valued ``"labels"`` entry with its first element.

    Non-list values are left as they are. The mapping is modified in place
    and also returned.

    Args:
        example: Mapping with a ``"labels"`` entry.

    Returns:
        The same ``example`` object.
    """
    label_value = example["labels"]
    if isinstance(label_value, list):
        # Only the first label of a multi-label entry is kept
        label_value = label_value[0]
    example["labels"] = label_value
    return example

# Collapse every example's label entry via format_labels
tokenized_datasets = tokenized_datasets.map(format_labels)

# Expose only the model inputs and the labels, as PyTorch tensors
columns = ["input_ids", "attention_mask", "labels"]
data_collator = DataCollatorWithPadding(tokenizer)
tokenized_datasets.set_format(type="torch", columns=columns)

# One loader per split with batch size 16; only the training split is shuffled
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(
        tokenized_datasets[split_name],
        batch_size=16,
        shuffle=(split_name == "train"),
        collate_fn=data_collator,
    )
    for split_name in ("train", "validation", "test")
)

# Train and evaluate with a grid search over learning rates
def train_and_evaluate_model(learning_rates=(4e-5,), num_epochs=2):
    """Fine-tune ``roberta-base`` per learning rate and return the best model.

    For every learning rate a fresh ``roberta-base`` classifier with 28
    output labels is trained on ``train_dataloader`` for ``num_epochs``
    epochs with AdamW and a linear learning-rate schedule. After each epoch
    the mean cross-entropy loss over ``val_dataloader`` is computed; whenever
    it is lower than the best value seen so far (across all learning rates),
    the weights are written to ``best_model_lr_<lr>.pt``.

    Args:
        learning_rates: Iterable of learning rates to try.
        num_epochs: Number of training epochs per learning rate.

    Returns:
        A ``RobertaForSequenceClassification`` moved to ``device`` with the
        best saved weights loaded.

    Raises:
        ValueError: If ``learning_rates`` is empty or ``num_epochs`` < 1.
        RuntimeError: If no checkpoint was saved because the validation loss
            never improved on infinity (e.g. it was NaN every epoch).
    """
    learning_rates = list(learning_rates)
    if not learning_rates:
        raise ValueError("learning_rates must contain at least one learning rate")
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    best_model_path = None
    best_val_loss = float("inf")
    best_lr = None
    best_epoch = None

    for lr in learning_rates:
        print(f"\nTraining with learning rate: {lr}")
        model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=28)
        model.to(device)

        optimizer = AdamW(model.parameters(), lr=lr)
        # The schedule has to span every optimizer step of the run. Sizing it
        # for a single epoch while training for more lets the linear decay
        # reach zero after the first epoch, so later epochs no longer update
        # the weights.
        num_training_steps = len(train_dataloader) * num_epochs
        lr_scheduler = get_scheduler(
            "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
        )

        loss_fn = torch.nn.CrossEntropyLoss()
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for batch in tqdm(train_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                loss = loss_fn(outputs.logits, batch["labels"])
                total_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
            avg_loss = total_loss / len(train_dataloader)
            print(f"Epoch {epoch + 1}, Loss: {avg_loss}")

            # Mean validation loss for this epoch (no gradient tracking)
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for batch in val_dataloader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
                    loss = loss_fn(outputs.logits, batch["labels"])
                    val_loss += loss.item()
            avg_val_loss = val_loss / len(val_dataloader)
            print(f"Validation Loss: {avg_val_loss}")

            # Save the weights whenever the validation loss improves
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                best_model_path = f"best_model_lr_{lr}.pt"
                best_lr = lr
                best_epoch = epoch + 1
                torch.save(model.state_dict(), best_model_path)
                print(f"Saved best model with validation loss: {best_val_loss}")

    if best_model_path is None:
        raise RuntimeError("No checkpoint was saved: the validation loss never improved")

    # Loading the best model for evaluation
    print(f"\nBest learning rate: {best_lr}")
    print(f"Best epoch: {best_epoch}")
    print(f"Loading best model from {best_model_path}")
    best_model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=28)
    best_model.load_state_dict(torch.load(best_model_path))
    best_model.to(device)

    return best_model

# Select the compute device (CUDA when available, otherwise CPU) and run training
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
best_model = train_and_evaluate_model()

# Evaluation
def evaluate_model(model, dataloader):
    """Print a per-class classification report for ``model`` on ``dataloader``.

    The model is put in eval mode and run with gradient tracking disabled.
    The predicted class of each example is the argmax over its logits; the
    predictions and the batch labels are accumulated and summarised with
    scikit-learn's ``classification_report``. Class names come from the
    ``dataset`` object defined elsewhere in the notebook.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        dataloader: Iterable of dict batches holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        None. The report is printed.
    """
    model.eval()
    y_true = []
    y_preds = []
    with torch.no_grad():
        for batch in tqdm(dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
            labels = batch["labels"].cpu().numpy()
            y_preds.extend(preds)
            y_true.extend(labels)

    label_names = dataset["train"].features["labels"].feature.names
    # Supply the full id range: when `labels` is omitted the classes are
    # inferred from the data, and a class that appears in neither y_true nor
    # y_preds makes the inferred count differ from len(target_names), which
    # raises ValueError.
    # Assumes class id i corresponds to label_names[i].
    label_ids = list(range(len(label_names)))
    print(classification_report(y_true, y_preds, labels=label_ids, target_names=label_names))

# Report the best model's metrics on the validation split, then on the test split
for heading, loader in (
    ("\nValidation Results:", val_dataloader),
    ("\nTest Results:", test_dataloader),
):
    print(heading)
    evaluate_model(best_model, loader)

Map:   0%|          | 0/39069 [00:00<?, ? examples/s]

Map:   0%|          | 0/39069 [00:00<?, ? examples/s]


Training with learning rate: 4e-05


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 2442/2442 [13:55<00:00,  2.92it/s]


Epoch 1, Loss: 1.6718792354933656
Validation Loss: 1.4210333403419046
Saved best model with validation loss: 1.4210333403419046


100%|██████████| 2442/2442 [13:59<00:00,  2.91it/s]


Epoch 2, Loss: 1.3289308757395357
Validation Loss: 1.4210333403419046

Best learning rate: 4e-05
Best epoch: 1
Loading best model from best_model_lr_4e-05.pt


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  best_model.load_state_dict(torch.load(best_model_path))



Validation Results:


100%|██████████| 340/340 [00:36<00:00,  9.38it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                precision    recall  f1-score   support

    admiration       0.70      0.74      0.72       488
     amusement       0.73      0.86      0.79       297
         anger       0.46      0.51      0.48       192
     annoyance       0.32      0.20      0.25       247
      approval       0.53      0.26      0.35       355
        caring       0.53      0.40      0.45       138
     confusion       0.47      0.33      0.39       136
     curiosity       0.44      0.60      0.51       205
        desire       0.48      0.50      0.49        64
disappointment       0.35      0.16      0.22       129
   disapproval       0.45      0.33      0.38       246
       disgust       0.42      0.46      0.44        74
 embarrassment       0.67      0.50      0.57        28
    excitement       0.41      0.31      0.35        78
          fear       0.58      0.61      0.60        74
     gratitude       0.83      0.87      0.85       297
         grief       0.00      0.00      0.00  

100%|██████████| 340/340 [00:36<00:00,  9.35it/s]

                precision    recall  f1-score   support

    admiration       0.67      0.73      0.70       504
     amusement       0.74      0.87      0.80       252
         anger       0.44      0.47      0.45       197
     annoyance       0.36      0.19      0.24       286
      approval       0.55      0.31      0.40       318
        caring       0.42      0.35      0.38       114
     confusion       0.50      0.40      0.44       139
     curiosity       0.45      0.63      0.52       233
        desire       0.52      0.41      0.45        74
disappointment       0.36      0.17      0.23       127
   disapproval       0.41      0.35      0.38       220
       disgust       0.44      0.44      0.44        84
 embarrassment       0.72      0.43      0.54        30
    excitement       0.42      0.35      0.38        84
          fear       0.51      0.66      0.58        74
     gratitude       0.82      0.89      0.85       288
         grief       0.00      0.00      0.00  


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Saving the fine-tuned RoBERTa model**




In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
model_dir = "fine_tuned_roberta_goemotions"
for artifact in (best_model, tokenizer):
    artifact.save_pretrained(model_dir)

print(f"Model and tokenizer saved to {model_dir}")









**Using the model to predict the label of a random sentence**

In [None]:
from transformers import pipeline

# Build a text-classification pipeline from the saved model directory
model_dir = "fine_tuned_roberta_goemotions"
emotion_classifier = pipeline("text-classification", model=model_dir, tokenizer=model_dir)

# Emotion names ordered so that the list position is the class index
label_names = [
    "admiration", "amusement", "anger", "annoyance",
    "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust",
    "embarrassment", "excitement", "fear", "gratitude",
    "grief", "joy", "love", "nervousness",
    "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral",
]

# Classify one example comment
reddit_comment = "haha that's so funny"
predictions = emotion_classifier(reddit_comment)

# The first entry of the pipeline output is taken as the prediction
top_prediction = predictions[0]
most_likely_label = top_prediction["label"]  # label string
most_likely_score = top_prediction["score"]  # confidence score

# The label string is expected to look like "LABEL_<index>"; extract the index
# and look up the matching emotion name
label_index = int(most_likely_label.split("_")[1])
label_name = label_names[label_index]

print(f"The predicted label is: {label_name} (Index: {label_index}, Confidence: {most_likely_score:.4f})")



Device set to use cuda:0


The predicted label is: amusement (Index: 1, Confidence: 0.9105)
