In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

In [5]:
df = pd.read_csv('../../../src/nlp/cleaned_dataset.csv')

In [6]:
tokenizer = AutoTokenizer.from_pretrained("SamLowe/roberta-base-go_emotions")
model = AutoModelForSequenceClassification.from_pretrained("SamLowe/roberta-base-go_emotions")
MAX_LENGHT = model.config.max_position_embeddings

# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

2025-08-21 14:34:30.162271: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-21 14:34:30.479146: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755779670.589830    4754 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755779670.624575    4754 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1755779670.882361    4754 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [7]:
goemotions_labels = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral', 'contradiction'
]

In [8]:
def chunk_tokens(token_ids, max_length=(MAX_LENGHT - 4)):
    for i in range(0, len(token_ids), max_length):
        yield token_ids[i:i+max_length]

def emotions_for_long_text(text, device):
    # Convert text into token IDs without adding special tokens yet
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    emotions = []

    for token_chunk in chunk_tokens(token_ids):
        # Manually add special tokens at the start and end
        input_ids = [tokenizer.cls_token_id] + token_chunk + [tokenizer.sep_token_id]
        # Create an attention mask (1 for each token)
        attention_mask = [1] * len(input_ids)

        # Convert input IDs and mask into PyTorch tensors and move them to the target device (CPU/GPU)
        inputs = {
            "input_ids": torch.tensor([input_ids], device=device),
            "attention_mask": torch.tensor([attention_mask], device=device)
        }

        with torch.no_grad():
            outputs = model(**inputs)
        emotions.append(outputs.logits.softmax(dim=-1))

    # Stack all sentiment probability tensors and compute the mean across chunks
    return torch.mean(torch.stack(emotions), dim=0).squeeze(0)

In [9]:
def emotions_scores(text, device, labels):
    if len(text) >= (MAX_LENGHT - 2):
        scores_tensor = emotions_for_long_text(text, device)
    else:
        encoded_text = tokenizer(text, return_tensors='pt').to(device)
        
        with torch.no_grad():
            output = model(**encoded_text)
        
        scores_tensor = output.logits[0].softmax(dim=-1)

    scores = scores_tensor.detach().cpu().numpy()

    predicted_emotion = labels[scores.argmax()]

    scores_dict = {f"{emotion}_score": score for emotion, score in zip(labels, scores)}
    scores_dict['predicted_emotion'] = predicted_emotion
    scores_dict['predicted_confidence'] = scores[scores.argmax()]

    return scores_dict

In [10]:
INFORMAL_MODEL_NAME = "goemotions"
results = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['clean_text']
        author = row['author']
        id = row['id']

        result = emotions_scores(text, device, labels=goemotions_labels)
        results.append({
            'author' : author,
            'id' : id,
            'model' : INFORMAL_MODEL_NAME,
            **result
        })

    except Exception as e:
        print(f"ERROR at row {i}: {e}")

  0%|          | 16/203612 [00:00<1:58:13, 28.70it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (549 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 203612/203612 [33:57<00:00, 99.91it/s] 


In [11]:
df = pd.DataFrame(results)
df.reset_index(drop=True, inplace=True)
df.to_csv('emotion_scores.csv', sep=',', encoding='utf-8', index=False)