In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import os
import numpy as np

In [4]:
file_path = "datasets/file.csv" #add dataset path here (we used this to get columns for the extra, polluted and non polluted dataset)
df = pd.read_csv(file_path)
print(df.columns)
posts = df["tweet"] #change column to one with text 

#Loading emotion model and tokenizer.
model_name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #using gpu as its faster.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [5]:
def predict_emotion_probs(text):
    """
    Predicts probabilities for all emotions for a given text.
    """
    try:
        if pd.isna(text) or text is None:
            return np.ones(7) / 7  # 7 emotions with equal probability
            
        text = str(text).strip()
        
        if not text:
            return np.ones(7) / 7
            
        encoding = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        
        model.eval()
        with torch.no_grad():
            outputs = model(**encoding)
            probabilities = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]
            
        return probabilities
        
    except Exception as e:
        print(f"Error processing text: {str(e)[:100]}...")  # Print first 100 chars of error
        return np.ones(7) / 7  # Return equal probabilities on error



tqdm.pandas(desc="Predicting emotions")
emotion_probabilities = posts.progress_apply(predict_emotion_probs)
emotion_labels = ["anger", "joy", "sadness", "fear", "surprise", "disgust", "neutral"]
emotion_df = pd.DataFrame(emotion_probabilities.tolist(), columns=emotion_labels)
df = pd.concat([df, emotion_df], axis=1)

output_file = os.path.join("datasets", "emotional_analysis_results.csv")
df.to_csv(output_file, index=False)
print(f"Emotion predictions completed and saved to '{output_file}'.") #output is saved to datasets folder.

Predicting emotions:   0%|          | 0/1 [00:00<?, ?it/s]

Predicting emotions: 100%|██████████| 1/1 [00:00<00:00,  3.02it/s]

Emotion predictions completed and saved to 'datasets\test.csv'.



