In [27]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from scipy.special import softmax
import torch

plt.style.use('ggplot')

In [28]:
df = pd.read_csv('../../src/nlp/cleaned_dataset.csv')
df.head()

Unnamed: 0,author,clean_text,stemmed_text,lemmatized_text
0,---AI---,agreed deleted sorry took wrong way huge step ...,agre delet sorri took wrong way huge step ai a...,agree deleted sorry take wrong way huge step a...
1,---Spartacus---,end goal produce stupid population reliably vo...,end goal produc stupid popul reliabl vote equa...,end goal produce stupid population reliably vo...
2,---____--__-_-_-___-,gross understatement lol,gross understat lol,gross understatement lol
3,---chewie--,oh man let know months first lot afraid ever k...,oh man let know month first lot afraid ever ki...,oh man let know month first lot afraid ever ki...
4,---why-so-serious---,lock bathroom you re good,lock bathroom you re good,lock bathroom you re good


In [29]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [30]:
# Function to split a list of token into smaller chunks so that each chunk fits within the model's maximum token length.
# Special tokens (<s> and </s>), so max_length=510.
def chunk_tokens(token_ids, max_length=510):
    for i in range(0, len(token_ids), max_length):
        yield token_ids[i:i+max_length]

# Function to run sentiment analysis on a long text by:
# 1. Splitting it into token chunks
# 2. Running the model on each chunk separately
# 3. Averaging the sentiment scores across all chunks
def sentiment_for_long_text(text, device):
    # Convert text into token IDs without adding special tokens yet
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    sentiments = []

    for token_chunk in chunk_tokens(token_ids):
        # Manually add special tokens at the start and end
        input_ids = [tokenizer.cls_token_id] + token_chunk + [tokenizer.sep_token_id]
        # Create an attention mask (1 for each token)
        attention_mask = [1] * len(input_ids)

        # Convert input IDs and mask into PyTorch tensors and move them to the target device (CPU/GPU)
        inputs = {
            "input_ids": torch.tensor([input_ids], device=device),
            "attention_mask": torch.tensor([attention_mask], device=device)
        }

        with torch.no_grad():
            outputs = model(**inputs)
        sentiments.append(outputs.logits.softmax(dim=-1))

    # Stack all sentiment probability tensors and compute the mean across chunks
    return torch.mean(torch.stack(sentiments), dim=0).squeeze(0)

In [31]:
def polarity_scores(text, device):
    if len(text) >= 514:
        scores_tensor = sentiment_for_long_text(text, device)
    else:
        encoded_text = tokenizer(text, return_tensors='pt').to(device)
        
        with torch.no_grad():
            output = model(**encoded_text)
        
        scores_tensor = output.logits[0].softmax(dim=-1)

    scores = scores_tensor.detach().cpu().numpy()

    labels = ['negative','neutral','positive']

    scores_dict = {
        'neg_percentage' : scores[0],
        'neu_percentage' : scores[1],
        'pos_percentage' : scores[2],
        'predicted_sentiment' : labels[scores.argmax()]
    }

    return scores_dict

In [None]:
INFORMAL_MODEL_NAME = "roberta"
results = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['clean_text']
        author = row['author']

        if not isinstance(text, str) or len(text.strip()) == 0:
            results.append({
            'author': author,
            'model': INFORMAL_MODEL_NAME,
            'neg_percentage': None,
            'neu_percentage': None,
            'pos_percentage': None,
            'predicted_sentiment': None
            })
        else: 
            result = polarity_scores(text, device)
            results.append({
                'author' : author,
                'model' : INFORMAL_MODEL_NAME,
                **result
            })

    except Exception as e:
        print(f"ERROR at row {i}: {e}")

 34%|███▍      | 345/1000 [00:12<00:14, 43.96it/s]

Errore alla riga 339: object of type 'float' has no len()


 54%|█████▎    | 537/1000 [00:18<00:10, 44.48it/s]

Errore alla riga 531: object of type 'float' has no len()


 67%|██████▋   | 674/1000 [00:22<00:12, 26.19it/s]

Errore alla riga 674: object of type 'float' has no len()


 76%|███████▌  | 758/1000 [00:23<00:01, 148.33it/s]

Errore alla riga 751: object of type 'float' has no len()


100%|██████████| 1000/1000 [00:30<00:00, 32.44it/s]


In [None]:
df = pd.DataFrame(results)
df.reset_index(drop=True, inplace=True)

original_dataframe = pd.read_csv('../../src/nlp/sentiment_scores.csv')

df_concat = pd.concat([original_dataframe, df], ignore_index=True)
df_concat.to_csv('sentiment_scores.csv', sep=',', encoding='utf-8', index=False)