In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from scipy.special import softmax
import torch

plt.style.use('ggplot')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../../src/nlp/cleaned_dataset.csv')
df.head()

Unnamed: 0,author,type,clean_text,stemmed_text,lemmatized_text
0,Coteup,post,talk structural issues 3rd parties face us pol...,talk structur issu 3rd parti face us polit sys...,talk structural issue 3rd party face u politic...
1,_alpinisto,post,discovered sitting president committed exact c...,discov sit presid commit exact crime nixon for...,discover sit president commit exact crime nixo...
2,PsychLegalMind,post,trump announced new weapons ukraine monday thr...,trump announc new weapon ukrain monday threate...,trump announce new weapon ukraine monday threa...
3,jaytee319,post,illinois considering new bill hb 3458 let some...,illinoi consid new bill hb 3458 let someon avo...,illinois consider new bill hb 3458 let someone...
4,the_original_Retro,post,recent days maga outspoken influencers rushing...,recent day maga outspoken influenc rush call t...,recent day maga outspoken influencers rush cal...


In [3]:
MODEL = f"finiteautomata/bertweet-base-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
MAX_LENGHT = model.config.max_position_embeddings

# Setting cuda environment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

2025-08-10 16:26:07.703439: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-10 16:26:07.723140: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754835967.742631   13660 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754835967.748880   13660 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1754835967.766327   13660 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

cuda


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [4]:
# Function to split a list of token into smaller chunks so that each chunk fits within the model's maximum token length.
# Special tokens (<s> and </s>), so max_length - 2
def chunk_tokens(token_ids, max_length=(MAX_LENGHT - 4)):
    for i in range(0, len(token_ids), max_length):
        yield token_ids[i:i+max_length]

# Function to run sentiment analysis on a long text by:
# 1. Splitting it into token chunks
# 2. Running the model on each chunk separately
# 3. Averaging the sentiment scores across all chunks
def sentiment_for_long_text(text, device):
    # Convert text into token IDs without adding special tokens yet
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    sentiments = []

    for token_chunk in chunk_tokens(token_ids):
        # Manually add special tokens at the start and end
        input_ids = [tokenizer.cls_token_id] + token_chunk + [tokenizer.sep_token_id]
        # Create an attention mask (1 for each token)
        attention_mask = [1] * len(input_ids)

        # Convert input IDs and mask into PyTorch tensors and move them to the target device (CPU/GPU)
        inputs = {
            "input_ids": torch.tensor([input_ids], device=device),
            "attention_mask": torch.tensor([attention_mask], device=device)
        }

        with torch.no_grad():
            outputs = model(**inputs)
        sentiments.append(outputs.logits.softmax(dim=-1))

    # Stack all sentiment probability tensors and compute the mean across chunks
    return torch.mean(torch.stack(sentiments), dim=0).squeeze(0)

In [5]:
def polarity_scores(text, device):
    if len(text) >= (MAX_LENGHT - 2):
        scores_tensor = sentiment_for_long_text(text, device)
    else:
        encoded_text = tokenizer(text, return_tensors='pt').to(device)
        
        with torch.no_grad():
            output = model(**encoded_text)
        
        scores_tensor = output.logits[0].softmax(dim=-1)

    scores = scores_tensor.detach().cpu().numpy()

    labels = ['negative','neutral','positive']

    scores_dict = {
        'neg_percentage' : scores[0],
        'neu_percentage' : scores[1],
        'pos_percentage' : scores[2],
        'predicted_sentiment' : labels[scores.argmax()]
    }

    return scores_dict

In [6]:
INFORMAL_MODEL_NAME = "bertweet"
results = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text = row['clean_text']
        author = row['author']

        if not isinstance(text, str) or len(text.strip()) == 0:
            results.append({
            'author': author,
            'model': INFORMAL_MODEL_NAME,
            'neg_percentage': None,
            'neu_percentage': None,
            'pos_percentage': None,
            'predicted_sentiment': None
            })
        else: 
            result = polarity_scores(text, device)
            results.append({
                'author' : author,
                'model' : INFORMAL_MODEL_NAME,
                **result
            })

    except Exception as e:
        print(f"ERROR at row {i}: {e}")

  0%|          | 1/279018 [00:00<33:07:38,  2.34it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (209 > 128). Running this sequence through the model will result in indexing errors
100%|██████████| 279018/279018 [47:01<00:00, 98.90it/s] 


In [7]:
df = pd.DataFrame(results)
df.reset_index(drop=True, inplace=True)

original_dataframe = pd.read_csv('../../src/nlp/sentiment_scores.csv')

df_concat = pd.concat([original_dataframe, df], ignore_index=True)
df_concat.to_csv('sentiment_scores.csv', sep=',', encoding='utf-8', index=False)