In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from tqdm import tqdm
import os
import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

file_path = "datasets\political_leaning.csv"
df = pd.read_csv(file_path)[0:1]

#load polarity model and tokenizer.
model_name = "sarkerlab/SocBERT-base"  
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)


  file_path = "datasets\political_leaning.csv"


Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sarkerlab/SocBERT-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(74000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [3]:

def predict_polarity(text):
    """
    Predicts the sentiment polarity (positive, negative, neutral) for a given text.
    """
    encoding = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")
    encoding = {key: val.to(device) for key, val in encoding.items()}
    
    model.eval()
    with torch.no_grad():
        logits = model(**encoding).logits
        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
    
    return {"positive": probabilities[1], "neutral": 0.0, "negative": probabilities[0]}


In [5]:
def predict_polarity(text):
    if not isinstance(text, str) or not text.strip():
        return None

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()[0]

    if len(probs) == 2:
        probs = [probs[0], 0.0, probs[1]]

    return probs

print("Starting sentiment polarity predictions...")

results = []
valid_indices = []

for idx, post in tqdm(enumerate(df['post']), desc="Processing posts", total=len(df)):
    result = predict_polarity(post)
    if result is not None:
        results.append(result)
        valid_indices.append(idx)

polarity_df = pd.DataFrame(results, columns=['positive', 'neutral', 'negative'])
df_clean = df.iloc[valid_indices].copy()
df_clean[['positive', 'neutral', 'negative']] = polarity_df

output_file = os.path.join("datasets", "polarity_analysis_results.csv")
df_clean.to_csv(output_file, index=False)
print(f"Sentiment polarity predictions completed and saved to '{output_file}'")


Starting sentiment polarity predictions...


Processing posts: 100%|██████████| 1/1 [00:01<00:00,  1.60s/it]

Sentiment polarity predictions completed and saved to 'datasets\polarity_analysis_results.csv'



