In [1]:
pip install -U torch transformers tqdm sentencepiece


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

# Load the cleaned article table that the sentiment steps below operate on.
df = pd.read_csv("../data/clean_gvfc_sentiment_v2.csv")

# Fail fast -- before the model is loaded and every row is scored -- if the file
# lacks a column that later cells read.
missing_cols = {"headline_clean", "body_clean", "headline_text"} - set(df.columns)
assert not missing_cols, f"input CSV is missing columns: {sorted(missing_cols)}"

print(df.shape)

(1299, 9)


In [32]:
# Checkpoint that produces negative / neutral / positive scores.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# The pipeline is given GPU index 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# NOTE(review): newer transformers releases reportedly deprecate
# `return_all_scores` in favour of `top_k=None` -- confirm before upgrading,
# since downstream cells index into the nested list this setting produces.
sent_pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,  # one score per label, not only the top label
    truncation=True,         # clip inputs that exceed max_length
    max_length=512,
    batch_size=32,           # tune for RAM
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [33]:
# Signed weight applied to the score of each sentiment label.
LABEL2SCORE = {
    "positive": 1,
    "neutral": 0,
    "negative": -1,
}


def weighted_score(result):
    """Collapse one prediction into a single signed number.

    `result` is an iterable of {'label', 'score'} dicts. Each score is
    multiplied by the LABEL2SCORE weight of its label (matched
    case-insensitively) and the products are added up, so the value is
    positive-score minus negative-score. An empty `result` gives 0; a label
    missing from LABEL2SCORE raises KeyError.
    """
    total = 0
    for entry in result:
        probability = entry["score"]
        total += probability * LABEL2SCORE[entry["label"].lower()]
    return total

In [34]:
def chunk_by_tokens(text, tokenizer, tokens_per_chunk=450):
    """
    Lazily yield `text` as consecutive pieces of at most `tokens_per_chunk`
    tokens each. Cuts fall purely on token positions; no sentence
    tokenisation is involved.
    """
    # Tokenise the whole text a single time, then slice the resulting id list.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    starts = range(0, len(token_ids), tokens_per_chunk)
    yield from (
        tokenizer.decode(
            token_ids[start : start + tokens_per_chunk],
            skip_special_tokens=True,
        )
        for start in starts
    )

In [35]:
# Sanity check: score a single headline and show what the pipeline hands back
# (a list wrapping one list of per-label {'label', 'score'} dicts).
test_headline = df["headline_clean"].iloc[0]
result = sent_pipe(test_headline)
print(
    f"Test headline: {test_headline}",
    f"Type of result: {type(result)}",
    f"Result structure: {result}",
    sep="\n",
)

Test headline: cremated remains of las vegas mass shooter to be kept in safe deposit box, brother says
Type of result: <class 'list'>
Result structure: [[{'label': 'negative', 'score': 0.1584518402814865}, {'label': 'neutral', 'score': 0.8270642161369324}, {'label': 'positive', 'score': 0.014483900740742683}]]


In [36]:
tqdm.pandas()  # enables Series.progress_apply, used below

# 1️⃣  Headlines – text is short, single pass
def headline_sentiment(text):
    """Return the weighted sentiment score of one headline.

    Applies the same guard as `body_sentiment`: a missing value (anything that
    is not a string, e.g. NaN) or a blank string returns None instead of being
    sent to the model, so one bad row cannot abort the whole run.
    """
    if not isinstance(text, str) or text.strip() == "":
        return None
    # A single string comes back as a one-element outer list; [0] is the list
    # of per-label scores for that headline.
    return weighted_score(sent_pipe(text)[0])

df["sent_head"] = df["headline_clean"].progress_apply(headline_sentiment)

# 2️⃣  Bodies – may be long, so chunk then average
def body_sentiment(text):
    """Average weighted sentiment over the token chunks of one article body.

    Returns None when `text` is not a string or contains only whitespace.
    Otherwise the text is split with `chunk_by_tokens`, all chunks are scored
    in one pipeline call, and the unweighted mean of the per-chunk
    `weighted_score` values is returned.
    """
    has_content = isinstance(text, str) and text.strip() != ""
    if not has_content:
        return None

    pieces = list(chunk_by_tokens(text, tokenizer))
    per_chunk = []
    for prediction in sent_pipe(pieces):  # one list of label scores per chunk
        per_chunk.append(weighted_score(prediction))
    return sum(per_chunk) / len(per_chunk)

# Score every article body (with a progress bar) and store the result per row.
df["sent_body"] = (
    df["body_clean"]
    .progress_apply(body_sentiment)
)

100%|██████████| 1299/1299 [01:06<00:00, 19.58it/s]
100%|██████████| 1299/1299 [01:22<00:00, 15.68it/s]


In [37]:
# Distribution of both score columns, then a spot check of the first three rows.
print(df.loc[:, ["sent_head", "sent_body"]].describe())
print(df[["headline_text", "sent_head", "sent_body"]].head(3))

         sent_head    sent_body
count  1299.000000  1299.000000
mean     -0.402119    -0.258562
std       0.340166     0.410197
min      -0.928732    -0.932480
25%      -0.689916    -0.616164
50%      -0.450826    -0.259065
75%      -0.131545     0.024327
max       0.829489     0.967569
                                       headline_text  sent_head  sent_body
0  Cremated remains of Las Vegas mass shooter to ...  -0.143968  -0.324479
1  Florida shooter a troubled loner with white su...  -0.777282  -0.812513
2  Vernon Hills teen accused of wearing white sup...  -0.472589  -0.116213


In [39]:
# Write the table, now including sent_head / sent_body, without the index column.
df.to_csv(path_or_buf="../data/gvfc_with_sentiment.csv", index=False)