In [52]:
import pandas as pd, torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from tqdm.auto import tqdm

# Load the cleaned article table and report its (rows, columns) shape.
ARTICLES_CSV = "../data/full_articles_clean.csv"
df = pd.read_csv(ARTICLES_CSV)
print(df.shape)

(1299, 4)


In [54]:
import re
import html

# ──────────────────────────────────────────────────────────
# Regex pre‑compilation (faster when called thousands of times)
URL_RE      = re.compile(r'https?://\S+|www\.\S+')
EMAIL_RE    = re.compile(r'\b\S+@\S+\b')
HTML_ENT_RE = re.compile(r'&[a-z]+;')
WS_RE       = re.compile(r'\s+')

# Curly quotes -> ASCII quotes, applied in one str.translate pass.
_QUOTE_TABLE = str.maketrans({
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark
})


def light_clean(text: str) -> str:
    """
    Minimal text cleaning for sentiment analysis.

    Steps, in order:
      1. decode HTML entities (e.g. ``&amp;`` -> ``&``, ``&#65;`` -> ``A``)
      2. lowercase
      3. replace URLs and e-mail addresses with a space
      4. replace any entity-looking leftovers (``&name;``) that could not
         be decoded with a space
      5. normalise curly quotes to straight quotes
      6. collapse runs of whitespace and trim both ends

    Punctuation, stop-words and emojis are left intact because they often
    carry sentiment signal.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, ``NaN``, numbers, ...) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Decode entities BEFORE lowercasing: entity names are case-sensitive
    # (``&Dagger;`` and ``&dagger;`` are different characters), and decoding
    # first also lets the decoded characters be lowercased and lets
    # ``&nbsp;`` act as a word boundary for the URL / e-mail patterns.
    text = html.unescape(text)
    text = text.lower()

    # URLs and e-mail addresses become a single space so neighbours don't fuse.
    text = URL_RE.sub(" ", text)
    text = EMAIL_RE.sub(" ", text)

    # Whatever still looks like ``&name;`` was not a decodable entity: drop it.
    text = HTML_ENT_RE.sub(" ", text)

    text = text.translate(_QUOTE_TABLE)

    return WS_RE.sub(" ", text).strip()

In [55]:
# Derive cleaned copies of the text columns; the raw columns are left as they are.
for raw_col, clean_col in (("headline", "headline_clean"), ("body_text", "body_clean")):
    df[clean_col] = df[raw_col].apply(light_clean)

In [56]:
# Hugging Face checkpoint used for negative / neutral / positive scoring.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Pipeline device index: GPU 0 when CUDA is available, otherwise -1 (CPU).
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# NOTE(review): newer transformers releases reportedly deprecate
# `return_all_scores` in favour of `top_k=None` -- confirm against the
# installed version before switching, because later cells index into the
# nested-list output this setting produces.
sent_pipe = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,   # one score per label instead of only the top label
    truncation=True,          # cut inputs longer than max_length
    max_length=512,
    batch_size=32,            # tune for RAM
)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [57]:
# Signed polarity attached to each class label.
LABEL2SCORE = {"positive": 1, "neutral": 0, "negative": -1}

def weighted_score(result):
    """
    Collapse per-label probabilities into one signed sentiment value.

    Parameters
    ----------
    result : iterable of dict
        Entries with a ``"label"`` (matched case-insensitively against
        ``LABEL2SCORE``) and a numeric ``"score"``.

    Returns
    -------
    number
        Sum of ``score * polarity`` over all entries; ``0`` for an empty
        iterable. Raises ``KeyError`` for a label missing from
        ``LABEL2SCORE``.
    """
    contributions = [
        entry["score"] * LABEL2SCORE[entry["label"].lower()]
        for entry in result
    ]
    return sum(contributions)

In [58]:
def chunk_by_tokens(text, tokenizer, tokens_per_chunk=450):
    """
    Lazily yield `text` in consecutive pieces of at most `tokens_per_chunk`
    token ids each, with no sentence splitting involved.

    The text is encoded once (without special tokens); every window of ids
    is then decoded back to a string before being yielded.

    Parameters
    ----------
    text : str
        Text to split.
    tokenizer
        Object exposing ``encode(text, add_special_tokens=...)`` and
        ``decode(ids, skip_special_tokens=...)``.
    tokens_per_chunk : int, default 450
        Window size, counted in ids of the initial encoding.
        NOTE(review): re-tokenising a decoded chunk may not give exactly
        the same count -- confirm if a hard limit matters.

    Yields
    ------
    str
        Decoded text of each window, in order. Nothing is yielded when the
        encoding is empty.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    window_starts = range(0, len(token_ids), tokens_per_chunk)
    yield from (
        tokenizer.decode(
            token_ids[start : start + tokens_per_chunk],
            skip_special_tokens=True,
        )
        for start in window_starts
    )

In [59]:
# Sanity check: score the first cleaned headline and inspect what the
# pipeline hands back (for a single string: a list wrapping one list of
# per-label {'label', 'score'} dicts).
test_headline = df["headline_clean"].iat[0]
result = sent_pipe(test_headline)
print(f"Test headline: {test_headline}")
print(f"Type of result: {type(result)}")
print(f"Result structure: {result}")

Test headline: cremated remains of las vegas mass shooter to be kept in safe deposit box, brother says
Type of result: <class 'list'>
Result structure: [[{'label': 'negative', 'score': 0.1584518402814865}, {'label': 'neutral', 'score': 0.8270642161369324}, {'label': 'positive', 'score': 0.014483900740742683}]]


In [60]:
# Register `progress_apply` on pandas objects.
tqdm.pandas()


def _headline_score(headline):
    """Score one headline: a single pipeline call, no chunking."""
    per_label = sent_pipe(headline)[0]   # unwrap the outer one-element list
    return weighted_score(per_label)


# Headlines: text is short, so one pass per row is enough.
df["sent_head"] = df["headline_clean"].progress_apply(_headline_score)

# 2️⃣  Bodies – may be long, so chunk then average
def body_sentiment(text):
    """
    Average sentiment of an article body that may be too long for one pass.

    The body is split with `chunk_by_tokens`, all chunks are scored in one
    pipeline call, and the per-chunk `weighted_score` values are averaged
    with equal weight per chunk.

    Parameters
    ----------
    text : str
        Cleaned article body.

    Returns
    -------
    float or None
        Mean chunk score, or ``None`` when `text` is not a string, is
        empty / whitespace-only, or produces no chunks.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    chunks = list(chunk_by_tokens(text, tokenizer))
    if not chunks:
        # No tokens means nothing to score; return the "missing" marker
        # rather than dividing by zero below.
        return None

    results = sent_pipe(chunks)               # one list of label dicts per chunk
    scores = [weighted_score(r) for r in results]
    return sum(scores) / len(scores)

# Score every cleaned body; slow, since each row needs one or more model passes.
body_scores = df["body_clean"].progress_apply(body_sentiment)
df["sent_body"] = body_scores

100%|██████████| 1299/1299 [01:01<00:00, 21.06it/s]
100%|██████████| 1299/1299 [18:12<00:00,  1.19it/s] 


In [37]:
# Summary statistics for both sentiment columns.
print(df[["sent_head", "sent_body"]].describe())

# The cleaning cell reads the raw title from "headline", whereas this preview
# was written against "headline_text". Use whichever column the frame has so
# the cell does not raise KeyError on Restart & Run All.
headline_col = "headline_text" if "headline_text" in df.columns else "headline"
print(df.head(3)[[headline_col, "sent_head", "sent_body"]])

         sent_head    sent_body
count  1299.000000  1299.000000
mean     -0.402119    -0.258562
std       0.340166     0.410197
min      -0.928732    -0.932480
25%      -0.689916    -0.616164
50%      -0.450826    -0.259065
75%      -0.131545     0.024327
max       0.829489     0.967569
                                       headline_text  sent_head  sent_body
0  Cremated remains of Las Vegas mass shooter to ...  -0.143968  -0.324479
1  Florida shooter a troubled loner with white su...  -0.777282  -0.812513
2  Vernon Hills teen accused of wearing white sup...  -0.472589  -0.116213


In [62]:
# Persist the articles together with the sentiment columns (row index omitted).
SENTIMENT_CSV = "../data/full_articles_with_sentiment.csv"
df.to_csv(SENTIMENT_CSV, index=False)