# Sentiment Analysis for Full Dataset

## Imports

In [1]:
from transformers import pipeline

In [29]:
import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

## Load Model

In [3]:
# Hugging Face Hub identifier of the pretrained checkpoint used throughout
# this notebook (plain string: there is nothing to interpolate).
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer, config and sequence-classification model are all loaded from the
# same checkpoint so that they stay consistent with each other.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Load Data

In [7]:
# Load the consolidated news table (v1) from the processed-data folder.
total_data = pd.read_parquet(path="../data/processed/news-consolidated-v1.parquet")

# The "headline" column holds the text that gets scored; preview the first ten.
headlines = total_data["headline"]
headlines.head(n=10)

0       a g calls for infrastructure protection summit
1    epa still trying to recover chemical clean up ...
2    expressions of interest sought to build livestock
3           iraq to pay for own rebuilding white house
4           meeting to focus on broken hill water woes
5    more water restrictions predicted for northern...
6             mugabe to touch down in paris for summit
7                national gallery gets all clear after
8                           omodei to stay in politics
9               osullivan in world cross country doubt
Name: headline, dtype: object

## Run Sentiment Analysis

In [30]:
def calculate_sentiment(input_df, batch_size=32):
    """Score every headline in ``input_df`` with the notebook's sentiment model.

    Headlines are tokenized and pushed through the module-level ``tokenizer``
    and ``model`` in batches. The logits of each batch are converted to class
    probabilities with a softmax; the winning class and its probability are
    stored per row.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``"headline"`` column. It is modified in place.
    batch_size : int, default 32
        Number of headlines per forward pass. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The same ``input_df`` object with two columns added (or overwritten):
        ``sentiment_score`` (probability of the predicted class) and
        ``sentiment_label`` (``"negative"``, ``"neutral"`` or ``"positive"``).

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1, or if the model emits a number of
        classes other than three.
    """
    # Class-index -> label order that this notebook assumes for the loaded
    # checkpoint. NOTE(review): confirm against ``config.id2label``.
    label_names = np.array(["negative", "neutral", "positive"])

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    headlines = input_df["headline"].tolist()

    # Nothing to score: the axis-wise reductions below cannot run on an empty
    # array, so add empty result columns and return early.
    if not headlines:
        input_df["sentiment_score"] = np.empty(0, dtype=np.float32)
        input_df["sentiment_label"] = np.empty(0, dtype=object)
        return input_df

    batch_probs = []
    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for start in tqdm(range(0, len(headlines), batch_size)):
            inputs = tokenizer(
                headlines[start : start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            )
            logits = model(**inputs).logits
            batch_probs.append(softmax(logits.cpu().numpy(), axis=1))

    # One row per headline, one column per class.
    probs = np.concatenate(batch_probs, axis=0)

    # Fail loudly instead of silently mislabelling unexpected class indices.
    if probs.shape[1] != len(label_names):
        raise ValueError(
            f"expected {len(label_names)} classes from the model, got {probs.shape[1]}"
        )

    input_df["sentiment_score"] = probs.max(axis=1)
    input_df["sentiment_label"] = label_names[probs.argmax(axis=1)]

    return input_df

In [32]:
# Score the full dataset in batches of 50 headlines. In the recorded run this
# took roughly nine and a half minutes (1811 batches).
# The function writes its result columns into `total_data` itself and returns it.
sentiments = calculate_sentiment(input_df=total_data, batch_size=50)

100%|██████████| 1811/1811 [09:29<00:00,  3.18it/s]


In [33]:
# Spot-check the first ten scored rows.
sentiments.head(n=10)

Unnamed: 0,id,source,date,headline,embedding,url,sentiment_score,sentiment_label
0,0,abc,2003-02-19,a g calls for infrastructure protection summit,"[0.42550426721572876, 0.5782315135002136, 0.09...",,0.766538,neutral
1,1,abc,2003-02-19,epa still trying to recover chemical clean up ...,"[0.33238619565963745, -0.3517177700996399, 0.5...",,0.56363,negative
2,2,abc,2003-02-19,expressions of interest sought to build livestock,"[0.4847770035266876, 0.10000099241733551, -0.0...",,0.843926,neutral
3,3,abc,2003-02-19,iraq to pay for own rebuilding white house,"[0.4847399592399597, 0.20435450971126556, 0.19...",,0.762468,neutral
4,4,abc,2003-02-19,meeting to focus on broken hill water woes,"[0.3507457375526428, 0.43837735056877136, -0.0...",,0.720201,neutral
5,5,abc,2003-02-19,more water restrictions predicted for northern...,"[0.1861242949962616, -0.08368571102619171, 0.0...",,0.539765,neutral
6,6,abc,2003-02-19,mugabe to touch down in paris for summit,"[0.3408227562904358, 0.33002549409866333, -0.1...",,0.921871,neutral
7,7,abc,2003-02-19,national gallery gets all clear after,"[0.010992174036800861, 0.31676802039146423, -0...",,0.769341,neutral
8,8,abc,2003-02-19,omodei to stay in politics,"[0.15744999051094055, 0.16422423720359802, -0....",,0.834178,neutral
9,9,abc,2003-02-19,osullivan in world cross country doubt,"[0.09738999605178833, 0.4430299699306488, -0.3...",,0.872147,neutral


## Save Data

In [34]:
# Persist the scored table as the next version (v2) of the consolidated dataset.
sentiments.to_parquet(path="../data/processed/news-consolidated-v2.parquet")