# Sentiment Analysis for Headlines

## Imports

In [2]:
import pandas as pd
import torch
from transformers import pipeline

## Loading Model

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax


# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
# (Plain string: the previous f-string had no placeholders to interpolate.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Tokenizer and config are loaded from the same checkpoint as the classifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

# Sequence-classification model; its logits are turned into probabilities below.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Calculating Sentiment

In [50]:
def calculate_sentiment(text_input, batch_size=32):
    """Classify the sentiment of each text and return the winning class.

    Uses the notebook-level ``tokenizer`` and ``model`` objects, so the
    "Loading Model" cell must have been executed first.

    Parameters
    ----------
    text_input : iterable of str
        Texts to score. Any iterable is accepted; it is materialised into a
        list, and the output rows follow the input order.
    batch_size : int, default 32
        Number of texts tokenised and passed through the model per forward
        call. Texts in a batch are padded to a common length, so scores may
        differ from one-at-a-time inference at floating-point rounding level.

    Returns
    -------
    pandas.DataFrame
        One row per input text, on a default 0..n-1 index, with columns
        ``sentiment_score`` (softmax probability of the highest-scoring
        class) and ``sentiment_label`` ("negative", "neutral" or "positive").
        An empty input yields an empty frame with the same two columns.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(text_input)

    # Nothing to score: return the expected schema instead of failing on the
    # axis=1 reductions below, which need a 2-D probability matrix.
    if not texts:
        return pd.DataFrame(
            {
                "sentiment_score": np.empty(0, dtype=np.float32),
                "sentiment_label": np.empty(0, dtype=object),
            }
        )

    # Class index -> label, in the order of the model's output logits
    # (index 0 = negative, 1 = neutral, anything else = positive).
    class_labels = ("negative", "neutral", "positive")

    batch_probs = []
    # Inference only: disabling autograd avoids building a gradient graph
    # for every forward pass.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            inputs = tokenizer(batch, return_tensors="pt", padding=True)
            logits = model(**inputs).logits
            batch_probs.append(softmax(logits.numpy(), axis=1))

    # (n_texts, n_classes) matrix of class probabilities.
    probs = np.concatenate(batch_probs, axis=0)
    top_class = probs.argmax(axis=1)

    return pd.DataFrame(
        {
            # keep only the probability of the winning class
            "sentiment_score": probs.max(axis=1),
            "sentiment_label": [class_labels[min(int(idx), 2)] for idx in top_class],
        }
    )

## Running Model on Cumulative Headline Data

In [53]:
import pandas as pd

# Processed dataset to score.
# Alternative input kept for reference: "../data/processed/totalClimateData.parquet"
path_to_data = "../data/processed/climateWithEmbeds.parquet"

# Load the whole parquet table; every row is scored below, not just a sample.
data = pd.read_parquet(path_to_data)

# The column at position 2 supplies the text to score
# (presumably the headline column -- confirm against the dataset schema).
headline_column = data.iloc[:, 2]
headlines = headline_column.tolist()

# Last expression of the cell, so the resulting frame is rendered as output.
calculate_sentiment(headlines)

Unnamed: 0,sentiment_score,sentiment_label
0,0.713476,positive
1,0.621377,negative
2,0.908332,neutral
3,0.752089,neutral
4,0.702635,negative
...,...,...
9512,0.751828,negative
9513,0.538184,neutral
9514,0.763771,neutral
9515,0.804313,negative


In [None]:
# score every headline (runs the model over the full list once in this cell)
sentiment = calculate_sentiment(headlines)

# calculate_sentiment returns its rows in input order on a fresh 0..n-1 index.
# pd.concat(axis=1) aligns on index labels, so adopt data's index first; this
# pairs rows by position and raises if the two lengths ever disagree.
sentiment.index = data.index

# merge the sentiment score and label with the original data; columns left by
# a previous run of this cell are dropped first so re-running does not
# produce duplicate column names
data = pd.concat(
    [data.drop(columns=list(sentiment.columns), errors="ignore"), sentiment],
    axis=1,
)

# save the data with sentiment score and label
data.to_parquet("../data/processed/climateWithSentiment.parquet")