# Sentiment analysis

In [None]:
import pandas as pd
import html
from transformers import pipeline
from tqdm.notebook import tqdm
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Load the final Mastodon dataset produced by the data-collection step.
mastodon_data = pd.read_feather(
    path="../1_data_collection/data/8_mastodon_final_data.feather",
)

In [None]:
# Replace HTML character references (e.g. "&amp;") in the post text with the
# characters they stand for before the text is scored.
unescaped_text = mastodon_data["text_combine"].apply(html.unescape)
mastodon_data["text_combine"] = unescaped_text

## Run RoBERTa sentiment pipeline

In [None]:
# Human-readable names for the raw label ids emitted by the model.
label_map = {
    "LABEL_0": "Negative",
    "LABEL_1": "Neutral",
    "LABEL_2": "Positive",
}

# Text-classification pipeline used to score each post.
# top_k=None is passed so that a score is returned for every label rather
# than only the best one (the scoring loop reads all of them).
# truncation/max_length are forwarded to the tokenizer; presumably this caps
# inputs at 512 tokens -- confirm against the transformers documentation.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    top_k=None,
    truncation=True,
    max_length=512,
)

In [None]:
# Score every post and collect one plain-dict record per post.
#
# Records are gathered in a list and turned into a DataFrame once at the end,
# instead of building a one-row DataFrame per post and concatenating them all.
# This avoids the per-row DataFrame overhead, gives the result a clean
# 0..n-1 index, and fixes the column order explicitly rather than letting it
# depend on the order in which the pipeline happened to return the scores for
# the first post. An empty input yields an empty frame with the expected
# columns instead of an error from concatenating nothing.
score_columns = list(label_map.values())
sentiment_records = []
post_fields = zip(
    mastodon_data["text_combine"],
    mastodon_data["id"],
    mastodon_data["gl_detect_lang"],
)
for text, post_id, detect_lang in tqdm(post_fields, total=len(mastodon_data)):
    # For a single input string the result is wrapped in a list; element 0
    # holds one {"label": ..., "score": ...} dict per sentiment class.
    label_scores = sentiment_pipeline(text)[0]

    record = {
        label_map[res_dict["label"]]: res_dict["score"]
        for res_dict in label_scores
    }
    record["post_id"] = post_id
    record["gl_detect_lang"] = detect_lang
    sentiment_records.append(record)

# One row per post: the three class scores followed by the post identifiers.
sentiment_results = pd.DataFrame(
    sentiment_records,
    columns=[*score_columns, "post_id", "gl_detect_lang"],
)

In [None]:
# Single signed score per post: Positive score minus Negative score.
# (Despite the column name, this is a difference, not a sum.)
positive_score = sentiment_results["Positive"]
negative_score = sentiment_results["Negative"]
sentiment_results["sum_sentiment"] = positive_score.sub(negative_score)

In [None]:
# Label each post with the name of its highest-scoring sentiment column.
class_scores = sentiment_results[["Negative", "Neutral", "Positive"]]
sentiment_results["sentiment_class"] = class_scores.idxmax(axis="columns")

In [None]:
# Persist the per-post sentiment scores; the DataFrame index is not written.
sentiment_results.to_csv(
    path_or_buf="./data/1_sentiment_results.csv",
    index=False,
)