# Sentiment Analysis Pipeline

In [None]:
# !python -m pip install boto3 python-dotenv transformers torch joblib

In [None]:
import glob
import math
import os
import sys
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from dotenv import load_dotenv
from scipy.special import softmax
from tqdm import tqdm

# Roberta model
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification


### 1. Set Up S3 on VS Code (recommended) or Google Drive

In [None]:
# 1. Running on VSCode:
# Path of the .env file handed to load_dotenv, relative to the working directory.
env_path = "../Scripts/.env"

# Add the sibling Scripts folder to the module search path before importing `s3`
# (presumably that is where s3.py lives - confirm).
scripts_folder = os.path.join(os.getcwd(), "..", "Scripts")
sys.path.append(scripts_folder)
import s3

load_dotenv(env_path)


# 2. Running on Google Colab (with Google Drive mounted):
# from google.colab import drive
# drive.mount('/content/drive')
# env_path = "<path_here>"
# load_dotenv(env_path)
# %cd /content/drive/MyDrive/Group_project/Code/
# import s3

### 2. Download Required Data

In [None]:
# S3 prefix of the dataset to score. Another dataset in the same format can be
# used instead, e.g. "processed/reddit/comments_artifacts/".
data_to_download = "processed/news/gnews_artifacts/"
# Prefix that the upload step rewrites into its destination. It is derived from
# data_to_download so that switching datasets cannot leave the upload pointing
# at a different prefix than the one that was actually processed.
file_path = data_to_download

s3.download_all(data_to_download)

# Read every downloaded file, keeping its local path alongside the frame.
dfs = [
    (f, pd.read_parquet(f)) for f in s3.s3_to_local_path(data_to_download).glob("*")
]
print(f"{sum(len(df) for _, df in dfs)} rows.")


### 3. Set Up Runtime

In [None]:
# Select the compute device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

### 4. Define Model and Related Functions

In [None]:
# Checkpoint identifier shared by the tokenizer and the classification model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# Load the classifier, then move it onto the selected device
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)

# Function to get sentiment scores for a chunk
def get_sentiment_scores(chunk):
    """Return softmax-normalised sentiment scores for a single piece of text.

    Args:
        chunk: Text to score. Input longer than 512 tokens (special tokens
            included) is truncated so the model call cannot overflow.

    Returns:
        A numpy array holding the softmax of the model output for `chunk`.
        `analyse_large_text` in this file reads indices 0/1/2 of the same
        model output as negative/neutral/positive.
    """
    # The inputs must live on the same device as the model, otherwise the
    # forward pass fails when the model has been moved to a GPU.
    encoded_text = tokenizer(
        chunk, return_tensors='pt', truncation=True, max_length=512
    ).to(model.device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_text)
    # Extract the scores and bring them back to host memory before converting.
    scores = output[0][0].detach().cpu().numpy()
    # Normalise the scores
    return softmax(scores)

# Function to score text of any length: the model accepts at most 512 tokens per call,
# so longer text is split into overlapping windows that are scored separately
def analyse_large_text(text):
    """Score `text` by averaging the model output over overlapping token windows.

    The text is tokenised once and cut into windows that fit the 512-token
    limit with the special tokens included. Windows start every 256 tokens, so
    consecutive windows overlap. Each window is scored on its own and the
    results are combined as an average weighted by window length in tokens.

    The averaged values are the model's raw outputs; no softmax is applied.
    NOTE(review): `get_sentiment_scores` does apply softmax - confirm that
    raw outputs are intended here.

    Args:
        text: The text to score.

    Returns:
        A dict with the keys 'roberta_pos', 'roberta_neu', 'roberta_neg'
        (averaged output at index 2, 1 and 0), 'roberta_compound'
        (pos minus neg) and 'roberta_normalised_compound'
        (compound / sqrt(compound**2 + 20), which lies strictly between
        -1 and 1). Every value is NaN when `text` is not a string or
        tokenises to nothing.
    """
    result_keys = (
        'roberta_pos',
        'roberta_neu',
        'roberta_neg',
        'roberta_compound',
        'roberta_normalised_compound',
    )

    # Missing values (None / NaN) cannot be tokenised; report them as NaN
    # scores instead of aborting the whole run on one bad row.
    if not isinstance(text, str):
        return dict.fromkeys(result_keys, math.nan)

    tokens = tokenizer.encode(text, add_special_tokens=False)  # Token IDs without special tokens
    if not tokens:
        # Nothing to score; averaging zero windows would raise.
        return dict.fromkeys(result_keys, math.nan)

    max_length = 512
    # Leave room for the special tokens added when a window is re-encoded, so
    # the window is not silently truncated and its weight matches its content.
    chunk_size = max_length - tokenizer.num_special_tokens_to_add()
    # Overlap between chunks to avoid missing context between chunks
    stride = 256
    sentiment_scores = []
    token_lengths = []

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for i in range(0, len(tokens), stride):
            chunk = tokens[i:i + chunk_size]
            chunk_text = tokenizer.decode(chunk)  # Decode back to text
            # Truncation stays on as a safety net in case decode/re-encode
            # does not reproduce exactly the same number of tokens.
            inputs = tokenizer(
                chunk_text, return_tensors='pt', truncation=True, max_length=max_length
            ).to(device)
            output = model(**inputs)
            scores = output[0][0].detach().cpu().numpy()
            sentiment_scores.append(scores)
            token_lengths.append(len(chunk))
            # This window already reaches the end of the text: any further
            # window would lie entirely inside it and double-count the tail.
            if i + chunk_size >= len(tokens):
                break

    # Weighted average of scores by chunk length
    sentiment_scores = np.array(sentiment_scores)
    weighted_scores = np.average(sentiment_scores, axis=0, weights=token_lengths)
    compound_score = weighted_scores[2] - weighted_scores[0]
    normalised_compound = compound_score / math.sqrt(compound_score**2 + 20)

    # Return final aggregated sentiment
    return {
        'roberta_pos': weighted_scores[2],
        'roberta_neu': weighted_scores[1],
        'roberta_neg': weighted_scores[0],
        'roberta_compound': compound_score,
        'roberta_normalised_compound': normalised_compound,
    }

### 5. Apply Model on Data

In [None]:
# Score every loaded file and write the result to a parallel output path.
# Files whose output already exists are skipped, so the cell can be re-run.
score_columns = ['roberta_pos', 'roberta_neu', 'roberta_neg', 'roberta_compound', 'roberta_normalised_compound']
all_paths = []
for path, df in tqdm(dfs):
    out_path = Path(str(path).replace("artifacts", "twitter_roberta")) # Assumes that there is 'artifacts' in input path
    out_path.parent.mkdir(exist_ok=True, parents=True)
    all_paths.append(out_path)
    if out_path.exists():
        continue
    print(out_path)
    # Build all score columns in one DataFrame instead of one pd.Series per
    # row; naming the columns explicitly also keeps an empty input frame valid.
    scores = pd.DataFrame(
        [analyse_large_text(text) for text in df['text']],
        index=df.index,
        columns=score_columns,
        dtype=float,
    )
    df[score_columns] = scores
    df.to_parquet(out_path)

### 6. Upload to S3

In [None]:
# Upload the results: same prefix as the input, with "artifacts" replaced by "twitter_roberta"
upload_prefix = file_path.replace("artifacts", "twitter_roberta")
s3.upload_all(upload_prefix)