# Sentiment Analysis on Tweets

Dataset used: [tweet_eval dataset (emotion subset)](https://huggingface.co/datasets/cardiffnlp/tweet_eval)

## Imports

In [17]:
import time
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
import html
import copy

In [18]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

## Loading & Preprocessing Data

### Data loading

In [19]:
# Load the train, test and validation splits of the tweet_eval "emotion" subset,
# one load_dataset call per split, in that order.
ds_train, ds_test, ds_val = [
    load_dataset("cardiffnlp/tweet_eval", "emotion", split=split_name)
    for split_name in ("train", "test", "validation")
]

In [20]:
# Inspect the first training example to see the available fields.
ds_train[0]

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'label': 2}

### Preprocessing

Preprocessing ideas:

Replace common contractions (e.g., "don't" → "do not") to improve tokenization.

Remove special characters and punctuation that don't contribute to emotion.

Normalize all tweets to lowercase.

In [21]:
def lower_text(example):
    """Lowercase the ``text`` field of one example in place and return the example.

    The value is passed through ``str()`` before lowercasing, so a non-string
    value is stringified rather than raising.
    """
    lowered = str(example["text"]).lower()
    example["text"] = lowered
    return example

# Apply the lowercasing step to every split (train, test, validation).
ds_train, ds_test, ds_val = [
    split.map(lower_text) for split in (ds_train, ds_test, ds_val)
]

In [22]:
def apply_preprocess(example):
    """Decode HTML entities in a tweet, then mask user mentions and links.

    Args:
        example: Mapping with a string ``text`` field. It is updated in place.

    Returns:
        The same ``example`` object, with ``text`` replaced by the cleaned tweet:
        every space-separated token that starts with ``@`` (and is longer than
        one character) becomes ``@user``, and every token that starts with
        ``http`` becomes ``http``.
    """
    # Decode entities *before* masking so that a mention or link hidden behind
    # an entity (e.g. "&#64;name") is visible to the checks below.
    text = html.unescape(example['text'])

    masked_tokens = []
    # Split on single spaces only, so the join below reproduces the original
    # spacing exactly (runs of spaces yield empty tokens that are kept as-is).
    for token in text.split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked_tokens.append(token)

    example['text'] = " ".join(masked_tokens)
    return example
 
# Run the mention/link/entity cleanup over every split (train, validation, test).
ds_train, ds_val, ds_test = [
    split.map(apply_preprocess) for split in (ds_train, ds_val, ds_test)
]

### Tokenize the features

The label is already an integer, so only the text (the tweets themselves) needs to be tokenized

In [23]:
import copy

In [24]:
# Hugging Face Hub checkpoint identifiers for the two models being compared.
BERT_MODEL = "google-bert/bert-base-uncased"
ROBERTA_MODEL = "cardiffnlp/twitter-roberta-base"

# One tokenizer per checkpoint. use_fast=False is passed to both loaders; the
# setting was taken from the Hugging Face docs and can be changed if desired.
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=False)
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL, use_fast=False)

def bert_tokenization(example):
    """Tokenize ``example['text']`` with the BERT tokenizer, padded/truncated to 128 tokens."""
    return bert_tokenizer(
        example['text'],
        truncation=True,
        max_length=128,
        padding='max_length',
    )


def roberta_tokenization(example):
    """Tokenize ``example['text']`` with the RoBERTa tokenizer, padded/truncated to 128 tokens."""
    return roberta_tokenizer(
        example['text'],
        truncation=True,
        max_length=128,
        padding='max_length',
    )

# Give each model its own independent deep copy of every split before tokenization.
ds_train_bert, ds_test_bert, ds_val_bert = [
    copy.deepcopy(split) for split in (ds_train, ds_test, ds_val)
]

ds_train_roberta, ds_test_roberta, ds_val_roberta = [
    copy.deepcopy(split) for split in (ds_train, ds_test, ds_val)
]

#### BERT MODEL

In [25]:
# Tokenize each BERT split in batches (train, test, validation).
ds_train_tokenized_bert, ds_test_tokenized_bert, ds_val_tokenized_bert = [
    split.map(bert_tokenization, batched=True)
    for split in (ds_train_bert, ds_test_bert, ds_val_bert)
]

Convert the BERT-tokenized datasets to PyTorch tensors so that they can be used directly with PyTorch.

The `input_ids`, `token_type_ids`, and `attention_mask` columns will be the actual inputs to the model

In [26]:
# Expose the model inputs and the label of every BERT split as PyTorch tensors.
for tokenized_split in (
    ds_train_tokenized_bert,
    ds_test_tokenized_bert,
    ds_val_tokenized_bert,
):
    tokenized_split.set_format(
        type='torch',
        columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    )

# Show the format metadata of the tokenized training split.
ds_train_tokenized_bert.format

{'type': 'torch',
 'format_kwargs': {},
 'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
 'output_all_columns': False}

In [27]:
# 1. Predicate used to pick out rows whose text is missing
def find_none(example):
    """Return True when the example's ``text`` field is ``None``."""
    tweet = example['text']
    return tweet is None

# 2. Apply it to the training set
# NOTE(review): ds_train has already been mapped through lower_text, which wraps
# the text in str(); a missing value would therefore appear here as the string
# "none" rather than None. TODO confirm whether this check should run on the
# freshly loaded data instead.
bad_rows = ds_train.filter(find_none)

# Report the total number of training rows next to the number flagged by the filter.
print(f"Total rows in train: {len(ds_train)}")
print(f"Rows with None:      {len(bad_rows)}")

Total rows in train: 3257
Rows with None:      0


#### ROBERTA MODEL

In [28]:
# Tokenize each RoBERTa split in batches.
ds_train_tokenized_roberta = ds_train_roberta.map(roberta_tokenization, batched=True)
ds_test_tokenized_roberta = ds_test_roberta.map(roberta_tokenization, batched=True)
ds_val_tokenized_roberta = ds_val_roberta.map(roberta_tokenization, batched=True)

# Change the format of the RoBERTa tokenized datasets into tensors for PyTorch.
# 'label' is exposed alongside the model inputs (as in the BERT cell); without it
# the formatted datasets would return no labels, so no supervised loss or metric
# could be computed from them. Unlike the BERT cell, no 'token_type_ids' column
# is requested here.
ds_train_tokenized_roberta.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
ds_test_tokenized_roberta.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
ds_val_tokenized_roberta.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [32]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each row
            is the argmax of the logits over the last axis.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 makes the "class never predicted" case explicit: the
    # affected per-class score counts as 0 in the macro average (the same value
    # the default produces) without emitting an UndefinedMetricWarning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, predictions)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Banner so the BERT run is easy to spot in the cell output.
banner = "=" * 30
print("\n" + banner)
print(" TRAINING MODEL 1: BERT")
print(banner)

# Sequence-classification head with four output classes on top of the BERT checkpoint.
# NOTE(review): 4 presumably matches the number of emotion labels — verify against the dataset.
model_bert = AutoModelForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=4,
)

args_bert = TrainingArguments(
    output_dir="./results_bert",
    # Optimisation settings
    num_train_epochs=3,              # three passes over the training set
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=16,  # drop to 8 on a CUDA out-of-memory error
    per_device_eval_batch_size=64,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",                # no external logging (e.g. wandb); keeps output clean
)

trainer_bert = Trainer(
    model=model_bert,
    args=args_bert,
    compute_metrics=compute_metrics,
    train_dataset=ds_train_tokenized_bert,
    eval_dataset=ds_val_tokenized_bert,
)

trainer_bert.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



 TRAINING MODEL 1: BERT


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.633448,0.786096,0.633162,0.7467,0.6413
2,No log,0.5813,0.791444,0.720166,0.738777,0.709041
3,0.505600,0.823054,0.791444,0.72393,0.735187,0.718029


TrainOutput(global_step=612, training_loss=0.43825701327105754, metrics={'train_runtime': 39.244, 'train_samples_per_second': 248.981, 'train_steps_per_second': 15.595, 'total_flos': 642726071829504.0, 'train_loss': 0.43825701327105754, 'epoch': 3.0})