# Sentiment Analysis on Tweets

Dataset used: [tweet_eval dataset (emotion subset)](https://huggingface.co/datasets/cardiffnlp/tweet_eval)

## Imports

In [35]:
import time
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
import html
import copy

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Loading & Preprocessing Data

### Data loading

In [3]:
ds_train = load_dataset("cardiffnlp/tweet_eval", "emotion", split='train')
ds_test = load_dataset("cardiffnlp/tweet_eval", "emotion", split='test')
ds_val = load_dataset("cardiffnlp/tweet_eval", "emotion", split='validation')

In [4]:
ds_train[0]

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'label': 2}

### Preprocessing

preprocessing ideas:

Replace common contractions (e.g., "don't" → "do not") to improve tokenization.

Remove special characters and punctuation that don't contribute to emotion.

Normalizing all tweets to lowercase

In [5]:
def lower_text(example):
    example["text"] = str(example["text"]).lower()
    return example

ds_train = ds_train.map(lower_text)
ds_test = ds_test.map(lower_text)
ds_val = ds_val.map(lower_text)

In [6]:
def apply_preprocess(example):
    text = example['text']
    
    new_text = []

    # change all tags to users to "@user" and all links to "http"
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)

    cleaned_text = " ".join(new_text)
    cleaned_text = html.unescape(cleaned_text)

    example['text'] = cleaned_text

    return example
 
ds_train = ds_train.map(apply_preprocess)
ds_val = ds_val.map(apply_preprocess)
ds_test = ds_test.map(apply_preprocess)

Map: 100%|██████████| 3257/3257 [00:00<00:00, 13093.37 examples/s]
Map: 100%|██████████| 374/374 [00:00<00:00, 9562.20 examples/s]
Map: 100%|██████████| 1421/1421 [00:00<00:00, 14625.78 examples/s]


### Tokenize the features

The label is already an integer, so only the text (the tweets themselves) needs to be tokenized

In [25]:
import copy

In [26]:
BERT_MODEL = "google-bert/bert-base-uncased"
ROBERTA_MODEL = "cardiffnlp/twitter-roberta-base"

bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, use_fast=False) # just used what hugging face docs had, we can change this tho
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL, use_fast=False)

def bert_tokenization(example): return bert_tokenizer(example['text'], padding='max_length', max_length=128, truncation=True)
def roberta_tokenization(example): return roberta_tokenizer(example['text'], padding='max_length', max_length=128, truncation=True)

# create copies of dataset for BERT and ROBERTA tokenization
ds_train_bert = copy.deepcopy(ds_train)
ds_test_bert = copy.deepcopy(ds_test)
ds_val_bert = copy.deepcopy(ds_val)

ds_train_roberta = copy.deepcopy(ds_train)
ds_test_roberta = copy.deepcopy(ds_test)
ds_val_roberta = copy.deepcopy(ds_val)

#### BERT MODEL

In [27]:
ds_train_tokenized_bert = ds_train_bert.map(bert_tokenization, batched=True)
ds_test_tokenized_bert = ds_test_bert.map(bert_tokenization, batched=True)
ds_val_tokenized_bert = ds_val_bert.map(bert_tokenization, batched=True)

Change format of BERT tokenized datasets into tensors, so that we can use PyTorch

The `input_ids`, `token_type_ids`, and `attention_mask` columns will be the actual inputs to the model

In [28]:
ds_train_tokenized_bert.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], device=device)
ds_test_tokenized_bert.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], device=device)
ds_val_tokenized_bert.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], device=device)

ds_train_tokenized_bert.format # outputting some metadata of the tokenized training set, formatted for pytorch

{'type': 'torch',
 'format_kwargs': {'device': device(type='cpu')},
 'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
 'output_all_columns': False}

#### ROBERTA MODEL

In [29]:
ds_train_tokenized_roberta = ds_train_roberta.map(roberta_tokenizer, batched=True)
ds_test_tokenized_roberta = ds_test_roberta.map(roberta_tokenizer, batched=True)
ds_val_tokenized_roberta = ds_val_roberta.map(roberta_tokenizer, batched=True)

# changing format of ROBERTA tokenized datasets into tensors for Pytorch
ds_train_tokenized_roberta.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], device=device)
ds_test_tokenized_roberta.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], device=device)
ds_val_tokenized_roberta.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'], device=device)

Map:   0%|          | 0/3257 [00:00<?, ? examples/s]


ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).

In [34]:
ds_train_bert['text']

Column(["“worry is a down payment on a problem you may never have'. \xa0joyce meyer.  #motivation #leadership #worry", "my roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs", "no but that's so cute. atsu was probably shy about photos before but cherry helped her out uwu", "rooneys fucking untouchable isn't he? been fucking dreadful again, depay has looked decent(ish)tonight", "it's pretty depressing when u hit pan on ur favourite highlighter"])

Column(["“worry is a down payment on a problem you may never have'. \xa0joyce meyer.  #motivation #leadership #worry", "my roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs", "no but that's so cute. atsu was probably shy about photos before but cherry helped her out uwu", "rooneys fucking untouchable isn't he? been fucking dreadful again, depay has looked decent(ish)tonight", "it's pretty depressing when u hit pan on ur favourite highlighter"])