# Sentiment Analysis on Tweets

Dataset used: [tweet_eval dataset (emotion subset)](https://huggingface.co/datasets/cardiffnlp/tweet_eval)

## Imports

In [1]:
import time
from datasets import load_dataset
from transformers import AutoTokenizer
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

## Loading & Preprocessing Data

### Data loading

In [3]:
# Load the three splits of the tweet_eval "emotion" subset in one pass;
# the split names are listed in the same order as the targets on the left.
ds_train, ds_test, ds_val = (
    load_dataset("cardiffnlp/tweet_eval", "emotion", split=split_name)
    for split_name in ('train', 'test', 'validation')
)

In [4]:
# Show the first training record to inspect its raw fields.
ds_train[0]

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry",
 'label': 2}

### Tokenize the features

The label is already an integer, so only the text (the tweets themselves) needs to be tokenized.

In [5]:
# Slow (pure-Python) tokenizer for the bert-base-uncased checkpoint, taken from
# the Hugging Face docs example; the checkpoint can be swapped later.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)

def tokenization(example):
    """Tokenize the ``text`` field of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        example: A batch of records; ``example['text']`` is handed to the
            tokenizer.

    Returns:
        The tokenizer output for the batch, which ``map`` adds to the dataset
        as new columns.
    """
    # truncation=True caps every sequence at the tokenizer's maximum model
    # length, so an unusually long text cannot yield more tokens than the
    # model accepts.
    return tokenizer(example['text'], truncation=True)

ds_train_tokenized = ds_train.map(tokenization, batched=True)
ds_test_tokenized = ds_test.map(tokenization, batched=True)
ds_val_tokenized = ds_val.map(tokenization, batched=True)

Change the format of the tokenized datasets so that they return PyTorch tensors.

The `input_ids`, `token_type_ids`, and `attention_mask` columns will be the actual inputs to the model; `label` is the target.

In [6]:
# Columns exposed as torch tensors: the three tokenizer outputs plus the label.
torch_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'label']

# Apply the same torch format to every split (train, test, validation in that order).
for tokenized_split in (ds_train_tokenized, ds_test_tokenized, ds_val_tokenized):
    # Pass a fresh list each time so the splits do not share one list object.
    tokenized_split.set_format(type='torch', columns=list(torch_columns), device=device)

ds_train_tokenized.format # outputting some metadata of the tokenized training set, formatted for pytorch

{'type': 'torch',
 'format_kwargs': {'device': device(type='cpu')},
 'columns': ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
 'output_all_columns': False}

In [7]:
# Show the first tokenized training record as returned under the current format.
ds_train_tokenized[0]

{'label': tensor(2),
 'input_ids': tensor([  101,  1523,  4737,  2003,  1037,  2091,  7909,  2006,  1037,  3291,
          2017,  2089,  2196,  2031,  1005,  1012, 11830, 11527,  1012,  1001,
         14354,  1001,  4105,  1001,  4737,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}