# Sentiment Analysis

**Author**: Chris Oswald

**Course**: CS676/ECE689 Advanced Topics in Deep Learning (Spring 2024)

## Setup

In [None]:
# Colab setup: install the Hugging Face libraries used in this notebook.
# `-U` upgrades accelerate/transformers to the newest available release, so the
# installed versions can differ between runs; pin them (pkg==x.y.z) if exact
# reproducibility matters.
!pip install datasets
!pip install accelerate -U
!pip install transformers -U

In [None]:
# Load packages
import json
import os
import math
import time
from typing import Tuple

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
from torchvision.datasets import MNIST
from torchvision import transforms
from torch.utils.data import DataLoader

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

# Drive folders for saved models and for data
models_dir = "/content/drive/MyDrive/Colab Notebooks/Models"
data_dir = "/content/drive/MyDrive/Colab Notebooks/Data"

# Create both folders; exist_ok=True keeps this cell safe to re-run
for drive_folder in (models_dir, data_dir):
    os.makedirs(drive_folder, exist_ok=True)

## Question 4: BERT for sentiment analysis

For the last problem, we are going to learn how to use the Hugging Face `transformers` library to train a simple BERT classifier for sentiment analysis.

We will use the IMDB dataset. You can load it from the Hugging Face Hub with the following commands:

```
from datasets import load_dataset
imdb = load_dataset("imdb")
```
To access BERT, use
```
from transformers import BertForSequenceClassification
#load pre-trained BERT
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(label_dict),
                                                      output_attentions = False,
                                                      output_hidden_states = False)
```
To reduce training complexity, you can choose to freeze the weights of the pretrained BERT model and train only the classifier. The classifier should have a minimum of 3 layers.
You might find https://huggingface.co/blog/sentiment-analysis-python and https://github.com/baotramduong/Twitter-Sentiment-Analysis-with-Deep-Learning-using-BERT/blob/main/Notebook.ipynb helpful.



In [None]:
from datasets import load_dataset, load_metric
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)

In [None]:
# Import data
# load_dataset fetches the IMDB reviews; the download log below shows the
# train (25,000), test (25,000) and unsupervised (50,000) splits being generated.
imdb = load_dataset("imdb")

# Load BERT
# Pretrained bert-base-uncased encoder with a 2-class sequence-classification
# head. The head's weights are newly initialized (see the warning printed below),
# so the model has to be trained before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Preprocess text data
subset_size = 10000  # training examples kept; the eval subset is 10% of this
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def preprocess_func(example):
    """Tokenize one review, truncating it to the tokenizer's maximum length.

    No padding is requested here: `map` is called without batching, so each
    call sees a single example and there is nothing to pad against. Batches
    are padded later by the DataCollatorWithPadding passed to the Trainer.
    """
    return tokenizer(example['text'], truncation=True)


# Shuffle with a fixed seed, then subsample *before* tokenizing so that only the
# rows that are actually used get tokenized. The selected rows and their order
# are the same as when selecting after the map; only the wasted work is removed.
train_data = (
    imdb['train']
    .shuffle(seed=999)
    .select(range(subset_size))
    .map(preprocess_func)
)
test_data = (
    imdb['test']
    .shuffle(seed=999)
    .select(range(int(subset_size * 0.1)))
    .map(preprocess_func)
)

In [None]:
# Create data collator: pads the examples of each batch and returns them as
# PyTorch tensors (see padding=True and return_tensors='pt' in the repr below)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator  # bare expression so the notebook displays the collator's configuration

DataCollatorWithPadding(tokenizer=BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [None]:
# Define metrics function
def compute_metrics(pred_tuple):
    """Compute binary-classification metrics for the Hugging Face `Trainer`.

    The metrics are computed directly with NumPy instead of through
    `datasets.load_metric`, which re-loaded four metric scripts on every
    evaluation pass and has been deprecated and removed from recent
    `datasets` releases.

    Args:
        pred_tuple: A `(logits, labels)` pair (anything that unpacks into two
            items). `logits` holds one row of class scores per example and
            `labels` the matching integer class ids. Class 1 is treated as
            the positive class.

    Returns:
        dict with float values under the keys 'accuracy', 'f1', 'precision'
        and 'recall'. Any metric whose denominator is zero is reported as 0.0.
    """
    logits, labels = pred_tuple
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    predicted_pos = predictions == 1
    actual_pos = labels == 1

    # Confusion-matrix counts for the positive class
    true_pos = int(np.sum(predicted_pos & actual_pos))
    false_pos = int(np.sum(predicted_pos & ~actual_pos))
    false_neg = int(np.sum(~predicted_pos & actual_pos))
    n_correct = int(np.sum(predictions == labels))

    def _ratio(numerator, denominator):
        # Guard against empty denominators (e.g. no positive predictions)
        return float(numerator / denominator) if denominator else 0.0

    return {
        'accuracy': _ratio(n_correct, labels.size),
        # F1 = 2PR / (P + R), written in terms of counts to avoid a second guard
        'f1': _ratio(2 * true_pos, 2 * true_pos + false_pos + false_neg),
        'precision': _ratio(true_pos, true_pos + false_pos),
        'recall': _ratio(true_pos, true_pos + false_neg),
    }

In [None]:
# Freeze the pretrained BERT encoder: switch off requires_grad for every
# parameter of the base model so only the classifier head is trained.
# The trailing semicolon keeps the notebook from echoing the returned module.
model.base_model.requires_grad_(False);

In [None]:
# Add additional layers to classifier
class CustomClassifier(nn.Module):
    """Three-layer MLP classification head.

    Two hidden blocks (Linear -> ReLU -> Dropout) followed by a final linear
    layer that produces one raw score per label; no softmax is applied.

    Args:
        n_labels: Number of output classes (width of the final linear layer).
        dropout: Dropout probability applied after each hidden ReLU.
        hidden_size: Width of the input features and of both hidden layers.
            Defaults to 768, the feature width used by this notebook's BERT model.
    """

    def __init__(self, n_labels: int, dropout: float, hidden_size: int = 768):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            # Final projection to per-label scores (raw logits)
            nn.Linear(hidden_size, n_labels),
        )

    def forward(self, x):
        """Map features of shape (..., hidden_size) to scores of shape (..., n_labels)."""
        return self.classifier(x)

# Swap BERT's stock classification head (the `classifier.weight`/`classifier.bias`
# reported as newly initialized at load time) for the deeper custom head;
# n_labels=2 matches the num_labels=2 the model was loaded with.
model.classifier = CustomClassifier(n_labels=2, dropout=0.1)

In [None]:
# Display the module tree to inspect the model architecture
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Define training args and trainer
batch_size = 128
n_epochs = 10
lr = 2e-5

# Checkpoints and logs go under <cwd>/data/HFTransformer. The path is built
# directly so that the `data_dir` defined in the setup cell (the Google Drive
# folder) is not overwritten when this cell runs.
hf_data_dir = os.path.join(os.getcwd(), 'data', 'HFTransformer')
os.makedirs(hf_data_dir, exist_ok=True)

train_args = TrainingArguments(
    output_dir=hf_data_dir,
    logging_dir=hf_data_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=n_epochs,
    # Evaluate and checkpoint once per epoch. No `save_steps` is passed because
    # a step interval has no effect when saving is epoch-based.
    # NOTE(review): newer transformers releases rename `evaluation_strategy` to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # After training, reload the checkpoint with the best eval F1
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    weight_decay=0.01,
    push_to_hub=False,
)

# NOTE(review): test_data doubles as the evaluation set that drives best-model
# selection, so the reported metrics are not from a fully held-out set.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Train model
# Runs the configured number of epochs; because evaluation is done every epoch,
# the table below reports validation loss and metrics after each one.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.453709,0.814,0.816929,0.780075,0.857438
2,No log,0.45767,0.797,0.805742,0.750446,0.869835
3,No log,0.441484,0.815,0.807892,0.812109,0.803719
4,No log,0.441743,0.817,0.818632,0.786667,0.853306
5,No log,0.438146,0.816,0.815631,0.791829,0.840909
6,No log,0.443539,0.81,0.815534,0.769231,0.867769
7,0.491300,0.436753,0.814,0.816206,0.782197,0.853306
8,0.491300,0.436078,0.813,0.8154,0.780718,0.853306
9,0.491300,0.434504,0.817,0.818272,0.787763,0.85124


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.453709,0.814,0.816929,0.780075,0.857438
2,No log,0.45767,0.797,0.805742,0.750446,0.869835
3,No log,0.441484,0.815,0.807892,0.812109,0.803719
4,No log,0.441743,0.817,0.818632,0.786667,0.853306
5,No log,0.438146,0.816,0.815631,0.791829,0.840909
6,No log,0.443539,0.81,0.815534,0.769231,0.867769
7,0.491300,0.436753,0.814,0.816206,0.782197,0.853306
8,0.491300,0.436078,0.813,0.8154,0.780718,0.853306
9,0.491300,0.434504,0.817,0.818272,0.787763,0.85124
10,0.491300,0.43328,0.82,0.820359,0.793436,0.849174


TrainOutput(global_step=790, training_loss=0.4896511029593552, metrics={'train_runtime': 1105.7261, 'train_samples_per_second': 90.438, 'train_steps_per_second': 0.714, 'total_flos': 2.667264823376525e+16, 'train_loss': 0.4896511029593552, 'epoch': 10.0})

In [None]:
# Evaluate model
# Scores the trained model on the Trainer's eval_dataset (test_data) using
# compute_metrics; the result below matches the epoch-10 row of the training table.
trainer.evaluate()

{'eval_loss': 0.43328022956848145,
 'eval_accuracy': 0.82,
 'eval_f1': 0.8203592814371257,
 'eval_precision': 0.7934362934362934,
 'eval_recall': 0.8491735537190083,
 'eval_runtime': 11.0116,
 'eval_samples_per_second': 90.813,
 'eval_steps_per_second': 0.727,
 'epoch': 10.0}

In [None]:
from transformers import pipeline

# Hand-written reviews used as a quick sanity check of the trained classifier
sample_reviews = [
    'Absolutely loved it',
    'This movie was sick',
    'It was alright',
    'I will never watch it again',
    'They should win awards for their lack of creativity',
]

# Wrap the trained model and its tokenizer in an inference pipeline
model_test = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
model_test(sample_reviews)  # Observation: struggles with 2000s slang ("sick"), otherwise not bad

[{'label': 'LABEL_1', 'score': 0.9104171395301819},
 {'label': 'LABEL_0', 'score': 0.7015535831451416},
 {'label': 'LABEL_1', 'score': 0.7846377491950989},
 {'label': 'LABEL_0', 'score': 0.5126023888587952},
 {'label': 'LABEL_0', 'score': 0.7759935259819031}]

### References

- https://huggingface.co/blog/sentiment-analysis-python
- https://colab.research.google.com/drive/1t-NJadXsPTDT6EWIR0PRzpn5o8oMHzp3?usp=sharing (HuggingFace)
- https://colab.research.google.com/drive/1G4nvWf6NtytiEyiIkYxs03nno5ZupIJn?usp=sharing (HuggingFace)
- https://github.com/huggingface/transformers