<div style="text-align: center; color: red;"><h2>NLP Project: Amazon Reviews Sentiment Analysis by GenMinds (Deep Learning notebook)</h2></div>

In this notebook, we will fine-tune a pre-trained BERT model and see how well it performs.

# Overview

$\textbf{BERT (Bidirectional Encoder Representations from Transformers)}$ is a deep learning model designed for natural language understanding tasks. It uses a Transformer encoder architecture to learn contextual representations of text by considering both the left and right context of each word. Pre-trained on large corpora through masked language modeling, BERT can be fine-tuned on specific tasks such as classification, question answering, and named entity recognition.

<figure style="width: 600px; height: 500px; display: block; margin: auto;">
    <img src="https://miro.medium.com/v2/resize:fit:876/0*ViwaI3Vvbnd-CJSQ.png">
</figure>

# Imports

In [None]:
import pandas as pd

import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
import torch.optim as optim
from sklearn.metrics import accuracy_score

from string import punctuation
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

# Load the dataset

In [1]:
df = pd.read_csv("./GenMinds_Balanced_Dataset_All_Beauty.csv", index_col=0)
df.head()

Unnamed: 0,review_text,label
0,one star bad,0
1,three stars okay,1
2,missing bottle defective sprayer instead 4 bot...,0
3,disappointed little disappointed product case ...,1
4,don’t stay began peeling 20 minutes putting on,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 168848 entries, 0 to 168920
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   review_text  168848 non-null  object
 1   label        168848 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.9+ MB


In [None]:
# Quick index reset (although unnecessary)
df.reset_index(drop=True, inplace=True)

# Convert reviews and labels to lists

In [5]:
reviews = df["review_text"].to_list()
labels = df["label"].to_list()

# BERT


### Loading and adjusting prediction header

In [None]:
# model & tokenizer
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# PyTorch model
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, hidden_size=128, num_classes=3):
        super(BERTClassifier, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(bert_model.config.hidden_size, hidden_size)  # Fully connected layer
        self.dropout = nn.Dropout(0.3)
        self.output = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        # Get BERT's output
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooled output (representation of the entire sequence)
        pooled_output = output.pooler_output
        # Pass through fully connected layers
        x = self.fc(pooled_output)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.output(x)
        x = self.softmax(x)
        return x

# Initialize
model = BERTClassifier(bert_model)

# Check model architecture
print(model)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### Tokenization

In [7]:
def tokenize_texts(texts, max_len=128):
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )

# Tokenize train
train_tokens = tokenize_texts(reviews)

### Tensor conversion

In [None]:
# Convert labels to tensors
labels_tensor = torch.tensor(labels)

# Create PyTorch Dataset class
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Create dataset
train_dataset = TextDataset(train_tokens['input_ids'], train_tokens['attention_mask'], labels_tensor)

# Create data loader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

### Optimizer and Loss

In [None]:
# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
epochs = 5
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### Training

In [10]:
# Training process
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(input_ids, attention_mask)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update the weights
        
        total_loss += loss.item()
        _, preds = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_samples += labels.size(0)
        avg_loss = total_loss / total_samples

    epoch_accuracy = correct_predictions / total_samples
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")

Epoch 1/5 - Loss: 0.0241 - Accuracy: 0.7736
Epoch 2/5 - Loss: 0.0230 - Accuracy: 0.8100
Epoch 3/5 - Loss: 0.0225 - Accuracy: 0.8283
Epoch 4/5 - Loss: 0.0220 - Accuracy: 0.8450
Epoch 5/5 - Loss: 0.0217 - Accuracy: 0.8550


### Evaluation

In [11]:
# Evaluation process (on test set)
model.eval()
preds, true_labels = [], []

with torch.no_grad():
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        _, predicted = torch.max(outputs, dim=1)

        preds.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Compute accuracy on test set
test_accuracy = accuracy_score(true_labels, preds)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.8770


# Text pre-processing functions

The following functions exist in `utils.py` and can be imported as follows:

```python
from utils import *
```

In case you don't have access to the file, you can execute the cell below to define them.

In [None]:
# These functions exist in utils.py

def remove_stopwords(data: str):
    stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    stop_words
    stop_words.discard('not') # 'not', 'but' and 'such' are important to keep
    stop_words.discard('such')
    stop_words.discard('but')
    stop_words.update(['it', 'br', "it's"])
    return ' '.join([word for word in data.split() if word not in stop_words])

def remove_punctuation(data: str):
    clean_text = data.translate(str.maketrans('', '', punctuation))
    return clean_text

def remove_emojis(data: str):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                    "]+", re.UNICODE)
    return re.sub(emoj, '', data)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Testing on new reviews

In [13]:
text_list = ["I wasn't expecting much, but this exceeded my expectations." 
             "The build quality feels premium, and the instructions were "
             "easy to follow. Setup took under 10 minutes, and it's been "
             "working flawlessly ever since. I've used it daily without "
             "any issues. Definitely worth the price and would happily "
             "recommend it to anyone looking for a reliable option in this category.", # Positive

             "It works as described, but there's nothing particularly "
             "special about it. The performance is fine for basic tasks, "
             "though I did notice some minor lag during more demanding use. "
             "Packaging was a bit flimsy, but the item arrived intact. "
             "If you're looking for something simple and functional, "
             "this will do, just don't expect premium quality or standout features.", # Neutral

             "Pretty disappointed overall. The product didn't match the "
             "pictures and felt cheaply made. It stopped working properly "
             "after just a few uses, and customer service was slow to respond. "
             "I tried troubleshooting it myself, but nothing helped. For the price, "
             "I expected a much better experience. Wouldn't buy again, and I'd "
             "recommend looking elsewhere if you need something dependable.", # Negative
            ]

text_list = [remove_stopwords(text.lower()) for text in text_list]
text_list = [remove_punctuation(text) for text in text_list]

### Prediction function

In [None]:
def predict_texts(model, texts: list[str], tokenizer, max_len=100, device="cpu"):
    model.eval() # Set model to evaluation mode

    # Tokenize the input texts
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=max_len)
    inputs.pop("token_type_ids", None)

    # Move inputs to the correct device
    inputs = {key: val.to(device) for key, val in inputs.items()}
    model.to(device)

    with torch.no_grad():
        logits = model(**inputs)  # direct output from the model
        preds = torch.argmax(logits, dim=1)

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"} # Map indices to labels
    predicted_labels = [label_map[int(p)] for p in preds]

    return predicted_labels

In [15]:
predict_texts(model, text_list, tokenizer)

['Positive', 'Neutral', 'Negative']

In [16]:
predict_texts(model, ["This is an excellent shampoo! It's very good for my hair.",], tokenizer)

['Positive']

In [None]:
predict_texts(model, ["The product was okay overall. It didn't cause any irritation, but I also didn't notice any significant difference after using it for a few weeks. The texture is nice and it absorbs well, but the scent might not be for everyone.",], tokenizer)

['Neutral']

# Saving

In [18]:
torch.save(model.state_dict(), "BERT_fine-tuned.pth")
tokenizer.save_pretrained("bert_tokenizer")

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

# Model reloading

In [None]:
# Reload tokenizer
tokenizer = BertTokenizer.from_pretrained("bert_tokenizer")

# Reload BERT backbone
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Rebuild the classifier
class BERTClassifier(nn.Module):
    def __init__(self, bert_model, hidden_size=128, num_classes=3):
        super(BERTClassifier, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(bert_model.config.hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.3)
        self.output = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = output.pooler_output
        x = self.fc(pooled_output)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.output(x)
        x = self.softmax(x)
        return x

# Initialize and load weights
model = BERTClassifier(bert_model)
model.load_state_dict(torch.load("BERT_fine-tuned.pth"))
model.eval()

  model.load_state_dict(torch.load("BERT_fine-tuned.pth"))


BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis