# BERT Sentiment Classifier with PyTorch

In [1]:
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertTokenizer, BertModel
from transformers import BertForSequenceClassification
import torch
import pandas as pd
import numpy as np
import torchmetrics

from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import lightning as pl

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%load_ext watermark

In [3]:
%watermark -p streamlit,transformers,torch,pandas,lightning

streamlit   : 1.33.0
transformers: 4.40.1
torch       : 2.2.2
pandas      : 2.0.3
lightning   : 2.2.4



## Local training

### Data preprocessing

In [18]:
# The source CSVs can be found in the csb-sentiment-analysis bucket.
# Read one export per platform and stack them into a single frame.
df = pd.concat(
    [
        pd.read_csv(csv_path)
        for csv_path in (
            "../data/farisdurrani/twitter_filtered.csv",
            "../data/farisdurrani/facebook_filtered.csv",
        )
    ]
)

In [19]:
# Rows without a sentiment score cannot be labelled, so drop them first.
df = df.dropna(subset=['sentiment'], axis=0)
# Map the sign of the score onto class ids: negative -> 0, zero -> 1, positive -> 2.
# np.sign(0) is already 0, so `sign + 1` covers the neutral case without a special
# branch, and working on the whole column avoids a per-row Python lambda.
df['Target'] = (np.sign(df['sentiment']) + 1).astype(int)
df.head()

Unnamed: 0,platform,bodyText,sentiment,date,country,Target
0,Twitter,@Kenichan I dived many times for the ball. Man...,0.4939,2009-04-06,,2
1,Twitter,"@nationwideclass no, it's not behaving at all....",-0.4939,2009-04-06,,0
2,Twitter,Need a hug,0.4767,2009-04-06,,2
3,Twitter,@LOLTrish hey long time no see! Yes.. Rains a...,0.6208,2009-04-06,,2
4,Twitter,@Tatiana_K nope they didn't have it,0.0,2009-04-06,,1


In [20]:
# Fixed seed so the stratified 80/10/10 train/val/test split is identical
# after every Restart & Run All.
SPLIT_SEED = 42
df_train, _df = train_test_split(df, stratify=df['Target'], test_size=0.2, random_state=SPLIT_SEED)
df_val, df_test = train_test_split(_df, stratify=_df['Target'], test_size=0.5, random_state=SPLIT_SEED)

### Load pretrained BERT model and tokenizer

In [4]:
# The checkpoint can be downloaded from
# https://hf-mirror.com/google/bert_uncased_L-2_H-128_A-2/tree/main
# or from the csb-sentiment-analysis bucket.

# Local directory that holds the downloaded checkpoint files.
PRETRAINED_MODEL_DIR = '../models/bert_uncased_L-2_H-128_A-2'
# Tokenizer and bare BertModel are both loaded from the same directory;
# the classification head is added later by SentimentBERT.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_DIR)
model = BertModel.from_pretrained(PRETRAINED_MODEL_DIR)

### Create dataloader

In [28]:
class BertDataset(Dataset):
    """Map-style dataset that tokenizes one text row of a DataFrame per item.

    Args:
        df: frame with a ``bodyText`` column (raw text) and a ``Target``
            column (integer class id).
        tokenizer: object exposing ``encode_plus``; assumed to be a Hugging
            Face BERT-style tokenizer that can return ``token_type_ids`` --
            verify if another tokenizer family is plugged in.
        max_length: every encoded sequence is padded or truncated to exactly
            this many tokens.
    """

    def __init__(self, df, tokenizer, max_length=100):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        self.target = self.df['Target']
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the row at position ``idx``.

        ``features`` is a dict with the keys ``ids``, ``mask`` and
        ``token_type_ids``, each a ``torch.long`` tensor of length
        ``max_length``; ``label`` is a scalar ``torch.long`` tensor.
        """
        # `.values` gives positional access, so `idx` does not depend on the
        # frame's index labels (which are shuffled by the train/test split).
        text = self.df['bodyText'].values[idx]
        label = self.target.values[idx]

        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            # `pad_to_max_length=True` is deprecated in transformers;
            # `padding='max_length'` is the supported way to request it.
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
            # Requested explicitly because the value is read just below.
            return_token_type_ids=True,
        )

        features = {
            'ids': torch.tensor(encoded["input_ids"], dtype=torch.long),
            'mask': torch.tensor(encoded["attention_mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(encoded["token_type_ids"], dtype=torch.long),
        }
        return features, torch.tensor(label, dtype=torch.long)

In [29]:
# Quick, unshuffled loaders used to inspect a sample batch; the loaders used
# for training are rebuilt in the hyper-parameter cell further down.
train_ds = BertDataset(df_train, tokenizer, max_length=100)
train_loader = DataLoader(dataset=train_ds, batch_size=512)
# Evaluate on the validation split so df_test stays held out for a final,
# unbiased check (df_val was created by the split but never used before).
eval_ds = BertDataset(df_val, tokenizer, max_length=100)
eval_loader = DataLoader(dataset=eval_ds, batch_size=512)

In [30]:
sample_batch = next(iter(train_loader))

In [40]:
# Show the first example of the batch: each feature tensor under an
# underlined heading, followed by the matching label.
x, y = sample_batch
for name in x:
    print(f"> {name} :")
    print("-" * len(name))
    print(x[name][0, :])
print()
print("> target", "-" * 6, sep="\n")
print(y[0])

> ids :
---
tensor([  101,  7842,  4246,  4948,  4542,  4122,  7084,  2000,  2272,  2461,
         2085,  1060,  1012,  8299,  1024,  1013,  1013,  4714,  3126,  2140,
         1012,  4012,  1013,  1051, 10354,  2615, 28311,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])
> mask :
----
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0

In [41]:
# tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
# model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

In [42]:
# Sanity check: encode a single sentence, padded to a fixed length of 10 tokens.
tokenizer.encode_plus(
    "I love pizza",
    max_length=10,               # target sequence length
    padding='max_length',        # pad up to max_length
    return_attention_mask=True,  # also return the attention mask
    return_tensors='pt',         # PyTorch tensors rather than Python lists
)

{'input_ids': tensor([[  101,  1045,  2293, 10733,   102,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

### Define model

In [8]:
class SentimentBERT(nn.Module):
    """BERT backbone followed by dropout and a 3-class linear head.

    Args:
        bert_model: backbone module. It must expose an ``encoder`` sub-module
            and return a mapping containing ``pooler_output`` when called with
            ``input_ids``, ``attention_mask`` and ``token_type_ids``. If it has
            a ``config.hidden_size`` attribute, the head is sized from it;
            otherwise the previous fixed width of 128 is used.

    Only the backbone's encoder and the linear head are trainable; the rest of
    the backbone is frozen.
    """

    def __init__(self, bert_model):
        super().__init__()
        self.bert_module = bert_model
        self.dropout = nn.Dropout(0.1)
        # Size the head from the backbone instead of hard-coding 128, so a
        # checkpoint with a different hidden width does not fail in forward().
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 128)
        self.final = nn.Linear(in_features=hidden_size, out_features=3, bias=True)

        # Freeze the whole backbone first, then re-enable only the encoder.
        self.bert_module.requires_grad_(False)
        self.bert_module.encoder.requires_grad_(True)

    def forward(self, inputs):
        """Return raw class logits, one row of 3 values per input sequence.

        Args:
            inputs: dict with the keys ``ids``, ``mask`` and
                ``token_type_ids`` (the layout produced by ``BertDataset``).
        """
        # Keyword arguments keep the call correct even if the backbone's
        # positional parameter order differs.
        encoded = self.bert_module(
            input_ids=inputs['ids'],
            attention_mask=inputs['mask'],
            token_type_ids=inputs['token_type_ids'],
        )
        pooled = self.dropout(encoded['pooler_output'])
        return self.final(pooled)

In [45]:
bert_model = SentimentBERT(model)
print(bert_model)

SentimentBERT(
  (bert_module): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwis

In [46]:
# Compare the size of the whole network with the part that will be trained.
total_parameters = sum(np.prod(p.size()) for p in bert_model.parameters())
# One-shot iterator over the parameters that still require gradients.
model_parameters = (p for p in bert_model.parameters() if p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
print(
    f"Total params : {total_parameters} - Trainable : {params} "
    f"({params/total_parameters*100}% of total)"
)

Total params : 4386436 - Trainable : 397060 (9.05199574324121% of total)


### Training and evaluation utils

In [44]:
import time


def train(model, dataloader, loss_fn, optimizer, epoch=None, log_interval=50):
    """Run one optimisation pass over ``dataloader`` and print progress.

    Args:
        model: callable module; ``model(inputs)`` must return class logits.
        dataloader: iterable with a length, yielding ``(inputs, label)`` pairs.
        loss_fn: callable ``loss_fn(logits, label)`` returning a scalar loss.
        optimizer: optimizer that updates the parameters of ``model``.
        epoch: epoch number shown in the log lines. When omitted, a
            module-level ``epoch`` variable is used if one exists (this keeps
            existing ``for epoch in ...: train(...)`` loops working), else 0.
        log_interval: a progress line is printed every ``log_interval``
            batches, starting with the first one.

    Returns:
        None. The reported accuracy covers the batches since the previous log
        line; the reported loss is the one of the current batch.
    """
    if epoch is None:
        # Backward compatibility: the function used to read the global
        # `epoch` directly, which raised NameError when none was defined.
        epoch = globals().get("epoch", 0)

    model.train()
    total_acc, total_count = 0, 0
    start_time = time.time()

    for idx, (inputs, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(inputs)

        loss = loss_fn(predicted_label, label)
        loss.backward()
        optimizer.step()

        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)

        if idx % log_interval == 0:
            elapsed = time.time() - start_time
            print(
                "Epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f} | loss {:8.3f} ({:.3f}s)".format(
                    epoch, idx, len(dataloader), total_acc / total_count, loss.item(), elapsed
                )
            )
            # Restart the running window so each line reports recent batches only.
            total_acc, total_count = 0, 0
            start_time = time.time()


def evaluate(model, dataloader, loss_fn):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: callable module; ``model(inputs)`` must return class logits.
            It is switched to eval mode and left in that mode.
        dataloader: iterable of ``(inputs, label)`` pairs.
        loss_fn: kept so existing callers keep working; accuracy does not
            depend on it, so it is no longer evaluated on every batch.

    Returns:
        Fraction of correctly classified examples as a float, or ``nan`` when
        the dataloader yields no examples (previously a ZeroDivisionError).
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for inputs, label in dataloader:
            predicted_label = model(inputs)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    if total_count == 0:
        return float("nan")
    return total_acc / total_count

In [47]:
EPOCHS = 2
BATCH_SIZE = 512
LEARNING_RATE = 1e-3

# Only the parameters left trainable by SentimentBERT (encoder + head) are
# handed to the optimizer.
optimizer = torch.optim.Adam([p for p in bert_model.parameters() if p.requires_grad], LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

train_ds = BertDataset(df_train, tokenizer, max_length=100)
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, pin_memory=True, shuffle=True)
# Per-epoch evaluation uses the validation split; df_test stays held out so
# it can still provide an unbiased final score.
eval_ds = BertDataset(df_val, tokenizer, max_length=100)
eval_loader = DataLoader(dataset=eval_ds, batch_size=BATCH_SIZE, pin_memory=True)

In [None]:
# `epoch` is deliberately the loop variable: train() uses it in its log lines.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(bert_model, train_loader, loss_fn=loss_fn, optimizer=optimizer)
    # evaluate() takes no optimizer, and the evaluation loader defined in this
    # notebook is `eval_loader` (there is no `valid_loader`).
    accu_val = evaluate(bert_model, eval_loader, loss_fn=loss_fn)

    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

## Train model on Vertex

The first step is to build a Docker image from the Dockerfile

`Dockerfile here`

Then, we need to write a small `build.sh` script that builds the image and pushes it to Artifact Registry

```bash
# build.sh
# Builds the training image from the Dockerfile in the current directory
# and tags it with IMAGE_URI.

# Fill in the GCP project id before running.
export PROJECT_ID=...
export IMAGE_REPO_NAME=pt_bert_sentiment
export IMAGE_TAG=dev
# Full image name: <registry host>/<project>/<repository>:<tag>
export IMAGE_URI=eu.gcr.io/$PROJECT_ID/$IMAGE_REPO_NAME:$IMAGE_TAG

# The trailing "." is the build context (the directory containing the Dockerfile).
gcloud builds submit --tag $IMAGE_URI .
```

We will then launch a job, with 2 options:

#### Option 1: via gcloud — create a small `job.sh` script as below

```bash
# job.sh
# Submits a custom training job that runs the image produced by build.sh.

# Fill in the GCP project id and the service account before running.
export PROJECT_ID=...
export BUCKET="csb-sentiment-analysis"
export REGION="europe-west4"
export SERVICE_ACCOUNT=...
export JOB_NAME="pytorch_bert_training"
export MACHINE_TYPE="n1-standard-8"  # VM shape only; the GPU is selected by ACCELERATOR_TYPE below
export ACCELERATOR_TYPE="NVIDIA_TESLA_T4"
export IMAGE_URI="eu.gcr.io/$PROJECT_ID/pt_bert_sentiment:dev"


# Everything after --args= is one comma-separated list of flags for the
# training container. Do not insert comment lines inside the continued
# command: they would break the backslash continuations.
gcloud ai custom-jobs create \
--region=$REGION \
--display-name=$JOB_NAME \
--worker-pool-spec=machine-type=$MACHINE_TYPE,accelerator-type=$ACCELERATOR_TYPE,accelerator-count=1,replica-count=1,container-image-uri=$IMAGE_URI \
--service-account=$SERVICE_ACCOUNT \
--args=\
--training-file=gs://$BUCKET/data/train.csv,\
--validation-file=gs://$BUCKET/data/eval.csv,\
--testing-file=gs://$BUCKET/data/test.csv,\
--job-dir=gs://$BUCKET/model/model.pt,\
--epochs=1,\
--batch-size=128,\
--learning-rate=0.0001
```

#### Option 2: via the Python client

```python
from google.cloud import aiplatform

# Fill in the GCP project id before running.
PROJECT_ID=...
BUCKET="csb-sentiment-analysis"

# The URIs must be f-strings: without the `f` prefix the literal text
# "{PROJECT_ID}" / "{BUCKET}" would be sent instead of the values above.
my_job = aiplatform.CustomContainerTrainingJob(
    display_name='pytorch_bert_training',
    container_uri=f'eu.gcr.io/{PROJECT_ID}/pt_bert_sentiment:dev',
    staging_bucket=f'gs://{BUCKET}')

# NOTE(review): unlike job.sh, no --training-file/--epochs/... arguments are
# forwarded to the container here; pass them with run(args=[...]) if the
# training script has no suitable defaults.
my_job.run(replica_count=1,
           machine_type='n1-standard-8',
           accelerator_type='NVIDIA_TESLA_T4',
           accelerator_count=1)
           
```

### Monitor the job

In [None]:
!gcloud ai custom-jobs stream-logs projects/1011434374459/locations/europe-west4/customJobs/8968484625693278208

## Inference with trained model

In [9]:
from google.cloud import storage

# Read the fine-tuned weights straight from the bucket, without a local copy.
storage_client = storage.Client()
bucket = storage_client.bucket("csb-sentiment-analysis")
blob = bucket.blob("model/model.pt")
# NOTE: this wraps the same `model` backbone object as `bert_model`, so loading
# the checkpoint also overwrites the backbone weights seen through `bert_model`.
loaded_model = SentimentBERT(model)

with blob.open("rb") as f:
    # weights_only=True limits unpickling to tensors and plain containers, so
    # a tampered checkpoint cannot execute arbitrary code while being loaded.
    state_dict = torch.load(f, map_location=torch.device('cpu'), weights_only=True)
loaded_model.load_state_dict(state_dict)

# Switch to inference mode (disables dropout); the cell displays the model.
loaded_model.eval()

SentimentBERT(
  (bert_module): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwis

In [33]:
def sentiment_score(comment):
    """Print the class probabilities predicted for one piece of text.

    Uses the notebook-level ``tokenizer`` and ``loaded_model``.

    Args:
        comment: text to classify.

    Returns:
        None. One ``"<label>: <percentage>%"`` line is printed per class, in
        the order Negative, Neutral, Positive.
    """
    mapping = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    inputs = tokenizer(comment, return_tensors='pt')
    features = {
        'ids': inputs["input_ids"],
        'mask': inputs["attention_mask"],
        'token_type_ids': inputs["token_type_ids"],
    }

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = loaded_model(features)

    # Normalise over the class axis explicitly; nn.Softmax() without `dim`
    # relies on a deprecated implicit choice and emits a warning.
    probabilities = nn.Softmax(dim=1)(logits)
    for class_id, probability in enumerate(probabilities[0]):
        print(f"{mapping[class_id]}: {100*probability:.2f}%")

In [34]:
sentiment_score("I hate watching this")

Negative: 99.76%
Neutral: 0.15%
Positive: 0.09%


In [35]:
sentiment_score("I really love this ring, it's so beautiful !")

Negative: 0.06%
Neutral: 0.10%
Positive: 99.84%


In [39]:
sentiment_score("This place is a scam, i highly don't recommend")

Negative: 97.73%
Neutral: 0.49%
Positive: 1.77%


In [40]:
sentiment_score("I don't know what to say")

Negative: 0.07%
Neutral: 99.87%
Positive: 0.06%


In [42]:
sentiment_score("The sky is blue")

Negative: 0.06%
Neutral: 99.89%
Positive: 0.06%


In [48]:
sentiment_score("the cartier trinity is ugly bruh")


Negative: 96.94%
Neutral: 2.93%
Positive: 0.13%


In [49]:
sentiment_score("I have no idea what wedding band to get for this - any ideas? for now I have the cartier trinity ring underneath which works surprisingly well but will be changed to the wedding band, once we get married in 2025!")


Negative: 6.61%
Neutral: 0.61%
Positive: 92.78%
