### Sentiment Analysis using a GRU-based encoder with BERT embeddings

In [None]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchtext==0.9 transformers==4.25.1 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp38-cp38-linux_x86_64.whl (1982.2 MB)
[K     |█████████████▌                  | 834.1 MB 1.3 MB/s eta 0:14:43tcmalloc: large alloc 1147494400 bytes == 0x3ace6000 @  0x7f4f86e90615 0x5d6f4c 0x51edd1 0x51ef5b 0x4f750a 0x4997a2 0x4fd8b5 0x4997c7 0x4fd8b5 0x49abe4 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x5d8868 0x5da092 0x587116 0x5d8d8c 0x55dc1e 0x55cd91 0x5d8941 0x49abe4 0x55cd91 0x5d8941 0x4990ca 0x5d8868 0x4997a2 0x4fd8b5 0x49abe4
[K     |█████████████████               | 1055.7 MB 1.2 MB/s eta 0:12:56tcmalloc: large alloc 1434370048 bytes == 0x7f33c000 @  0x7f4f86e90615 0x5d6f4c 0x51edd1 0x51ef5b 0x4f750a 0x4997a2 0x4fd8b5 0x4997c7 0x4fd8b5 0x49abe4 0x4f5fe9 0x55e146 0x4f5fe9 0x55e146 0x4f5fe9 0x55e14

In [None]:
import torch
import torch.nn as nn
import torchtext
import spacy
from torchtext.legacy import data, datasets
from transformers import BertTokenizer, BertModel
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(torch.cuda.is_available())
# torch.cuda.get_device_name() raises when no CUDA device is available, so it
# is only queried on the CUDA path; the CPU fallback above stays usable.
print('Using', torch.cuda.get_device_name() if device.type == 'cuda' else 'CPU')

True
Using Tesla T4


In [None]:
# WordPiece tokenizer matching the pretrained checkpoint used further below.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

# Longest input (in tokens) the checkpoint accepts, as reported by the tokenizer.
max_seq_len = tokenizer.max_model_input_sizes[pretrained_name]

print('Max sequence length:', max_seq_len)
print('Size of Vocabulary:', len(tokenizer.vocab))

Max sequence length: 512
Size of Vocabulary: 30522


In [None]:
# Text field that yields BERT token ids directly (use_vocab = False), so all
# special tokens are given as ids taken from the BERT tokenizer.
# Reviews are truncated to max_seq_len-2 word pieces, leaving room for the
# leading [CLS] and the trailing [SEP].
# The end-of-sequence marker must be sep_token_id: BertTokenizer defines no EOS
# token, so tokenizer.eos_token_id is None and no terminator would be appended
# during training, whereas sentiment_analysis() appends sep_token_id at inference.
TEXT = data.Field(
    tokenize = lambda x: tokenizer.tokenize(x)[:max_seq_len-2],
    preprocessing = tokenizer.convert_tokens_to_ids,
    init_token = tokenizer.cls_token_id,
    eos_token = tokenizer.sep_token_id,
    pad_token = tokenizer.pad_token_id,
    unk_token = tokenizer.unk_token_id,
    batch_first = True,
    use_vocab = False,
)

# Labels are cast to float because the loss used later is BCEWithLogitsLoss.
LABEL = data.LabelField(dtype = torch.float)

# IMDB train/test splits; part of the training split is held out for validation
# (split() is called with its default ratio).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, val_data = train_data.split()
print(len(train_data), len(test_data), len(val_data))

17500 25000 7500


In [None]:
# The label vocabulary is built from the training split only.
LABEL.build_vocab(train_data)

batch_size = 64

# One bucketing iterator per split; every batch is created directly on `device`.
train_loader, val_loader, test_loader = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size = batch_size,
    device = device,
)

## GRU + BERT Classification Model

---



In [None]:
# Pretrained 'bert-base-uncased' encoder; it is passed to GRU_BERT below, which
# uses its first output as the per-token input features of the GRU.
BERT = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class GRU_BERT(nn.Module):
    """Binary classifier: BERT token features -> 2-layer bidirectional GRU -> one logit.

    Args:
        BERT: module called as ``BERT(text)`` whose first output holds the
            per-token hidden states; ``BERT.config.hidden_size`` gives their width.
        hidden_dim: hidden size of each GRU direction.
        dropout: dropout probability used between the GRU layers and on the
            pooled final hidden state (default 0.3).
    """

    def __init__(self, BERT, hidden_dim, dropout = 0.3):
        super().__init__()

        emb_dim = BERT.config.hidden_size
        self.BERT = BERT
        self.grulayers = nn.GRU(emb_dim, hidden_dim, num_layers = 2, bidirectional = True,
                                dropout = dropout, batch_first = True)
        # Registered as a submodule so that model.eval() switches it off. A
        # Dropout created inside forward() is always in training mode and would
        # keep dropping units during validation and inference. Dropout has no
        # parameters, so the state_dict layout is unchanged.
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2*hidden_dim, 1)

    def forward(self, text):
        """Return one raw logit per example.

        Args:
            text: token ids of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, 1) with unnormalised scores.
        """
        # BERT is used as a fixed feature extractor: no gradients flow into it.
        with torch.no_grad():
            embedded = self.BERT(text)[0]  # (batch_size, seq_len) -> (batch_size, seq_len, emb_dim)

        # hidden_all_layers: (2*num_layers, batch_size, hidden_dim), two directions per layer
        _, hidden_all_layers = self.grulayers(embedded)

        # The last two entries are the final forward and backward states of the top layer.
        top_forward = hidden_all_layers[-2, :, :]
        top_backward = hidden_all_layers[-1, :, :]
        hidden_last_layer = self.dropout(torch.cat((top_forward, top_backward), dim = 1))  # (batch_size, 2*hidden_dim)
        return self.fc(hidden_last_layer)

In [None]:
# Build the classifier around the pretrained encoder, then move it to the
# selected device and show its layer structure.
model = GRU_BERT(BERT=BERT, hidden_dim=256)
model = model.to(device)
print(model)

GRU_BERT(
  (BERT): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

### Freeze parameters of BERT

In [None]:
# Turn off gradient tracking for every parameter whose name starts with 'BERT'.
for param_name, weights in model.named_parameters():
    if not param_name.startswith('BERT'):
        continue
    weights.requires_grad_(False)

# List what is still trainable after freezing.
trainable_names = [n for n, p in model.named_parameters() if p.requires_grad]
for param_name in trainable_names:
    print(param_name)

grulayers.weight_ih_l0
grulayers.weight_hh_l0
grulayers.bias_ih_l0
grulayers.bias_hh_l0
grulayers.weight_ih_l0_reverse
grulayers.weight_hh_l0_reverse
grulayers.bias_ih_l0_reverse
grulayers.bias_hh_l0_reverse
grulayers.weight_ih_l1
grulayers.weight_hh_l1
grulayers.bias_ih_l1
grulayers.bias_hh_l1
grulayers.weight_ih_l1_reverse
grulayers.weight_hh_l1_reverse
grulayers.bias_ih_l1_reverse
grulayers.bias_hh_l1_reverse
fc.weight
fc.bias


In [None]:
# Binary cross-entropy on raw logits; the model outputs unnormalised scores.
loss_fn = nn.BCEWithLogitsLoss().to(device)

# Adam over the model's parameters with a fixed learning rate.
lr = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn`` and
    ``train_loader``.

    Returns:
        Tuple ``(loss, accuracy)``, each the mean of the per-batch values.
    """
    model.train()
    loss_total, acc_total = 0.0, 0.0

    for batch in train_loader:
        targets = batch.label

        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        loss = loss_fn(logits, targets)
        loss.backward()
        optimizer.step()

        # A prediction counts as positive when sigmoid(logit) rounds to 1.
        hard_preds = torch.round(torch.sigmoid(logits))
        n_correct = (hard_preds == targets).sum().float()

        loss_total += loss.item()
        acc_total += n_correct.item() / len(targets)

    n_batches = len(train_loader)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
def evaluate(loader):
    """Score the module-level ``model`` on ``loader`` without updating it.

    Args:
        loader: iterable of batches exposing ``.text`` and ``.label``; must
            support ``len()``.

    Returns:
        Tuple ``(loss, accuracy)``, each the mean of the per-batch values.
    """
    model.eval()
    loss_total, acc_total = 0.0, 0.0

    with torch.no_grad():
        for batch in loader:
            targets = batch.label
            logits = model(batch.text).squeeze(1)
            loss_total += loss_fn(logits, targets).item()

            # A prediction counts as positive when sigmoid(logit) rounds to 1.
            hard_preds = torch.round(torch.sigmoid(logits))
            n_correct = (hard_preds == targets).sum().float()
            acc_total += n_correct.item() / len(targets)

    n_batches = len(loader)
    return loss_total / n_batches, acc_total / n_batches

## Training

In [None]:
%%time
best_val_loss = float('inf')
for epoch in range(4):
    train_loss, train_acc = train()
    val_loss, val_acc = evaluate(val_loader)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), './gru_with_bert.pth')

    print(f'Epoch {epoch+1}')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'Val Loss: {val_loss:.3f} |  Val Acc: {val_acc*100:.2f}%')
    print('-------------------------------------------------------')

Epoch 1
Train Loss: 0.449 | Train Acc: 78.33%
Val Loss: 0.263 |  Val Acc: 89.00%
-------------------------------------------------------
Epoch 2
Train Loss: 0.288 | Train Acc: 88.10%
Val Loss: 0.213 |  Val Acc: 91.60%
-------------------------------------------------------
Epoch 3
Train Loss: 0.249 | Train Acc: 90.05%
Val Loss: 0.204 |  Val Acc: 92.21%
-------------------------------------------------------
Epoch 4
Train Loss: 0.221 | Train Acc: 91.25%
Val Loss: 0.222 |  Val Acc: 91.67%
-------------------------------------------------------
CPU times: user 30min 36s, sys: 24min 34s, total: 55min 10s
Wall time: 55min 32s


## Evaluation

In [None]:
# Restore the weights saved at the best validation loss. map_location remaps
# the stored tensors onto the current device, so a checkpoint written on a GPU
# runtime can still be loaded when only a CPU is available.
best_state = torch.load('./gru_with_bert.pth', map_location=device)
model.load_state_dict(best_state)

test_loss, test_acc = evaluate(test_loader)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.202 | Test Acc: 91.97%


In [None]:
# NOTE(review): `nlp` is not referenced by any code visible in this notebook
# (sentiment_analysis below tokenizes with the BERT tokenizer) -- confirm
# whether this spaCy pipeline is still needed.
nlp = spacy.load('en_core_web_sm')

def sentiment_analysis(text):
    """Score a single piece of text with the module-level ``model``.

    Args:
        text: raw input string.

    Returns:
        float: sigmoid of the model's logit, a value in [0, 1].
    """
    model.eval()
    # Keep at most max_seq_len-2 word pieces so [CLS] and [SEP] still fit.
    tokens = tokenizer.tokenize(text)[:max_seq_len-2]
    numericalized_tokens = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [tokenizer.sep_token_id]
    # Batch of one: shape (1, seq_len).
    inp_text = torch.LongTensor(numericalized_tokens).unsqueeze(0).to(device)
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        pred = torch.sigmoid(model(inp_text))

    return pred.item()
# Two everyday sentences that are not movie reviews.
general_examples = (
    'Good morning Bangalore, great to see a bright and sunny day here.',
    'Me neither! I am so annoyed because my laptop is new, and yet I have not been able to get this new OS to work.',
)
for sentence in general_examples:
    print(sentiment_analysis(sentence))

0.920813798904419
0.1342366337776184


In [None]:
# A clearly positive and a clearly negative one-line film review.
film_examples = ('This film is great!', 'This film is terrible!')
for sentence in film_examples:
    print(sentiment_analysis(sentence))


0.9939267039299011
0.01869475655257702
