# Sentiment Analysis using Transformers 

Cloud and Machine Learning Project

Ashwin Prakash Nalwade (apn308), Mingxi Chen (mc7805)

We use a transformer model, specifically BERT (Bidirectional Encoder Representations from Transformers), introduced in [this paper](https://arxiv.org/abs/1810.04805).

We will use the Hugging Face transformers library and treat the pretrained transformer as an embedding layer. The rest of the model learns on top of these embeddings; in this case it is a bidirectional Gated Recurrent Unit (GRU) with multiple layers.


## Data Preparation


In [1]:
import torch

import random
import numpy as np

SEED = 1234

# Seed Python's, NumPy's and PyTorch's generators with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Set cuDNN's deterministic flag as well.
torch.backends.cudnn.deterministic = True

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/99/84/7bc03215279f603125d844bf81c3fb3f2d50fe8e511546eb4897e4be2067/transformers-4.0.0-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.9MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 40.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 55.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=54bc3bbbf957d

In [3]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that belongs to the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [4]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [5]:
# Tokenize a mixed-case sentence; the printed result shows the text is
# lower-cased and the question mark becomes its own token.
tokens = tokenizer.tokenize('did YoU LIKe this Movie?')

print(tokens)

['did', 'you', 'like', 'this', 'movie', '?']


In [6]:
# Map each token to its integer id in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[2106, 2017, 2066, 2023, 3185, 1029]


In [7]:
# Special tokens exposed by the tokenizer: [CLS] is used as the start token,
# [SEP] as the end token, plus the padding and unknown tokens.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [8]:
# Look up the vocabulary ids of the special tokens by converting the token strings.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [9]:
# Same ids as in the previous cell, read directly from the tokenizer's
# *_token_id attributes instead of converting the token strings.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [10]:
# Maximum input size the tokenizer lists for this checkpoint; used below to
# truncate reviews before they reach the model.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [11]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Two positions are held back so the start and end tokens added later by
    the text field still fit inside the length limit.
    """
    keep = max_input_length - 2
    return tokenizer.tokenize(sentence)[:keep]

In [12]:
from torchtext import data

# Text field for the reviews:
#   - batch_first: batches come out as [batch size, sent len], which is what
#     the model's forward() expects.
#   - use_vocab is off because `preprocessing` already converts tokens to ids
#     with the BERT tokenizer, so torchtext does not build its own vocabulary.
#   - the special tokens are therefore given as ids, not strings.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Labels are stored as floats, matching the BCEWithLogitsLoss criterion used for training.
LABEL = data.LabelField(dtype = torch.float)

Create splits.

In [13]:
from torchtext import datasets

# Load the IMDB train/test splits, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data with a reproducible shuffle.
# random.seed() returns None, so it must not be passed as random_state itself:
# seed first, then hand over the resulting generator state.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.4MB/s]


In [14]:
# Report how many examples ended up in each split.
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [15]:
# Inspect one preprocessed training example; its text is already a list of token ids.
print(vars(train_data.examples[6]))

{'text': [1045, 2387, 2023, 3185, 2197, 5353, 1998, 2009, 2003, 10021, 1998, 2568, 3238, 1012, 2019, 3418, 8364, 4332, 1037, 2158, 2046, 1037, 22788, 1999, 2010, 27621, 1029, 1996, 5694, 2024, 6015, 3168, 3238, 1998, 2064, 2025, 2448, 2013, 1037, 4030, 3048, 2214, 2158, 1012, 2027, 4530, 2037, 24711, 1998, 13277, 2612, 1997, 7161, 2000, 4829, 2032, 2125, 2030, 1010, 4241, 2232, 1010, 6402, 2032, 1012, 15969, 14593, 2720, 1012, 21720, 21010, 1997, 1996, 19227, 2545, 2003, 2496, 2000, 12082, 23009, 14593, 3335, 2064, 3790, 1997, 2681, 2009, 2000, 13570, 4476, 1010, 2040, 2003, 12511, 1999, 3510, 1012, 10166, 1010, 2293, 5436, 1012, 1996, 2717, 1997, 1996, 3459, 2074, 4076, 2169, 2060, 1999, 1996, 16623, 1998, 3524, 2005, 11652, 2000, 10509, 1998, 2448, 2000, 2037, 4681, 1012, 1996, 6811, 2611, 2003, 2025, 2919, 2559, 2021, 2515, 2025, 18496, 2172, 2000, 1996, 2143, 1012, 2073, 2003, 1996, 22788, 1029, 2428, 2074, 1037, 8364, 22416, 1012, 2023, 2515, 2025, 2907, 1037, 12723, 2000, 5415, 2

In [16]:
# Convert the ids of that example back to tokens to check the preprocessing by eye.
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

print(tokens)

['i', 'saw', 'this', 'movie', 'last', 'weekend', 'and', 'it', 'is', 'silly', 'and', 'mind', '##less', '.', 'an', 'ancient', 'curse', 'turns', 'a', 'man', 'into', 'a', 'mummy', 'in', 'his', 'pajamas', '?', 'the', 'victims', 'are', 'scared', 'sense', '##less', 'and', 'can', 'not', 'run', 'from', 'a', 'slow', 'moving', 'old', 'man', '.', 'they', 'drop', 'their', 'torches', 'and', 'shiver', 'instead', 'of', 'attempting', 'to', 'ward', 'him', 'off', 'or', ',', 'du', '##h', ',', 'burn', 'him', '.', 'quentin', 'alias', 'mr', '.', 'faber', '##sham', 'of', 'the', 'honeymoon', '##ers', 'is', 'married', 'to', 'diane', 'brewster', 'alias', 'miss', 'can', '##field', 'of', 'leave', 'it', 'to', 'beaver', 'fame', ',', 'who', 'is', 'unhappy', 'in', 'marriage', '.', 'wow', ',', 'love', 'plot', '.', 'the', 'rest', 'of', 'the', 'cast', 'just', 'follows', 'each', 'other', 'in', 'the', 'tombs', 'and', 'wait', 'for', 'screams', 'to', 'react', 'and', 'run', 'to', 'their', 'aid', '.', 'the', 'egyptian', 'girl'

In [17]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [18]:
# Show which integer each label string was mapped to.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f224ce556a8>, {'neg': 0, 'pos': 1})


In [20]:
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

## Building the Model



In [21]:
from transformers import BertTokenizer, BertModel

# Load the pretrained BERT encoder matching the tokenizer's checkpoint.
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




We use the transformer's output as embeddings and feed them into the gated recurrent unit (GRU), which generates the sentiment prediction for the input.





In [22]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Sentiment classifier: a BERT encoder feeding a GRU and a linear head.

    The encoder is used purely as a feature extractor: its forward pass runs
    under ``torch.no_grad()``, so no gradients reach it through this module.

    Args:
        bert: encoder module. It must expose ``config.hidden_size`` and, when
            called on token ids, return a sequence whose first element is a
            tensor of shape [batch size, sent len, emb dim].
        hidden_dim: size of the GRU hidden state, per direction.
        output_dim: number of outputs produced for each example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability, used between GRU layers (only when
            ``n_layers`` is at least 2) and on the final hidden state.
    """

    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = bert

        # Width of the encoder's output vectors = input width of the GRU.
        embedding_dim = bert.config.hidden_size

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)

        # A bidirectional GRU contributes one final state per direction.
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return the linear head's raw outputs, shape [batch size, out dim], for token ids ``text``."""

        #text = [batch size, sent len]

        # NOTE(review): only the token ids are passed (no attention mask), so
        # padded positions are presumably treated like real tokens by the
        # encoder -- confirm this is intended.
        with torch.no_grad():
            embedded = self.bert(text)[0]

        #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        #hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # the last two entries are the top layer's final forward and backward states
            hidden = torch.cat((hidden[-2], hidden[-1]), dim = 1)
        else:
            hidden = hidden[-1]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid dim * n directions]

        output = self.out(hidden)

        #output = [batch size, out dim]

        return output

In [23]:
# Hyperparameters of the GRU head that sits on top of BERT.
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # one output per review; trained with BCEWithLogitsLoss
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

# Wrap the pretrained encoder loaded above together with a fresh GRU and linear layer.
model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

Check the number of parameters

In [24]:
def count_parameters(model):
    """Return the total element count of all parameters that have requires_grad set."""
    trainable = 0
    for weight in model.parameters():
        if weight.requires_grad:
            trainable += weight.numel()
    return trainable

# Counted before anything is frozen, so BERT's own weights are included.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [25]:
# Freeze every parameter whose name starts with 'bert' so that only the
# remaining layers are updated during training.
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [26]:
# count_parameters is already defined earlier in the notebook; call it again
# now that the BERT parameters are frozen.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [27]:
# List the parameters that are still trainable after freezing.
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


## Training the model



In [28]:
import torch.optim as optim

# Adam with its default settings; model.parameters() also contains the frozen BERT weights.
optimizer = optim.Adam(model.parameters())

In [29]:
# Binary cross-entropy computed on raw model outputs (the model applies no sigmoid itself).
criterion = nn.BCEWithLogitsLoss()

In [30]:
# Move the model and the loss to the device the iterators place their batches on.
model = model.to(device)
criterion = criterion.to(device)

Accuracy, training, timing, and evaluation functions.

In [31]:
def binary_accuracy(preds, y):
    """
    Fraction of the batch that is classified correctly.

    `preds` are raw model outputs: they are passed through a sigmoid and
    rounded to the nearest integer before being compared with `y`. The result
    is a scalar tensor holding a ratio (8 right out of 10 gives 0.8, not 8).
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = (predicted_labels == y).float()  # float so the division below is not integer division
    return hits.sum() / len(hits)

In [32]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Each batch must provide ``.text`` and ``.label``. Returns a tuple
    (loss, accuracy), each averaged over the number of batches.
    """
    model.train()

    loss_sum = 0
    acc_sum = 0

    for batch in iterator:
        optimizer.zero_grad()

        # drop the trailing output dimension so the outputs line up with the labels
        logits = model(batch.text).squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [33]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` in eval mode with gradients disabled.

    Each batch must provide ``.text`` and ``.label``. Returns a tuple
    (loss, accuracy), each averaged over the number of batches.
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0

    with torch.no_grad():
        for batch in iterator:
            # drop the trailing output dimension so the outputs line up with the labels
            logits = model(batch.text).squeeze(1)

            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            loss_sum += batch_loss.item()
            acc_sum += batch_acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [34]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [37]:
N_EPOCHS = 5

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One pass over the training data, then a pass over the validation data.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    # Save the weights only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 4m 18s
	Train Loss: 0.282 | Train Acc: 88.62%
	 Val. Loss: 0.260 |  Val. Acc: 89.37%
Epoch: 02 | Epoch Time: 4m 18s
	Train Loss: 0.235 | Train Acc: 90.71%
	 Val. Loss: 0.241 |  Val. Acc: 90.39%
Epoch: 03 | Epoch Time: 4m 18s
	Train Loss: 0.206 | Train Acc: 91.81%
	 Val. Loss: 0.215 |  Val. Acc: 91.66%
Epoch: 04 | Epoch Time: 4m 18s
	Train Loss: 0.182 | Train Acc: 93.00%
	 Val. Loss: 0.225 |  Val. Acc: 91.27%
Epoch: 05 | Epoch Time: 4m 18s
	Train Loss: 0.156 | Train Acc: 93.96%
	 Val. Loss: 0.242 |  Val. Acc: 91.30%


Test accuracy

In [None]:
# Restore the checkpoint saved at the best validation loss. map_location
# remaps the stored tensors onto the current `device`, so a checkpoint written
# on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load('tut6-model.pt', map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.198 | Test Acc: 92.09%


## Predictions

In [None]:
def predict_sentiment(model, tokenizer, sentence):
    """Return the sigmoid of the model's output for ``sentence`` as a float.

    With the label mapping built in this notebook ('neg' -> 0, 'pos' -> 1),
    values near 1 indicate a positive review and values near 0 a negative one.

    Args:
        model: the trained classifier; it is switched to eval mode.
        tokenizer: tokenizer used to split the sentence, convert tokens to ids
            and supply the start/end token ids.
        sentence: raw input text.
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    # leave room for the start and end tokens added below
    tokens = tokens[:max_input_length-2]
    # take the special-token ids from the tokenizer that was passed in, so the
    # ids always match the tokens it produced
    indexed = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [tokenizer.sep_token_id]
    tensor = torch.LongTensor(indexed).to(device)
    # add a batch dimension: [sent len] -> [1, sent len]
    tensor = tensor.unsqueeze(0)
    # inference only: no need to record operations for backpropagation
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# A clearly negative sentence; the score should come out close to 0.
predict_sentiment(model, tokenizer, "This film is terrible")

0.015782710164785385

In [None]:
# A clearly positive sentence; the score should come out close to 1.
predict_sentiment(model, tokenizer, "This film is great")

0.9446907043457031