## BERT Bi-GRU for sentiment analysis

In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/f4/9f93f06dd2c57c7cd7aa515ffbf9fcfd8a084b92285732289f4a5696dd91/transformers-3.2.0-py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 8.4MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 30.0MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 58.8MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB

In [None]:
import torch #pytorch library

import random
import numpy as np

SEED = 1234

random.seed(SEED) #setting random seed
np.random.seed(SEED) #setting NumPy's global random seed
torch.manual_seed(SEED) #setting PyTorch's random seed
torch.backends.cudnn.deterministic = True #ask cuDNN to pick deterministic algorithms

In [None]:
from transformers import BertTokenizer
#loading the pre-trained bert-base-uncased tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
#checking number of tokens in BERT vocabulary
len(tokenizer.vocab)

30522

In [None]:
#tokenizing using bert tokenizer
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [None]:
#numericalizing tokens
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [None]:
init_token = tokenizer.cls_token # first token of the sequence 
eos_token = tokenizer.sep_token # last token of a sequence 
pad_token = tokenizer.pad_token # token used for padding
unk_token = tokenizer.unk_token # unknown token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [None]:
# the indexes of the special tokens by converting them using the vocabulary
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

# or we can get these indexes by explicitly getting them from the tokenizer.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100
101 102 0 100


In [None]:
# the model was trained on sequences with a defined maximum length. 
# Getting this maximum length.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

print(max_input_length)

512


In [None]:
# making our own tokenizing function that tokenizes sentences and cuts them to a 
# size (max_input_length - 2) 
def tokenize_and_cut(sentence):
    """Tokenize `sentence` with the BERT tokenizer and keep at most
    ``max_input_length - 2`` tokens.

    Two positions are left free for the start and end tokens that are added
    around every sequence later on.
    """
    limit = max_input_length - 2
    return tokenizer.tokenize(sentence)[:limit]

In [None]:
from google.colab import files
uploaded = files.upload()

Saving amazon_cells_labelled.txt to amazon_cells_labelled.txt


In [None]:
import pandas as pd
dataset = pd.read_csv('/content/amazon_cells_labelled.txt', sep="\t", names=["Review", "Score"])

In [None]:
dataset

Unnamed: 0,Review,Score
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
...,...,...
995,The screen does get smudged easily because it ...,0
996,What a piece of junk.. I lose more calls on th...,0
997,Item Does Not Match Picture.,0
998,The only thing that disappoint me is the infra...,0


In [None]:
# index=False keeps the DataFrame index out of the file, so the CSV contains
# only the Review and Score columns.
dataset.to_csv('dataset.csv', index=False)

In [None]:
from torchtext import data
#creating preprocessing pipelines for text and label columns
# Review: text is split with tokenize_and_cut and the tokens are turned into
# BERT vocabulary ids by the `preprocessing` step, so no torchtext vocabulary
# is built (use_vocab = False) and the special tokens are given as ids.
Review = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx,lower=True)

# Score: label column, converted to float tensors (the loss used later is
# BCEWithLogitsLoss, which takes float targets)
Score = data.LabelField(dtype = torch.float)

In [None]:
fields = {'Review': ('r', Review), 'Score': ('s', Score)}

In [None]:
#importing dataset
from torchtext import datasets
train_data = data.TabularDataset(path = '/content/dataset.csv',format = 'csv',fields=fields)
# Hold out 20% of the examples as the test set.
# split() expects a state tuple as returned by random.getstate();
# random.seed() itself returns None, so seed first and pass the state explicitly.
random.seed(SEED)
train_data, test_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [None]:
# Carve a validation set (20%) out of the remaining training examples.
# NOTE: this cell rebinds train_data, so re-running it on its own shrinks the
# training set again; re-run the loading cell first.
# random.seed() returns None, so seed first and pass the state tuple explicitly.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio=0.8, random_state=random.getstate())

In [None]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 640
Number of validation examples: 160
Number of testing examples: 200


In [None]:
print(vars(train_data.examples[6]))

{'r': [1996, 2640, 2003, 2200, 5976, 1010, 2004, 1996, 4540, 1000, 12528, 1000, 2003, 2025, 2200, 6625, 2012, 2035, 1012], 's': '0'}


In [None]:
# getting sentence from list of indexes
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['r'])

print(tokens)

['the', 'design', 'is', 'very', 'odd', ',', 'as', 'the', 'ear', '"', 'clip', '"', 'is', 'not', 'very', 'comfortable', 'at', 'all', '.']


In [None]:
#building label vocabulary
Score.build_vocab(train_data)

In [None]:
print(Score.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f4ca7e3f2f0>, {'1': 0, '0': 1})


In [None]:
#creating iterator for train, valid and test data
BATCH_SIZE = 64

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# sort_key is the length of an example's token-id list (field `r`);
# with sort_within_batch the examples inside each batch are ordered by it.
# Batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.r), 
    sort_within_batch=True,
    device = device)

In [None]:
from transformers import BertTokenizer, BertModel
#importing the pretrained bert model
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Building our model

In [None]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Sentiment classifier: BERT embeddings fed to a (bi)GRU and a linear head.

    BERT is only used as a feature extractor: its forward pass runs under
    ``torch.no_grad()``, so no gradients flow into it from this module.

    Args:
        bert: pre-trained BERT model used to embed the token ids.
        hidden_dim: size of the GRU hidden state (per direction).
        output_dim: number of values produced per example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability, used between GRU layers (only when
            ``n_layers >= 2``) and on the final hidden state.
    """

    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()

        self.bert = bert
        # BERT's hidden size is the input feature size of the GRU
        embedding_dim = bert.config.hidden_size
        # defining our GRU layer; nn.GRU only applies dropout between stacked
        # layers, so it is disabled for a single-layer GRU
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        # defining fully connected layer; a bidirectional GRU yields two final
        # hidden states (forward and backward) that are concatenated
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        # defining dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, attention_mask=None):
        """Compute one output vector per input sequence.

        Args:
            text: token ids, shape [batch size, sent len].
            attention_mask: optional mask forwarded to BERT. Left as ``None``
                it reproduces the plain ``self.bert(text)`` call.
                NOTE(review): the batch iterators pad sequences, and no mask is
                passed by default, so padding positions are not masked out;
                consider passing ``text != pad_token_idx`` from the callers.

        Returns:
            Tensor of shape [batch size, out dim] (raw, un-activated scores).
        """
        #text = [batch size, sent len]

        with torch.no_grad():
            # The transformer returns the embeddings for the whole sequence
            # as well as a pooled output. We require the embeddings
            embedded = self.bert(text, attention_mask=attention_mask)[0]

        #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)

        #hidden = [n layers * n directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # last layer: hidden[-2] is the forward direction, hidden[-1] the backward one
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])

        #hidden = [batch size, hid dim * n directions]

        output = self.out(hidden)

        #output = [batch size, out dim]

        return output

In [None]:
#defining hyperparameters
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25
# creating the model instance
model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [None]:
#getting the number of parameters
def count_parameters(model):
    """Return the total number of elements over the model's trainable parameters
    (those with ``requires_grad`` set)."""
    trainable = (p for p in model.parameters() if p.requires_grad)
    return sum(p.numel() for p in trainable)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [None]:
# BERT is already pre-trained, so every parameter under the `bert` sub-module
# is frozen; only the GRU and the linear layer stay trainable.
for frozen_param in model.bert.parameters():
    frozen_param.requires_grad = False

In [None]:
# Freezing BERT reduces the number of trainable parameters. count_parameters is
# the helper defined earlier in this notebook, so it is reused here instead of
# being defined a second time.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [None]:
# checking the names of the trainable parameters, ensuring they make sense. 
# As we can see, they are all the parameters of the GRU (rnn) and the linear layer (out).
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [None]:
#importing optim library to define our optimizer
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [None]:
#defining our loss function
criterion = nn.BCEWithLogitsLoss()

In [None]:
# putting model and criterion to device
model = model.to(device)
criterion = criterion.to(device)

Training our model

In [None]:
#defining our accuracy function
def binary_accuracy(preds, y):
    """
    Fraction of correct binary predictions in a batch.

    `preds` are raw logits: they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with `y`. The result is a ratio,
    i.e. 8 correct out of 10 gives 0.8, NOT 8.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()  # bool -> float so the sum can be divided
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the logits for `batch.r` are compared with the labels in
    `batch.s`, the loss is back-propagated and the optimizer takes a step.

    Returns:
        (mean loss, mean accuracy) over all *examples* seen. Per-batch values
        are weighted by the batch size so that a smaller final batch does not
        skew the average (assumes `criterion` averages over the batch, which
        is the default reduction of the loss used in this notebook).

    NOTE(review): model.train() also switches the frozen BERT sub-module to
    training mode (its dropout becomes active) - confirm this is intended.
    """
    total_loss = 0.0
    total_acc = 0.0
    n_examples = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        predictions = model(batch.r).squeeze(1)

        loss = criterion(predictions, batch.s)

        acc = binary_accuracy(predictions, batch.s)

        loss.backward()

        optimizer.step()

        batch_size = len(batch.s)
        total_loss += loss.item() * batch_size
        total_acc += acc.item() * batch_size
        n_examples += batch_size

    return total_loss / n_examples, total_acc / n_examples

In [None]:
def evaluate(model, iterator, criterion):
    """Evaluate the model on `iterator` without updating any weights.

    The model is put in eval mode and all batches are processed under
    ``torch.no_grad()``.

    Returns:
        (mean loss, mean accuracy) over all *examples* seen. Per-batch values
        are weighted by the batch size so that a smaller final batch does not
        skew the reported metrics (assumes `criterion` averages over the
        batch, which is the default reduction of the loss used in this
        notebook).
    """
    total_loss = 0.0
    total_acc = 0.0
    n_examples = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            predictions = model(batch.r).squeeze(1)

            loss = criterion(predictions, batch.s)

            acc = binary_accuracy(predictions, batch.s)

            batch_size = len(batch.s)
            total_loss += loss.item() * batch_size
            total_acc += acc.item() * batch_size
            n_examples += batch_size

    return total_loss / n_examples, total_acc / n_examples

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
#training our model
N_EPOCHS = 20

# lowest validation loss seen so far; starts at +inf so the first epoch always saves
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # one pass over the training data, then a pass over the validation data
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    # the measured time covers both the training and the validation pass
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # saving the models which have better valid_loss    
    # (the file is overwritten, so it always holds the best weights so far)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.667 | Train Acc: 58.13%
	 Val. Loss: 0.927 |  Val. Acc: 48.44%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.424 | Train Acc: 85.00%
	 Val. Loss: 0.360 |  Val. Acc: 83.33%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.299 | Train Acc: 88.75%
	 Val. Loss: 0.366 |  Val. Acc: 81.25%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.248 | Train Acc: 89.38%
	 Val. Loss: 0.322 |  Val. Acc: 86.46%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.200 | Train Acc: 92.50%
	 Val. Loss: 0.347 |  Val. Acc: 83.33%
Epoch: 06 | Epoch Time: 0m 0s
	Train Loss: 0.166 | Train Acc: 93.59%
	 Val. Loss: 0.347 |  Val. Acc: 84.38%
Epoch: 07 | Epoch Time: 0m 0s
	Train Loss: 0.134 | Train Acc: 94.84%
	 Val. Loss: 0.623 |  Val. Acc: 80.21%
Epoch: 08 | Epoch Time: 0m 0s
	Train Loss: 0.196 | Train Acc: 92.66%
	 Val. Loss: 0.618 |  Val. Acc: 78.65%
Epoch: 09 | Epoch Time: 0m 0s
	Train Loss: 0.215 | Train Acc: 91.88%
	 Val. Loss: 0.408 |  Val. Acc: 82.81%
Epoch: 10 | Epoch Time: 0m 0

In [None]:
#loading the model with minimum valid_loss
# map_location lets a checkpoint that was saved on a GPU be restored when only
# a CPU is available (and vice versa).
model.load_state_dict(torch.load('tut6-model.pt', map_location=device))
# testing on our test_data
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.267 | Test Acc: 91.02%


In [None]:
def predict_sentiment(model, tokenizer, sentence):
    """Print the predicted sentiment of `sentence` together with a score.

    The sentence is tokenized, cut to ``max_input_length - 2`` tokens, wrapped
    in the start/end token ids and fed to `model` as a batch of one.

    The printed score is ``1 - sigmoid(logit)`` in both branches. With the
    label vocabulary printed earlier in this notebook ({'1': 0, '0': 1}) the
    sigmoid output is the probability of the negative class, so the score is
    the probability that the review is positive.
    NOTE(review): that mapping comes from Score.vocab and is built from the
    training split - re-check it if the data or the split changes.

    Returns None; the result is only printed.
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(0)  # add the batch dimension
    # inference only: no autograd graph is needed for the GRU / linear layers
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor)).item()
    if prediction >= 0.5:
        print('Negative with score '+str(1-prediction))
    else:
        print('Positive with score '+str(1- prediction) )

In [None]:
predict_sentiment(model, tokenizer, "Highly recommend for any one who has a blue tooth phone.")

Positive with score 0.9905686117708683


In [None]:
predict_sentiment(model, tokenizer, "I advise EVERYONE DO NOT BE FOOLED!")

Negative with score 0.22611218690872192


In [None]:
predict_sentiment(model, tokenizer, "This product is cheap,but it is not upto my expectation")

Negative with score 0.4909631609916687


In [None]:
predict_sentiment(model, tokenizer, "This product is not upto my expectation")

Negative with score 0.10951513051986694


In [None]:
predict_sentiment(model, tokenizer, "Amazing product but price is high")

Positive with score 0.7733954638242722


In [None]:
predict_sentiment(model, tokenizer, "Does really an amazing product as price is high")

Positive with score 0.9859268562868237
