# ENGR 8990 - Deep Learning & Engineering Applications
## Assignment 4 - Transformer for Sentiment Classification 
In this assignment, you will code a transformer model for sentiment classification.

1.   Construct a transformer encoder (you could use the one in NB13) as the backbone and add a linear classifier for sentiment classification using the IMDB dataset (note: the vocab for IMDB is different from the NMT dataset used in NB13).

2.   Train the model and display the progress showing both training and validation metrics.

3.   Evaluate the trained model on the test dataset.

In [1]:
import torch 
from torch import nn
import torch.nn.functional as f
import numpy as np
import torchtext.legacy.data as data
import torchtext.legacy.datasets as datasets

## Download the IMDB data

In [2]:
# Fixed sequence length handed to the text field via `fix_length`.
max_len = 200

# Text field: lower-cased, batch-first, fixed-length, stored as long tensors.
text = data.Field(
    sequential=True,
    fix_length=max_len,
    batch_first=True,
    lower=True,
    dtype=torch.long,
)
# Label field: non-sequential, stored as long tensors.
label = data.LabelField(sequential=False, dtype=torch.long)

# Download the IMDB archive into the working directory, then load both splits from it.
datasets.IMDB.download('./')
ds_train, ds_test = datasets.IMDB.splits(text, label, path='./imdb/aclImdb/')

# Report the split sizes and the fields attached to the training set.
for split_name, split in (('train', ds_train), ('test', ds_test)):
    print(split_name + ' : ', len(split))
print('train.fields :', ds_train.fields)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:05<00:00, 14.2MB/s]


train :  25000
test :  25000
train.fields : {'text': <torchtext.legacy.data.field.Field object at 0x7f1cac5d9ad0>, 'label': <torchtext.legacy.data.field.LabelField object at 0x7f1cac5d9b50>}


## Split the training dataset into train set and valid set 

In [3]:
# Carve a validation set out of the training data (split ratio 0.9).
# Note: this rebinds `ds_train`, so re-running the cell splits the already-reduced set again.
ds_train, ds_valid = ds_train.split(split_ratio=0.9)

for split_name, split in (('train', ds_train), ('valid', ds_valid), ('test', ds_test)):
    print(split_name + ' : ', len(split))

train :  22500
valid :  2500
test :  25000


In [4]:
# Upper bound passed as `max_size` when building the text vocabulary.
num_words = 50_000

# Both vocabularies are built from the training split only.
label.build_vocab(ds_train)
text.build_vocab(ds_train, max_size=num_words)

# Shorthand for the text vocabulary.
vocab = text.vocab

In [5]:
# Mini-batch size shared by all three iterators.
batch_size = 64

# One BucketIterator per split; the sort key is the token count of each example's text.
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (ds_train, ds_valid, ds_test),
    batch_size=batch_size,
    sort_key=lambda example: len(example.text),
    repeat=False,
)

In [6]:
# Wrap the loaders in Python iterators and print the first training batch only.
train_iterator = iter(train_loader)
valid_iterator = iter(valid_loader)

for batch in train_iterator:
    print (batch.text, batch.label)
    break  # one batch is enough to inspect the tensor layout

tensor([[6811, 2725, 6237,  ...,  263,  734,   31],
        [1986,  276,   10,  ..., 1219,   17,   21],
        [   9,   61,   37,  ..., 4146,    1,    1],
        ...,
        [ 200,   10,    3,  ...,    1,    1,    1],
        [ 133,    2,   75,  ...,   48,  145,  650],
        [  26,    6, 4702,  ...,    1,    1,    1]]) tensor([1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1,
        0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
        0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0])


# Code will start here

# a) Data Preparation

In [7]:
# Data Preparation

# Reference: https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
import torch
from torchtext.legacy import data

# Seed PyTorch and request deterministic cuDNN behaviour.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Text is tokenised with spaCy ('en_core_web_sm'); labels are stored as floats.
TEXT = data.Field(
    tokenize = 'spacy',
    tokenizer_language = 'en_core_web_sm',
)
LABEL = data.LabelField(dtype = torch.float)


In [8]:

from torchtext.legacy import datasets

# Load the IMDB train and test splits using the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(text_field = TEXT, label_field = LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:06<00:00, 13.9MB/s]


In [9]:
# Print the sizes of the train/test data sets, then the attributes of the first training example
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

first_example = train_data.examples[0]
print(vars(first_example))

Number of training examples: 25000
Number of testing examples: 25000
{'text': ['Masters', 'of', 'Horror', ':', 'Right', 'to', 'Die', 'starts', 'late', 'one', 'night', 'as', 'married', 'couple', 'Abby', '(', 'Julia', 'Anderson', ')', '&', 'Ciff', 'Addison', '(', 'Martin', 'Donovan', ')', 'are', 'driving', 'home', ',', 'however', 'while', 'talking', 'Cliff', 'is', 'distracted', '&', 'crashes', 'into', 'a', 'tree', 'that', 'has', 'fallen', 'across', 'the', 'road', '.', 'Cliff', "'s", 'airbag', 'works', 'OK', '&', 'he', 'walks', 'away', 'with', 'minor', 'injuries', ',', 'unfortunately', 'for', 'Abby', 'hers', 'did', "n't", '&', 'she', 'ended', 'up', 'as', 'toast', 'when', 'she', 'was', 'thrown', 'from', 'the', 'car', '&', 'doused', 'in', 'petrol', 'which', 'set', 'alight', 'burning', 'her', 'entire', 'body', '.', 'Abby', "'s", 'life', 'is', 'saved', ',', 'just', '.', 'She', 'is', 'taken', 'to', 'hospital', 'where', 'she', 'is', 'on', 'life', 'support', 'seriously', 'injured', '&', 'horribl

In [10]:
# Split the training data into train and validation sets (70:30 ratio).
import random

# random.seed() returns None, so `random_state = random.seed(SEED)` handed
# None to split(). Seed the generator first, then pass its state explicitly.
# Note: this rebinds `train_data`, so re-running the cell splits the reduced set again.
random.seed(SEED)
train_data, valid_data = train_data.split(split_ratio = 0.7,
                                          random_state = random.getstate())
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [11]:
# Build the token and label vocabularies from the training split only.
# The token vocabulary is capped through `max_size`.
MAX_VOCAB_SIZE = 25_000

LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)

# Report how many entries each vocabulary ended up with.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [12]:
# Show the 20 most frequent tokens together with their counts.
top_tokens = TEXT.vocab.freqs.most_common(20)
print(top_tokens)

[('the', 201848), (',', 192094), ('.', 165079), ('and', 108918), ('a', 108697), ('of', 100405), ('to', 93109), ('is', 75325), ('in', 61106), ('I', 54354), ('it', 53326), ('that', 48913), ('"', 44033), ("'s", 43177), ('this', 42233), ('-', 36840), ('/><br', 35313), ('was', 35231), ('as', 30149), ('movie', 29894)]


In [13]:
# Print the first ten index-to-token entries of the text vocabulary,
# then the label-to-index mapping (check that 0 is negative and 1 is positive).
first_tokens = TEXT.vocab.itos[:10]
label_to_index = LABEL.vocab.stoi
print(first_tokens)
print(label_to_index)

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
defaultdict(None, {'neg': 0, 'pos': 1})


In [14]:
# Batch size shared by the train / validation / test iterators.
BATCH_SIZE = 64

# Use CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One BucketIterator per split, all placing their batches on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    device = device,
    batch_size = BATCH_SIZE)

# b) Define the model

In [15]:
#Initialize Model
import torch.nn as nn

class RNN(nn.Module):
    """Recurrent classifier: embedding -> nn.RNN -> linear head.

    Args:
        input_dim: Number of entries in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the recurrent hidden state.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        # Layers are created in the order forward() applies them.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Feed the final hidden state of the RNN through the linear head.

        Args:
            text: Token ids laid out as [sent len, batch size].

        Returns:
            The linear head's output for the squeezed final hidden state.
        """
        # [sent len, batch size] -> [sent len, batch size, emb dim]
        token_vectors = self.embedding(text)

        # all_states: [sent len, batch size, hid dim]
        # last_state: [1, batch size, hid dim]
        all_states, last_state = self.rnn(token_vectors)

        final_state = last_state.squeeze(0)

        # Sanity check: the last time step of the outputs is the final hidden state.
        assert torch.equal(all_states[-1, :, :], final_state)

        return self.fc(final_state)

In [16]:
# Model dimensions: the embedding table gets one row per entry of the TEXT vocabulary,
# and the network emits a single value per review.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# Instantiate the model with those dimensions.
model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [17]:
# Find trainable Parameter

def count_parameters(model):
    """Return the total element count of every parameter that has requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter total of `model`, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,592,105 trainable parameters


# c) Train the model

In [18]:
#define optimizer function
import torch.optim as optim

# SGD over all of the model's parameters with a learning rate of 1e-3.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [19]:
# Binary cross-entropy computed on raw logits, placed on the same device as the model.
criterion = nn.BCEWithLogitsLoss().to(device)
model = model.to(device)

In [20]:
# calculate how many rounded predictions equal the actual labels and average it across the batch
def binary_accuracy(preds, y):
    """
    Fraction of a batch predicted correctly, e.g. 8 out of 10 right gives 0.8, NOT 8.

    `preds` are raw logits: they go through a sigmoid and are rounded to the
    closest integer before being compared element-wise with `y`.
    """
    hard_preds = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so they can be summed and divided.
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)

In [21]:
# define train function

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`; return mean loss and mean accuracy.

    Args:
        model: Module called on `batch.text`; its output is squeezed on dim 1.
        iterator: Sized iterable of batches exposing `.text` and `.label`.
        optimizer: Optimizer that is zeroed before and stepped after each batch.
        criterion: Loss callable taking (predictions, labels).

    Returns:
        Tuple (loss, accuracy), each summed per batch and divided by len(iterator).
    """
    model.train()

    batch_losses = []
    batch_accs = []

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        # .item() pulls out plain Python floats for the running totals.
        batch_losses.append(batch_loss.item())
        batch_accs.append(batch_acc.item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [22]:
#define validation process
def evaluate(model, iterator, criterion):
    """Score `model` on `iterator` without gradient tracking; return mean loss and accuracy.

    Args:
        model: Module called on `batch.text`; its output is squeezed on dim 1.
        iterator: Sized iterable of batches exposing `.text` and `.label`.
        criterion: Loss callable taking (predictions, labels).

    Returns:
        Tuple (loss, accuracy), each summed per batch and divided by len(iterator).
    """
    model.eval()

    batch_losses = []
    batch_accs = []

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)

            batch_losses.append(criterion(logits, batch.label).item())
            batch_accs.append(binary_accuracy(logits, batch.label).item())

    n_batches = len(iterator)
    return sum(batch_losses) / n_batches, sum(batch_accs) / n_batches

In [23]:
#define timer
import time

def epoch_time(start_time, end_time):
    """Break the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple (minutes, seconds), both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [24]:
# Train for N_EPOCHS epochs, validating after each one.
# Whenever the validation loss improves, the weights are written to 'tut1-model.pt'.
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Time one full train + validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only on a strict improvement of the validation loss.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 9m 28s
	Train Loss: 0.694 | Train Acc: 50.10%
	 Val. Loss: 0.696 |  Val. Acc: 49.43%
Epoch: 02 | Epoch Time: 9m 21s
	Train Loss: 0.693 | Train Acc: 49.90%
	 Val. Loss: 0.696 |  Val. Acc: 49.69%
Epoch: 03 | Epoch Time: 9m 20s
	Train Loss: 0.693 | Train Acc: 49.91%
	 Val. Loss: 0.696 |  Val. Acc: 50.41%
Epoch: 04 | Epoch Time: 9m 21s
	Train Loss: 0.693 | Train Acc: 49.72%
	 Val. Loss: 0.696 |  Val. Acc: 49.12%
Epoch: 05 | Epoch Time: 9m 23s
	Train Loss: 0.693 | Train Acc: 49.97%
	 Val. Loss: 0.696 |  Val. Acc: 50.73%


In [26]:
# Testing phase: restore the checkpoint written by the training loop and score it on the test set.
# map_location keeps the load working when the checkpoint was saved on a GPU
# runtime but this cell runs on a CPU-only one.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.709 | Test Acc: 47.17%
