# Sentiment Analysis using Knowledge Distillation and Transformers 

Cloud and Machine Learning Project

Ashwin Prakash Nalwade (apn308), Mingxi Chen (mc7805)

We take advantage of knowledge distillation by using DistilBERT, a distilled version of BERT, through the `transformers` library provided by HuggingFace, one of the leading organisations working on NLP and transformers.

See this paper by Geoffrey Hinton et al. on [Knowledge Distillation](https://arxiv.org/abs/1503.02531) to learn more about the topic.

## Data Preparation

In [1]:
import torch

import random
import numpy as np

# One seed shared by every random number generator this notebook relies on.
SEED = 1234

# Seed Python's, NumPy's and PyTorch's generators in turn.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

# Ask cuDNN for deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 6.0MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 40.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 43.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=eccf6523c00553411e7

In [3]:
from transformers import DistilBertModel, DistilBertTokenizer

In [4]:
# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [5]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [6]:
# Tokenize a mixed-case sample sentence to see how the tokenizer normalises text.
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')

print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [7]:
# Map each token string to its integer id in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)

print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [8]:
# Special tokens exposed by the tokenizer: the classification token is used as the
# start-of-sequence marker and the separator token as the end-of-sequence marker.
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [9]:
# Look up the vocabulary ids of the special tokens by converting their string forms.
init_token_idx = tokenizer.convert_tokens_to_ids(init_token)
eos_token_idx = tokenizer.convert_tokens_to_ids(eos_token)
pad_token_idx = tokenizer.convert_tokens_to_ids(pad_token)
unk_token_idx = tokenizer.convert_tokens_to_ids(unk_token)

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [10]:
# The same ids, read directly from the tokenizer's *_token_id attributes.
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [11]:
# Longest sequence, in tokens and counting special tokens, that the loaded checkpoint accepts.
# Read it from the loaded tokenizer itself instead of indexing the per-checkpoint
# max_model_input_sizes table with a hard-coded model name.
max_input_length = tokenizer.model_max_length

print(max_input_length)

512


In [12]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Two positions are held back so that the init and eos tokens configured on the
    ``TEXT`` field still fit within the model's maximum input length.
    """
    return tokenizer.tokenize(sentence)[:max_input_length - 2]

Create splits

In [13]:
from torchtext import data

# Field describing how review text is processed: split with tokenize_and_cut, converted to
# ids by the DistilBERT tokenizer (use_vocab = False, so no torchtext vocabulary is built),
# and given the tokenizer's own init / eos / pad / unk ids.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Field for the sentiment label, produced as a float tensor.
LABEL = data.LabelField(dtype = torch.float)

In [14]:
from torchtext import datasets

# Load the IMDB train and test sets, processed through the TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data.
# random.seed() returns None, so writing `random_state = random.seed(SEED)` passes None and
# is reproducible only through the side effect of re-seeding. Seed explicitly and hand
# split() the resulting generator state instead.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.7MB/s]


In [15]:
# Report the size of each split, one line per split.
print(f"Number of training examples: {len(train_data)}",
      f"Number of validation examples: {len(valid_data)}",
      f"Number of testing examples: {len(test_data)}",
      sep = "\n")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [16]:
# Inspect one processed training example; its text is stored as token ids.
print(vars(train_data.examples[6]))

{'text': [1045, 3427, 1996, 2034, 2321, 2781, 1010, 3241, 2009, 2001, 1037, 2613, 4516, 1006, 2007, 2019, 29348, 2135, 15241, 6918, 1000, 2006, 4950, 1000, 3135, 1007, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2043, 1045, 3651, 2009, 2001, 2035, 9813, 1045, 2245, 1000, 2339, 2052, 1045, 2215, 2000, 5949, 2026, 2051, 3666, 2023, 18015, 1029, 1029, 1000, 2061, 1045, 2357, 2009, 2125, 1998, 2234, 3784, 2000, 11582, 2060, 2111, 1012, 1996, 3494, 2123, 1005, 1056, 2552, 1999, 1037, 19337, 2666, 12423, 2126, 1012, 2205, 2172, 26838, 7603, 1012, 2005, 1037, 3124, 2000, 3604, 2431, 2126, 2105, 1996, 2088, 2046, 1037, 2162, 7950, 2406, 1010, 2002, 6051, 2066, 1037, 4845, 1012, 1998, 1045, 2123, 1005, 1056, 2903, 2009, 2001, 2138, 1000, 2010, 2839, 2001, 2061, 6314, 2055, 1996, 3119, 2415, 20109, 1000, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2200, 13012, 2618, 1998, 5236, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2031, 2017, 2464, 1000, 2103, 1997, 2439, 233

In [17]:
# Convert the ids of that example back to token strings to make it readable.
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

print(tokens)

['i', 'watched', 'the', 'first', '15', 'minutes', ',', 'thinking', 'it', 'was', 'a', 'real', 'documentary', '(', 'with', 'an', 'irritating', '##ly', 'overly', 'dramatic', '"', 'on', 'camera', '"', 'producer', ')', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'when', 'i', 'realized', 'it', 'was', 'all', 'staged', 'i', 'thought', '"', 'why', 'would', 'i', 'want', 'to', 'waste', 'my', 'time', 'watching', 'this', 'junk', '?', '?', '"', 'so', 'i', 'turned', 'it', 'off', 'and', 'came', 'online', 'to', 'warn', 'other', 'people', '.', 'the', 'characters', 'don', "'", 't', 'act', 'in', 'a', 'bel', '##ie', '##vable', 'way', '.', 'too', 'much', 'immature', 'emotion', '.', 'for', 'a', 'guy', 'to', 'travel', 'half', 'way', 'around', 'the', 'world', 'into', 'a', 'war', 'torn', 'country', ',', 'he', 'acted', 'like', 'a', 'kid', '.', 'and', 'i', 'don', "'", 't', 'believe', 'it', 'was', 'because', '"', 'his', 'character', 'was', 'so', 'upset', 'about', 'the', 'trade', 'center', 'bombings', '"', '.', 

In [18]:
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [19]:
# Show the mapping from label string to integer index.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fdefb118ea0>, {'neg': 0, 'pos': 1})


In [20]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One batch iterator per split; batches are created on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [21]:
from transformers import DistilBertModel, DistilBertTokenizer

# Load the pretrained DistilBERT encoder for the same checkpoint as the tokenizer.
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




We use the transformer's outputs as contextual embeddings and feed them into a gated recurrent unit (GRU), which generates the sentiment prediction for the input.

In [22]:
import torch.nn as nn

class BERTGRUSentiment(nn.Module):
    """Sentiment classifier that runs a GRU over the outputs of a transformer encoder.

    Args:
        bert: encoder module. It must expose ``config.hidden_size`` and, when called on a
            batch of token ids, return a sequence whose first item holds the per-token
            hidden states.
        hidden_dim: size of the GRU hidden state.
        output_dim: number of values produced per example.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU reads the sequence in both directions.
        dropout: dropout probability applied to the final hidden state and, when
            ``n_layers`` is at least 2, between GRU layers.
    """

    def __init__(self, bert, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.bert = bert

        # The encoder's hidden size is the GRU's input size. Inter-layer dropout is only
        # requested when there is more than one layer.
        self.rnn = nn.GRU(bert.config.hidden_size,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = dropout if n_layers >= 2 else 0)

        # A bidirectional GRU contributes one final state per direction.
        n_directions = 2 if bidirectional else 1
        self.out = nn.Linear(hidden_dim * n_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Map token ids of shape [batch size, sent len] to scores of shape [batch size, out dim]."""
        # The encoder is run without gradient tracking.
        with torch.no_grad():
            embedded = self.bert(text)[0]  # [batch size, sent len, emb dim]

        # hidden = [n layers * n directions, batch size, hid dim]
        hidden = self.rnn(embedded)[1]

        if not self.rnn.bidirectional:
            # Final state of the top layer.
            summary = hidden[-1]
        else:
            # Final states of the top layer's two directions, joined feature-wise.
            summary = torch.cat((hidden[-2], hidden[-1]), dim = 1)

        # summary = [batch size, hid dim * n directions]; result = [batch size, out dim]
        return self.out(self.dropout(summary))

In [23]:
# Hyperparameters of the GRU head placed on top of the encoder.
HIDDEN_DIM = 256       # size of the GRU hidden state
OUTPUT_DIM = 1         # a single logit per review
N_LAYERS = 2           # stacked GRU layers
BIDIRECTIONAL = True   # read each sequence in both directions
DROPOUT = 0.25         # dropout probability

# Assemble the classifier around the pretrained encoder loaded earlier.
model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [24]:
def count_parameters(model):
    """Return the total number of elements in ``model``'s parameters that require gradients."""
    trainable = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable += tensor.numel()
    return trainable

# Trainable-parameter count of the full model, encoder included.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 69,122,049 trainable parameters


In [25]:
# Freeze the encoder: every parameter whose name starts with 'bert' stops requiring gradients.
for param_name, weights in model.named_parameters():
    if not param_name.startswith('bert'):
        continue
    weights.requires_grad = False

In [26]:
# Re-count now that the encoder weights are frozen. count_parameters is already defined in
# an earlier cell, so it is reused here rather than being defined a second time.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [27]:
# List the parameters that still require gradients, one name per line.
for name in (n for n, p in model.named_parameters() if p.requires_grad):
    print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


## Training the Model

Accuracy, training, timing, and evaluation functions.

In [28]:
import torch.optim as optim

# Adam with its default hyperparameters over the model's parameters.
optimizer = optim.Adam(model.parameters())

In [29]:
# Binary cross-entropy on raw logits; the model's forward pass applies no sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

In [30]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [31]:
def binary_accuracy(preds, y):
    """Fraction of a batch classified correctly.

    ``preds`` holds raw logits and ``y`` the 0/1 targets. The logits are passed through a
    sigmoid and rounded to 0 or 1 before being compared with ``y``. The result is a
    fraction (e.g. 0.8 for 8 correct out of 10), not a count.
    """
    hits = torch.sigmoid(preds).round().eq(y).float()  # 1.0 where the rounded prediction matches
    return hits.sum() / len(hits)

In [32]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Each batch must expose ``text`` and ``label``. Returns ``(loss, accuracy)``, each
    averaged over the number of batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        # Drop the singleton output dimension so the logits line up with the labels.
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [33]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating any weights.

    Each batch must expose ``text`` and ``label``. Returns ``(loss, accuracy)``, each
    averaged over the number of batches.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            # Drop the singleton output dimension so the logits line up with the labels.
            logits = model(batch.text).squeeze(1)
            loss_total += criterion(logits, batch.label).item()
            acc_total += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [34]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [35]:
# Number of full passes over the training data.
N_EPOCHS = 5

# Lowest validation loss seen so far; any real loss beats infinity.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint the weights whenever the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')

    # Per-epoch report: timing, then training and validation metrics.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
          f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
          f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
          sep = '\n')



Epoch: 01 | Epoch Time: 2m 22s
	Train Loss: 0.471 | Train Acc: 76.95%
	 Val. Loss: 0.289 |  Val. Acc: 88.19%
Epoch: 02 | Epoch Time: 2m 22s
	Train Loss: 0.285 | Train Acc: 88.35%
	 Val. Loss: 0.249 |  Val. Acc: 90.09%
Epoch: 03 | Epoch Time: 2m 21s
	Train Loss: 0.251 | Train Acc: 89.78%
	 Val. Loss: 0.237 |  Val. Acc: 90.40%
Epoch: 04 | Epoch Time: 2m 21s
	Train Loss: 0.218 | Train Acc: 91.33%
	 Val. Loss: 0.240 |  Val. Acc: 90.41%
Epoch: 05 | Epoch Time: 2m 21s
	Train Loss: 0.193 | Train Acc: 92.58%
	 Val. Loss: 0.225 |  Val. Acc: 91.66%


Test accuracy.

In [36]:
# Restore the checkpoint with the lowest validation loss. map_location lets a checkpoint that
# was saved on a GPU be loaded when the notebook is rerun on a CPU-only machine.
model.load_state_dict(torch.load('tut6-model.pt', map_location = device))

# Score the restored model on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.218 | Test Acc: 91.42%


## Predictions

In [37]:
def predict_sentiment(model, tokenizer, sentence):
    """Score one raw sentence with the trained classifier.

    Args:
        model: classifier that returns one logit per example for a [batch size, sent len]
            tensor of token ids.
        tokenizer: tokenizer used to split ``sentence`` and map tokens to ids; its
            ``cls_token_id`` and ``sep_token_id`` wrap the sequence.
        sentence: text to classify.

    Returns:
        float: sigmoid of the model's logit. With the label vocabulary built in this
        notebook ('neg': 0, 'pos': 1), values near 1 indicate positive sentiment.
    """
    model.eval()

    # Truncate so the two special tokens added below still fit in the model's input.
    tokens = tokenizer.tokenize(sentence)[:max_input_length - 2]

    # Take the special-token ids from the tokenizer that was passed in, so the function
    # stays consistent with its own argument instead of relying on notebook globals.
    indexed = [tokenizer.cls_token_id] + tokenizer.convert_tokens_to_ids(tokens) + [tokenizer.sep_token_id]

    # Add a batch dimension: [sent len] -> [1, sent len].
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(0)

    # Inference only: skip autograd bookkeeping for the GRU and the linear head.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))

    return prediction.item()

In [38]:
# A clearly negative review; the score is expected to be close to 0.
predict_sentiment(model, tokenizer, "This film is terrible")

0.016997352242469788

In [39]:
# A clearly positive review; the score is expected to be close to 1.
predict_sentiment(model, tokenizer, "This film is great")

0.9688772559165955