In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy link en_core_web_sm en

Collecting spacy
[?25l  Downloading https://files.pythonhosted.org/packages/10/b5/c7a92c7ce5d4b353b70b4b5b4385687206c8b230ddfe08746ab0fd310a3a/spacy-2.3.2-cp36-cp36m-manylinux1_x86_64.whl (9.9MB)
[K     |████████████████████████████████| 10.0MB 13.4MB/s ta 0:00:01
Collecting plac<1.2.0,>=0.9.6 (from spacy)
  Downloading https://files.pythonhosted.org/packages/86/85/40b8f66c2dd8f4fd9f09d59b22720cffecf1331e788b8a0cab5bafb353d1/plac-1.1.3-py2.py3-none-any.whl
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/ef/25/8cb4d3dfce05cd261adadfec0cbc4d90a7ddf6abac888b2d354042986b64/cymem-2.0.4.tar.gz (56kB)
[K     |████████████████████████████████| 61kB 2.5MB/s eta 0:00:011
[?25hCollecting blis<0.5.0,>=0.4.0 (from spacy)
[?25l  Downloading https://files.pythonhosted.org/packages/41/19/f95c75562d18eb27219df3a3590b911e78d131b68466ad79fdf5847eaac4/blis-0.4.1-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 

In [2]:
import torch

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load IMDb dataset

In [3]:
from torchtext.data import Field, LabelField
from torchtext.datasets import IMDB

TEXT = Field(tokenize='spacy')
LABEL = LabelField(dtype=torch.float)

train_data, test_data = IMDB.splits(TEXT, LABEL)

aclImdb_v1.tar.gz:   5%|▍         | 3.98M/84.1M [00:00<00:02, 39.7MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 33.4MB/s]


In [4]:
print(vars(train_data.examples[0]))

{'text': ['Zentropa', 'has', 'much', 'in', 'common', 'with', 'The', 'Third', 'Man', ',', 'another', 'noir', '-', 'like', 'film', 'set', 'among', 'the', 'rubble', 'of', 'postwar', 'Europe', '.', 'Like', 'TTM', ',', 'there', 'is', 'much', 'inventive', 'camera', 'work', '.', 'There', 'is', 'an', 'innocent', 'American', 'who', 'gets', 'emotionally', 'involved', 'with', 'a', 'woman', 'he', 'does', "n't", 'really', 'understand', ',', 'and', 'whose', 'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast', 'with', 'the', 'natives.<br', '/><br', '/>But', 'I', "'d", 'have', 'to', 'say', 'that', 'The', 'Third', 'Man', 'has', 'a', 'more', 'well', '-', 'crafted', 'storyline', '.', 'Zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this', 'respect', '.', 'Perhaps', 'this', 'is', 'intentional', ':', 'it', 'is', 'presented', 'as', 'a', 'dream', '/', 'nightmare', ',', 'and', 'making', 'it', 'too', 'coherent', 'would', 'spoil', 'the', 'effect', '.', '<', 'br', '/><br', '/>This', 'movie', 'i

# Split validation set out of training set

In [5]:
import random

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

In [6]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


# Build vocabulary

In [7]:
TEXT.build_vocab(train_data, max_size=25000, vectors='glove.6B.100d')
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                            
100%|█████████▉| 399999/400000 [00:22<00:00, 17779.48it/s]


In [8]:
print(f'Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}')
print(f'Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}')

Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [9]:
print(list(LABEL.vocab.stoi.items()))
print(list(TEXT.vocab.stoi.items())[:20])
print()
print(TEXT.vocab.freqs.most_common(20))

[('neg', 0), ('pos', 1)]
[('<unk>', 0), ('<pad>', 1), ('the', 2), (',', 3), ('.', 4), ('a', 5), ('and', 6), ('of', 7), ('to', 8), ('is', 9), ('in', 10), ('I', 11), ('it', 12), ('that', 13), ('"', 14), ("'s", 15), ('this', 16), ('-', 17), ('/><br', 18), ('was', 19)]

[('the', 204107), (',', 194270), ('.', 166464), ('a', 110139), ('and', 109993), ('of', 101553), ('to', 94337), ('is', 76827), ('in', 61595), ('I', 54376), ('it', 53662), ('that', 49415), ('"', 44401), ("'s", 43781), ('this', 42212), ('-', 37510), ('/><br', 35656), ('was', 35057), ('as', 30614), ('with', 30210)]


In [10]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']


# Create iterators

In [11]:
from torchtext.data import BucketIterator

BATCH_SIZE = 64

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)



In [12]:
print(next(iter(train_iter)).label.shape)
print(next(iter(train_iter)).text.shape)



torch.Size([64])
torch.Size([1134, 64])


In [13]:
batch = next(iter(train_iter))
print(batch.text.shape) 
print([TEXT.vocab.itos[i] for i in batch.text[:, 0]])

torch.Size([1103, 64])
['This', 'film', 'was', 'a', 'waste', 'of', 'time', ',', 'even', 'rented', 'on', 'DVD', '.', 'If', 'super', '-', '<unk>', 'camera', 'shots', 'get', 'any', 'faster', 'than', 'this', ',', 'we', 'might', 'as', 'well', 'pay', 'twenty', 'bucks', 'to', 'get', 'in', 'the', '<unk>', ',', 'get', 'popcorn', ',', 'and', 'watch', 'the', '<unk>', 'spin', '.', 'Jet', 'Li', 'is', 'so', 'much', 'better', 'than', 'this', '.', 'One', 'can', 'only', 'hope', 'that', 'he', 'wo', "n't", 'be', 'making', 'deals', 'anytime', 'soon', 'to', 'make', 'another', 'cliche', '-', 'ridden', 'film', 'like', 'The', '<unk>', '/><br', '/>If', 'there', "'s", 'one', 'film', 'you', 'should', 'avoid', ',', 'this', 'is', '"', 'The', 'One', '"', '.', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

# Define some utility functions for training/evaluating models

In [14]:
import time
import torch.nn as nn
from torch.optim import Adam

EMBEDDING_DIM = TEXT.vocab.vectors.shape[1]
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def init_params(model):
    model.embedding.weight.data.copy_(TEXT.vocab.vectors)
    for idx in [UNK_IDX, PAD_IDX]:
        model.embedding.weight.data[idx] = torch.zeros(EMBEDDING_DIM)
        
loss_fn = nn.BCEWithLogitsLoss()

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y)
    acc = correct.sum() / len(y)
    return acc.item()

def train_epoch(model, optimizer, iterator=train_iter, criterion=loss_fn):
    epoch_loss = epoch_acc = sample_count = 0
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        x, y = batch.text, batch.label
        output = model(x).squeeze()
        loss = criterion(output, y)
        acc = binary_accuracy(output, y)
        
        loss.backward()
        optimizer.step()
        
        batch_size = len(y)
        sample_count += batch_size
        epoch_loss += loss.item() * batch_size
        epoch_acc += acc * batch_size
        
    return epoch_loss / sample_count, epoch_acc / sample_count

@torch.no_grad()
def evaluate(model, iterator, criterion=loss_fn):
    epoch_loss = epoch_acc = sample_count = 0
    model.eval()
    
    for batch in iterator:
        x, y = batch.text, batch.label
        output = model(x).squeeze()
        loss = criterion(output, y)
        acc = binary_accuracy(output, y)
        
        batch_size = len(y)
        sample_count += batch_size
        epoch_loss += loss.item() * batch_size
        epoch_acc += acc * batch_size
        
    return epoch_loss / sample_count, epoch_acc / sample_count

def epoch_time(start, end):
    elapsed = end - start
    mins = int(elapsed / 60)
    secs = int(elapsed - mins * 60)
    return mins, secs

def train(model, filename, optim=Adam, epochs=20):
    optimizer = optim(model.parameters())
    min_valid_loss = float('inf')
    
    for epoch in range(epochs):
        start = time.time()
        
        train_loss, train_acc = train_epoch(model, optimizer)
        valid_loss, valid_acc = evaluate(model, valid_iter)
        
        end = time.time()
        mins, secs = epoch_time(start, end)
        
        if valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            torch.save(model.state_dict(), f'{filename}.pt')
            
        print(f'Epoch: {epoch + 1:02} | Epoch Time: {mins}m {secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')

# Word averaging model

In [16]:
class WordAVGModel(nn.Module):
    
    def __init__(self, vocab_size=len(TEXT.vocab), embedding_dim=EMBEDDING_DIM, output_dim=1, pad_idx=PAD_IDX):
        super(WordAVGModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
    
    def forward(self, inp): # seq_len, batch_size
        embedded = self.embedding(inp) # seq_len, batch_size, embedding_dim
        pooled = embedded.mean(0) # batch_size, embedding_dim
        return self.fc(pooled) # batch_size, output_dim
        

In [17]:
word_avg_model = WordAVGModel()
count_params(word_avg_model)

2500301

In [19]:
word_avg_model.to(device)
init_params(word_avg_model)
train(word_avg_model, 'word_avg')

Epoch: 01 | Epoch Time: 0m 4s
	Train Loss: 0.685 | Train Acc: 60.22%
	 Val. Loss: 0.625 |  Val. Acc: 68.57%
Epoch: 02 | Epoch Time: 0m 4s
	Train Loss: 0.641 | Train Acc: 74.00%
	 Val. Loss: 0.521 |  Val. Acc: 75.20%
Epoch: 03 | Epoch Time: 0m 4s
	Train Loss: 0.568 | Train Acc: 79.13%
	 Val. Loss: 0.457 |  Val. Acc: 80.40%
Epoch: 04 | Epoch Time: 0m 4s
	Train Loss: 0.495 | Train Acc: 83.37%
	 Val. Loss: 0.443 |  Val. Acc: 82.96%
Epoch: 05 | Epoch Time: 0m 4s
	Train Loss: 0.433 | Train Acc: 86.12%
	 Val. Loss: 0.435 |  Val. Acc: 84.88%
Epoch: 06 | Epoch Time: 0m 4s
	Train Loss: 0.382 | Train Acc: 88.01%
	 Val. Loss: 0.449 |  Val. Acc: 86.00%
Epoch: 07 | Epoch Time: 0m 4s
	Train Loss: 0.342 | Train Acc: 89.40%
	 Val. Loss: 0.476 |  Val. Acc: 86.76%
Epoch: 08 | Epoch Time: 0m 4s
	Train Loss: 0.310 | Train Acc: 90.34%
	 Val. Loss: 0.494 |  Val. Acc: 87.21%
Epoch: 09 | Epoch Time: 0m 4s
	Train Loss: 0.286 | Train Acc: 91.07%
	 Val. Loss: 0.519 |  Val. Acc: 87.47%
Epoch: 10 | Epoch Time: 0m 4

# RNN model

In [21]:
class RNNModel(nn.Module):
    
    def __init__(self, hidden_dim, num_layers, bidirectional, dropout, 
                 vocab_size=len(TEXT.vocab), embedding_dim=EMBEDDING_DIM, 
                 output_dim=1, pad_idx=PAD_IDX):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)
        
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_directions = 2 if bidirectional else 1
        
        self.fc = nn.Linear(self.num_directions * hidden_dim, output_dim)
    
    def forward(self, inp): # seq_len, batch_size
        embedded = self.dropout(self.embedding(inp)) # seq_len, batch_size, embedding_dim
        _, (hidden, _) = self.rnn(embedded) # num_layers * num_directions, batch_size, hidden_dim
        last_hidden = hidden[-self.num_directions:, :, :] # num_directions, batch_size, hidden_dim
        if self.num_directions == 1:
            output = last_hidden.squeeze(0)
        else:
            output = torch.cat([last_hidden[0, :, :], last_hidden[1, :, :]], dim=1)
        output = self.dropout(output) # batch_size, hidden_dim * num_directions
        return self.fc(output.squeeze(0)) # (batch_size > 1), output_dim

In [22]:
HIDDEN_DIM = 256
NUM_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT=0.5

rnn_model = RNNModel(HIDDEN_DIM, NUM_LAYERS, BIDIRECTIONAL, DROPOUT)
count_params(rnn_model)

4810857

In [23]:
rnn_model.to(device)
init_params(rnn_model)
train(rnn_model, 'rnn')



Epoch: 01 | Epoch Time: 2m 5s
	Train Loss: 0.664 | Train Acc: 59.21%
	 Val. Loss: 0.558 |  Val. Acc: 73.53%
Epoch: 02 | Epoch Time: 2m 4s
	Train Loss: 0.599 | Train Acc: 67.99%
	 Val. Loss: 0.435 |  Val. Acc: 81.12%
Epoch: 03 | Epoch Time: 2m 4s
	Train Loss: 0.442 | Train Acc: 80.62%
	 Val. Loss: 0.438 |  Val. Acc: 80.84%
Epoch: 04 | Epoch Time: 2m 5s
	Train Loss: 0.300 | Train Acc: 88.25%
	 Val. Loss: 0.316 |  Val. Acc: 87.23%
Epoch: 05 | Epoch Time: 2m 3s
	Train Loss: 0.219 | Train Acc: 91.57%
	 Val. Loss: 0.287 |  Val. Acc: 88.80%
Epoch: 06 | Epoch Time: 2m 5s
	Train Loss: 0.159 | Train Acc: 94.19%
	 Val. Loss: 0.295 |  Val. Acc: 88.01%
Epoch: 07 | Epoch Time: 2m 4s
	Train Loss: 0.130 | Train Acc: 95.26%
	 Val. Loss: 0.357 |  Val. Acc: 88.57%
Epoch: 08 | Epoch Time: 2m 4s
	Train Loss: 0.097 | Train Acc: 96.74%
	 Val. Loss: 0.337 |  Val. Acc: 88.83%
Epoch: 09 | Epoch Time: 2m 4s
	Train Loss: 0.075 | Train Acc: 97.45%
	 Val. Loss: 0.390 |  Val. Acc: 89.04%
Epoch: 10 | Epoch Time: 2m 3

# CNN model

In [28]:
import torch.nn.functional as F

class CNNModel(nn.Module):
    
    def __init__(self, n_filters, filter_sizes, dropout, 
                 vocab_size=len(TEXT.vocab), embedding_dim=EMBEDDING_DIM, 
                 output_dim=1, pad_idx=PAD_IDX):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, n_filters, kernel_size=(fs, embedding_dim)) for fs in filter_sizes])
        self.dropout = nn.Dropout(dropout)
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
    def forward(self, inp): # seq_len, batch_size
        inp = inp.permute(1, 0) # batch_size, seq_len
        embedded = self.dropout(self.embedding(inp)) # batch_size, seq_len, embedding_dim
        embedded = embedded.unsqueeze(1) # batch_size, 1, seq_len, embedding_dim
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] # [batch_size, n_filters, seq_len - filter_sizes + 1] * len(filter_sizes)
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] # [batch_size, n_filters] * len(filter_sizes)
        concated = self.dropout(torch.cat(pooled, dim=1)) # batch_size, n_filters * len(filter_sizes)
        return self.fc(concated)

In [29]:
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
DROPOUT = 0.5

cnn_model = CNNModel(N_FILTERS, FILTER_SIZES, DROPOUT)
count_params(cnn_model)

2620801

In [30]:
cnn_model.to(device)
init_params(cnn_model)
train(cnn_model, 'cnn')

Epoch: 01 | Epoch Time: 0m 17s
	Train Loss: 0.620 | Train Acc: 64.51%
	 Val. Loss: 0.422 |  Val. Acc: 81.53%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 0.419 | Train Acc: 80.89%
	 Val. Loss: 0.354 |  Val. Acc: 84.99%
Epoch: 03 | Epoch Time: 0m 17s
	Train Loss: 0.337 | Train Acc: 85.24%
	 Val. Loss: 0.327 |  Val. Acc: 86.47%
Epoch: 04 | Epoch Time: 0m 17s
	Train Loss: 0.294 | Train Acc: 87.55%
	 Val. Loss: 0.304 |  Val. Acc: 87.64%
Epoch: 05 | Epoch Time: 0m 17s
	Train Loss: 0.249 | Train Acc: 89.66%
	 Val. Loss: 0.287 |  Val. Acc: 88.39%
Epoch: 06 | Epoch Time: 0m 17s
	Train Loss: 0.220 | Train Acc: 91.17%
	 Val. Loss: 0.284 |  Val. Acc: 88.68%
Epoch: 07 | Epoch Time: 0m 17s
	Train Loss: 0.177 | Train Acc: 93.10%
	 Val. Loss: 0.290 |  Val. Acc: 88.60%
Epoch: 08 | Epoch Time: 0m 17s
	Train Loss: 0.162 | Train Acc: 93.77%
	 Val. Loss: 0.293 |  Val. Acc: 88.76%
Epoch: 09 | Epoch Time: 0m 17s
	Train Loss: 0.137 | Train Acc: 94.73%
	 Val. Loss: 0.319 |  Val. Acc: 87.96%
Epoch: 10 | Epoch T

# Load the best model parameters for each model and test

In [31]:
word_avg_model.load_state_dict(torch.load('word_avg.pt'))
rnn_model.load_state_dict(torch.load('rnn.pt'))
cnn_model.load_state_dict(torch.load('cnn.pt'))

<All keys matched successfully>

In [32]:
def get_test_metrics(model):
    loss, acc = evaluate(model, test_iter)
    print(f'Test Loss: {loss:.3f} |  Test Acc: {acc * 100:.2f}%')

In [33]:
for model in [word_avg_model, rnn_model, cnn_model]:
    print(model.__class__.__name__)
    get_test_metrics(model)
    print()

WordAVGModel




Test Loss: 0.431 |  Test Acc: 84.34%

RNNModel
Test Loss: 0.331 |  Test Acc: 86.33%

CNNModel
Test Loss: 0.296 |  Test Acc: 87.86%

