In [1]:
!pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.8.0+cu111
  Downloading https://download.pytorch.org/whl/cu111/torch-1.8.0%2Bcu111-cp37-cp37m-linux_x86_64.whl (1982.2 MB)
[K     |█████████████▌                  | 834.1 MB 1.3 MB/s eta 0:15:07tcmalloc: large alloc 1147494400 bytes == 0x3a346000 @  0x7f7a6470d615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |█████████████████               | 1055.7 MB 1.3 MB/s eta 0:11:42tcmalloc: large alloc 1434370048 bytes == 0x7e99c000 @  0x7f7a6470d615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e

In [2]:
!pip install torchtext==0.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.9
  Downloading torchtext-0.9.0-cp37-cp37m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 19.9 MB/s 
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.12.0
    Uninstalling torchtext-0.12.0:
      Successfully uninstalled torchtext-0.12.0
Successfully installed torchtext-0.9.0


In [3]:
import time
import torch
import random
import torch.nn as nn
import torch.optim as optim

from torchtext.legacy import data, datasets
from tqdm import tqdm

SEED = 42

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [5]:
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [7]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 8.53MB/s]


In [8]:
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [9]:
MAX_VOCAB_SIZE = 25_000

TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                           
100%|█████████▉| 399999/400000 [00:15<00:00, 26490.88it/s]


In [10]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

In [22]:
class BILSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.rnn = nn.LSTM(
            emb_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input, input_lengths):
        batch_size = input.shape[1]
        embedded = self.embedding(input)
        embedded = self.dropout(embedded)

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, input_lengths.to('cpu'))
        packed_output, (hidden, cell) = self.rnn(packed_embedded)

        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)

        # output: (seq_len, batch_size, hidden_dim * num_direction)
        # hidden: (num_layers * num_direction, batch_size, hidden_dim)
        # cell: (num_layers * num_direction, batch_size, hidden_dim)

        hidden = hidden.view(self.n_layers, -1, batch_size, self.hidden_dim)

        hidden = torch.cat((hidden[-1][0][:], hidden[-1][1][:]), dim=1)
        hidden = self.dropout(hidden)

        logits = self.fc(hidden)

        return logits


In [23]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = BILSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)

In [24]:
pretrained_embeddings = TEXT.vocab.vectors

In [25]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 1.9269,  1.4873,  0.9007,  ...,  0.1233,  0.3499,  0.6173],
        [ 0.7262,  0.0912, -0.3891,  ...,  0.0821,  0.4440, -0.7240],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2012,  0.4027, -0.3398,  ..., -0.0332, -0.8565, -0.3024],
        [ 0.0406,  0.1348, -0.3367,  ...,  0.3521, -0.1215, -0.5980],
        [ 0.1223, -0.4096,  0.2615,  ...,  0.5974,  0.1588,  0.3312]])

In [26]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.2012,  0.4027, -0.3398,  ..., -0.0332, -0.8565, -0.3024],
        [ 0.0406,  0.1348, -0.3367,  ...,  0.3521, -0.1215, -0.5980],
        [ 0.1223, -0.4096,  0.2615,  ...,  0.5974,  0.1588,  0.3312]])


In [27]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [28]:
def binary_accuracy(preds, y):
    predicted = torch.round(torch.sigmoid(preds))
    correct = (predicted == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [29]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()

        input, input_lengths = batch.text
        preds = model(input, input_lengths)
        preds = preds.squeeze(1)

        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    
    for batch in iterator:
        input, input_lengths = batch.text
        preds = model(input, input_lengths)
        preds = preds.squeeze(1)
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [31]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [32]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 30s
	Train Loss: 0.657 | Train Acc: 60.18%
	 Val. Loss: 0.562 |  Val. Acc: 71.81%
Epoch: 02 | Epoch Time: 0m 32s
	Train Loss: 0.544 | Train Acc: 72.46%
	 Val. Loss: 0.428 |  Val. Acc: 80.80%
Epoch: 03 | Epoch Time: 0m 32s
	Train Loss: 0.475 | Train Acc: 77.90%
	 Val. Loss: 0.437 |  Val. Acc: 80.37%
Epoch: 04 | Epoch Time: 0m 31s
	Train Loss: 0.458 | Train Acc: 78.25%
	 Val. Loss: 0.417 |  Val. Acc: 81.21%
Epoch: 05 | Epoch Time: 0m 32s
	Train Loss: 0.361 | Train Acc: 84.46%
	 Val. Loss: 0.359 |  Val. Acc: 84.32%


In [34]:
model.load_state_dict(torch.load('model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.357 | Test Acc: 84.51%
