<a href="https://colab.research.google.com/github/ccarpenterg/introNLP/blob/master/03b_NLP_and_recurrent_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-trained Word Embeddings and RNNs

In [0]:
import torch
from torchtext import data

# Single seed shared by every source of randomness used in this notebook.
SEED = 3773

# Ask cuDNN for deterministic kernels, then seed PyTorch's generator.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

# LABEL is stored as float so it can be fed directly to a float-valued loss.
LABEL = data.LabelField(dtype=torch.float)
# TEXT tokenizes each review with the spaCy tokenizer.
TEXT = data.Field(tokenize='spacy')

In [2]:
from torchtext import datasets

# Load the IMDB reviews as (train, test) datasets, processed with the TEXT and
# LABEL fields defined above. The archive is downloaded on first use.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:07<00:00, 11.0MB/s]


In [3]:
# Sanity check: report how many examples each split contains.
print("Number of training examples: {}".format(len(train_data)))
print("Number of testing examples: {}".format(len(test_data)))

Number of training examples: 25000
Number of testing examples: 25000


In [0]:
# Dump the attributes of the first training example to see what one record holds.
print(vars(train_data.examples[0]))

In [5]:
import random

# Carve a validation set (20%) out of the training data.
#
# random.seed() returns None, so writing `random_state=random.seed(SEED)` passed
# None to split() and was reproducible only through the side effect of seeding
# the global generator. Seed explicitly and hand over the resulting generator
# state so the intent is visible in the code.
# NOTE(review): torchtext is expected to fall back on random.getstate() when
# random_state is None, so the resulting split should be unchanged -- confirm.
# NOTE: this rebinds `train_data`; re-running the cell splits the already
# reduced training set again.
random.seed(SEED)

train_data, valid_data = train_data.split(
    split_ratio=0.8,
    random_state=random.getstate()
)

print("Number of training examples: {}".format(len(train_data)))
print("Number of validation examples: {}".format(len(valid_data)))
print("Number of testing examples: {}".format(len(test_data)))

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [6]:
# Upper bound on the number of distinct tokens kept in the TEXT vocabulary.
MAX_VOCAB_SIZE = 8185

# Options for the text vocabulary: cap its size, attach 100-d GloVe vectors,
# and use in-place normal sampling as the initializer for unknown tokens.
text_vocab_kwargs = dict(
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data, **text_vocab_kwargs)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [06:30, 2.21MB/s]                           
100%|█████████▉| 398986/400000 [00:16<00:00, 25878.13it/s]

In [7]:
# Report the vocabulary sizes. In the recorded run the TEXT vocabulary has
# MAX_VOCAB_SIZE + 2 entries, the extra two being the special '<unk>' and
# '<pad>' tokens.
print("Unique tokens in TEXT vocabulary: {}".format(len(TEXT.vocab)))
print("Unique tokens in LABEL vocabulary: {}".format(len(LABEL.vocab)))

Unique tokens in TEXT vocabulary: 8187
Unique tokens in LABEL vocabulary: 2


In [8]:
# Show the 20 most frequent tokens with their counts. The recorded output
# includes '/><br', so HTML line-break markup survives tokenization.
print(TEXT.vocab.freqs.most_common(20))

[('the', 232678), (',', 220840), ('.', 188920), ('and', 125362), ('a', 125266), ('of', 115884), ('to', 107654), ('is', 87196), ('in', 70206), ('I', 62349), ('it', 61298), ('that', 56438), ('"', 50419), ("'s", 49667), ('this', 48419), ('-', 41945), ('/><br', 41022), ('was', 40196), ('as', 35006), ('with', 34063)]


In [9]:
# First ten entries of the index-to-token table; the recorded output shows the
# special tokens '<unk>' and '<pad>' at indices 0 and 1.
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [10]:
# Label-to-index mapping. In the recorded run 'pos' maps to 0 and 'neg' maps
# to 1, so a target of 1 means a NEGATIVE review for this run.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fa01045f378>, {'pos': 0, 'neg': 1})


In [0]:
BATCH_SIZE = 64

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One iterator per split, all with the same batch size and target device.
dataset_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    dataset_splits,
    batch_size=BATCH_SIZE,
    device=device
)

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    """
    LSTM text classifier: embedding -> (bi)LSTM -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of values produced per sequence.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, output_dim,
                 n_layers, bidirectional):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional)

        # The classifier input is the final hidden state of every direction, so
        # its width depends on `bidirectional` (previously hard-coded to 2x,
        # which made bidirectional=False unusable).
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

    def forward(self, sequences):
        """
        Map a batch of token-index sequences to one output row per sequence.

        Args:
            sequences: integer tensor of shape (max sequence length, batch size).

        Returns:
            Tensor of shape (batch size, output_dim) with raw, un-activated scores.
        """

        # sequences: (max sequences length, batch size)
        seq_embeddings = self.embedding(sequences)

        # seq_embeddings: (max sequences length, batch size, embedding dim)
        _, (hidden, _) = self.rnn(seq_embeddings)

        # hidden: (num_layers * num_directions, batch size, hidden dim)
        if self.rnn.bidirectional:
            # The last two entries are the top layer's forward and backward states.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Only one direction: the last entry is the top layer's final state.
            final_hidden = hidden[-1, :, :]

        # final_hidden: (batch size, num_directions * hidden dim)
        # returned value: (batch size, output dim)
        return self.fc(final_hidden)

In [0]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

## Model 1 - 1 bidirectional layer, no pre-trained embeddings

In [0]:
# Hyper-parameters for model 1 (embeddings learned from scratch).
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # a single score per review
N_LAYERS = 1
BIDIRECTIONAL = True

model_v1 = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
)

In [15]:
# Report how many parameters of model 1 will be updated during training.
print("The model has {:,} trainable parameters".format(count_parameters(model_v1)))

The model has 1,552,397 trainable parameters


## Model 2 - 1 bidirectional layer, pre-trained embeddings (GloVe)

In [16]:
# Hyper-parameters for model 2: same architecture as model 1.
INPUT_DIM = len(TEXT.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100           # matches the 100-d vectors loaded into the vocab
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = True

model_v2 = RNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
)

# Overwrite the randomly initialised embedding table with the vectors attached
# to the vocabulary. The weights stay trainable; only the starting values change.
pretrained_embeddings = TEXT.vocab.vectors
model_v2.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1285, -0.7403, -0.7101,  ...,  0.0324, -0.4687,  1.6241],
        [ 0.3661, -0.2995, -0.1835,  ..., -2.0702,  1.9870,  1.3561],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.2122, -0.5872,  0.2765,  ...,  0.0076, -0.2042,  0.2265],
        [-0.5347,  0.2205, -0.5386,  ..., -0.3264,  1.0983, -0.1159],
        [-0.4323, -0.3820, -0.1995,  ..., -0.4230,  0.1340, -0.1766]])

In [17]:
# Report how many parameters of model 2 will be updated during training.
print("The model has {:,} trainable parameters".format(count_parameters(model_v2)))

The model has 1,552,397 trainable parameters


In [0]:
import torch.optim as optim

# One Adam optimizer (default hyper-parameters) per model, so that the two
# models are trained independently of each other.
optimizer_v1 = optim.Adam(params=model_v1.parameters())
optimizer_v2 = optim.Adam(params=model_v2.parameters())

In [0]:
# Binary cross-entropy on raw scores (the sigmoid is applied inside the loss),
# placed on the same device as the models.
criterion = nn.BCEWithLogitsLoss().to(device)

# Move both models to the selected device.
model_v1 = model_v1.to(device)
model_v2 = model_v2.to(device)

In [0]:
def accuracy(outputs, labels):
    """
    Return the fraction of a batch that is classified correctly.

    `outputs` are raw scores; they are passed through a sigmoid and rounded to
    0 or 1 before being compared element-wise with `labels`.

    Which sentiment corresponds to class 1 is decided by LABEL.vocab.stoi,
    not by this function (the recorded run maps 'neg' to 1 and 'pos' to 0).
    """

    # Probability of class 1 for every item in the batch.
    probabilities = torch.sigmoid(outputs)

    # Round to the nearest class index: 0 or 1.
    predicted = torch.round(probabilities)

    # 1.0 where the prediction matches the label, 0.0 elsewhere.
    hits = (predicted == labels).float()
    return hits.sum() / len(hits)

In [0]:
def train(model, iterator, optimizer, criterion):
    """
    Run one training epoch and return the mean loss and accuracy per batch.

    The fourth parameter used to be misspelled `cruterion`, so the body
    silently used the global `criterion` and ignored the argument. It is now
    spelled correctly and the loss passed by the caller is the one applied.

    Args:
        model: module called on `batch.text`; its output is squeezed on dim 1.
        iterator: iterable of batches exposing `.text` and `.label`; must
            support `len()`.
        optimizer: optimizer that updates the model's parameters.
        criterion: loss function called as `criterion(outputs, batch.label)`.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches.
    """

    epoch_loss = 0
    epoch_acc = 0

    # Put the model in training mode.
    model.train()

    for batch in iterator:

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Drop dimension 1 of the model output so it lines up with batch.label.
        outputs = model(batch.text).squeeze(1)

        loss = criterion(outputs, batch.label)

        acc = accuracy(outputs, batch.label)

        loss.backward()

        optimizer.step()

        # .item() yields plain Python numbers, so no graph is kept alive.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    """
    Score the model on every batch of `iterator` without updating it.

    Args:
        model: module called on `batch.text`; its output is squeezed on dim 1.
        iterator: iterable of batches exposing `.text` and `.label`; must
            support `len()`.
        criterion: loss function called as `criterion(outputs, batch.label)`.

    Returns:
        (mean loss, mean accuracy), each averaged over the number of batches.
    """

    n_batches = len(iterator)
    running_loss = 0
    running_acc = 0

    # Evaluation mode, and no gradient tracking while scoring.
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            targets = batch.label
            scores = model(batch.text).squeeze(1)

            batch_loss = criterion(scores, targets)
            batch_acc = accuracy(scores, targets)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    return running_loss / n_batches, running_acc / n_batches

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps into (minutes, seconds).

    Both timestamps are in seconds; the difference is truncated to a whole
    number of seconds before being split.
    """
    whole_seconds = int(end_time - start_time)
    return whole_seconds // 60, whole_seconds % 60

## Model 1 - Training

In [25]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # Time one full pass: training on the train split, then scoring on validation.
    start_time = time.time()
    train_loss, train_acc = train(model_v1, train_iterator, optimizer_v1, criterion)
    valid_loss, valid_acc = evaluate(model_v1, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Three-line report per epoch; accuracies are shown as percentages.
    report = (
        "Epoch: {:02} | Epoch Time: {}m {}s".format(epoch+1, epoch_mins, epoch_secs),
        "\tTrain Loss: {:.3f} | Train Acc: {:.2f}%".format(train_loss, train_acc*100),
        "\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%".format(valid_loss, valid_acc*100),
    )
    print("\n".join(report))

Epoch: 01 | Epoch Time: 0m 45s
	Train Loss: 0.681 | Train Acc: 56.12%
	 Val. Loss: 0.657 |  Val. Acc: 61.79%
Epoch: 02 | Epoch Time: 0m 46s
	Train Loss: 0.640 | Train Acc: 63.49%
	 Val. Loss: 0.661 |  Val. Acc: 61.00%
Epoch: 03 | Epoch Time: 0m 50s
	Train Loss: 0.598 | Train Acc: 68.43%
	 Val. Loss: 0.563 |  Val. Acc: 72.49%
Epoch: 04 | Epoch Time: 0m 50s
	Train Loss: 0.533 | Train Acc: 74.36%
	 Val. Loss: 0.563 |  Val. Acc: 71.84%
Epoch: 05 | Epoch Time: 0m 51s
	Train Loss: 0.588 | Train Acc: 68.16%
	 Val. Loss: 0.585 |  Val. Acc: 70.51%


In [26]:
# Final check of model 1 on the held-out test split, which was not used
# during training or validation.
test_loss, test_acc = evaluate(model_v1, test_iterator, criterion)

print("Test Loss: {:.3f} |  Test Acc: {:.2f}%".format(test_loss, test_acc*100))

Test Loss: 0.590 |  Test Acc: 69.27%


## Model 2 - Training

In [27]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    # Time one full pass: training on the train split, then scoring on validation.
    start_time = time.time()
    train_loss, train_acc = train(model_v2, train_iterator, optimizer_v2, criterion)
    valid_loss, valid_acc = evaluate(model_v2, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Three-line report per epoch; accuracies are shown as percentages.
    report = (
        "Epoch: {:02} | Epoch Time: {}m {}s".format(epoch+1, epoch_mins, epoch_secs),
        "\tTrain Loss: {:.3f} | Train Acc: {:.2f}%".format(train_loss, train_acc*100),
        "\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%".format(valid_loss, valid_acc*100),
    )
    print("\n".join(report))

Epoch: 01 | Epoch Time: 0m 48s
	Train Loss: 0.653 | Train Acc: 60.56%
	 Val. Loss: 0.614 |  Val. Acc: 65.39%
Epoch: 02 | Epoch Time: 0m 50s
	Train Loss: 0.575 | Train Acc: 69.20%
	 Val. Loss: 0.368 |  Val. Acc: 84.14%
Epoch: 03 | Epoch Time: 0m 50s
	Train Loss: 0.308 | Train Acc: 87.23%
	 Val. Loss: 0.279 |  Val. Acc: 88.59%
Epoch: 04 | Epoch Time: 0m 50s
	Train Loss: 0.230 | Train Acc: 90.88%
	 Val. Loss: 0.270 |  Val. Acc: 89.58%
Epoch: 05 | Epoch Time: 0m 51s
	Train Loss: 0.179 | Train Acc: 93.09%
	 Val. Loss: 0.281 |  Val. Acc: 88.90%


In [28]:
# Final check of model 2 on the held-out test split, which was not used
# during training or validation.
test_loss, test_acc = evaluate(model_v2, test_iterator, criterion)

print("Test Loss: {:.3f} |  Test Acc: {:.2f}%".format(test_loss, test_acc*100))

Test Loss: 0.312 |  Test Acc: 87.83%
