<a href="https://colab.research.google.com/github/ccarpenterg/introNLP/blob/master/03b_NLP_and_recurrent_neural_networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pre-trained Word Embeddings and RNNs

In [0]:
import torch
from torchtext import data

SEED = 3773

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

In [0]:
from torchtext import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [3]:
print("Number of training examples: {}".format(len(train_data)))
print("Number of testing examples: {}".format(len(test_data)))

Number of training examples: 25000
Number of testing examples: 25000


In [0]:
print(vars(train_data.examples[0]))

In [5]:
import random

train_data, valid_data = train_data.split(
    random_state=random.seed(SEED),
    split_ratio=0.8
)

print("Number of training examples: {}".format(len(train_data)))
print("Number of validation examples: {}".format(len(valid_data)))
print("Number of testing examples: {}".format(len(test_data)))

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [0]:
MAX_VOCAB_SIZE = 8185

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [7]:
print("Unique tokens in TEXT vocabulary: {}".format(len(TEXT.vocab)))
print("Unique tokens in LABEL vocabulary: {}".format(len(LABEL.vocab)))

Unique tokens in TEXT vocabulary: 8187
Unique tokens in LABEL vocabulary: 2


In [8]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 232678), (',', 220840), ('.', 188920), ('and', 125362), ('a', 125266), ('of', 115884), ('to', 107654), ('is', 87196), ('in', 70206), ('I', 62349), ('it', 61298), ('that', 56438), ('"', 50419), ("'s", 49667), ('this', 48419), ('-', 41945), ('/><br', 41022), ('was', 40196), ('as', 35006), ('with', 34063)]


In [9]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']


In [10]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fd2d4d5e378>, {'pos': 0, 'neg': 1})


In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, output_dim,
                 n_layers, bidirectional):
        
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, sequences):

        # sequences: (max sequences length, batch size)
        seq_embeddings = self.embedding(sequences)

        # seq_embeddings: (max sequences length, batch size, embedding dim)
        seq_hidden, (hidden, cell) = self.rnn(seq_embeddings)

        # hidden: (num_layers * num_directions, batch size, hidden dim)
        hidden_concat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)

        # hidden_concat: (batch size, num_directions * hidden dim)
        output = self.fc(hidden_concat)

        # output: (batch size, 1) -> vector
        return output

In [0]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 1
BIDIRECTIONAL = True

model = RNN(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL)

In [14]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print("The model has {:,} trainable parameters".format(count_parameters(model)))

The model has 1,552,397 trainable parameters


In [0]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def accuracy(outputs, labels):
    """
    Returns accuracy per batch

    """

    # Precit y = 1 if sigmoid(output) >= 0.5 (positive review)
    # Precit y = 0 if sigmoid(output) <  0.5 (negative review)
    predictions = torch.round(torch.sigmoid(outputs))
    correct = (predictions == labels).float()
    return correct.sum() / len(correct)

In [0]:
def train(model, iterator, optimizer, cruterion):
    
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:

        optimizer.zero_grad()

        outputs = model(batch.text).squeeze(1)

        loss = criterion(outputs, batch.label)

        acc = accuracy(outputs, batch.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            outputs = model(batch.text).squeeze(1)

            loss = criterion(outputs, batch.label)

            acc = accuracy(outputs, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_time * 60))
    return elapsed_mins, elapsed_secs

In [29]:
N_EPOCHS = 5

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print("Epoch: {:02} | Epoch Time: {}m {}s".format(epoch+1, epoch_mins, epoch_secs))
    print("\tTrain Loss: {:.3f} | Train Acc: {:.2f}%".format(train_loss, train_acc*100))
    print("\t Val. Loss: {:.3f} |  Val. Acc: {:.2f}%".format(valid_loss, valid_acc*100))

Epoch: 01 | Epoch Time: 0m -2814s
	Train Loss: 0.621 | Train Acc: 65.99%
	 Val. Loss: 0.647 |  Val. Acc: 60.56%
Epoch: 02 | Epoch Time: 0m -2980s
	Train Loss: 0.585 | Train Acc: 69.34%
	 Val. Loss: 0.616 |  Val. Acc: 65.47%
Epoch: 03 | Epoch Time: 0m -2974s
	Train Loss: 0.571 | Train Acc: 70.68%
	 Val. Loss: 0.566 |  Val. Acc: 72.11%
Epoch: 04 | Epoch Time: 0m -2997s
	Train Loss: 0.368 | Train Acc: 84.13%
	 Val. Loss: 0.348 |  Val. Acc: 84.75%
Epoch: 05 | Epoch Time: 0m -2984s
	Train Loss: 0.267 | Train Acc: 89.40%
	 Val. Loss: 0.315 |  Val. Acc: 86.73%


In [30]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print("Test Loss: {:.3f} |  Test Acc: {:.2f}%".format(test_loss, test_acc*100))

Test Loss: 0.362 |  Test Acc: 84.79%


In [0]:
iterator = iter(train_iterator)
batch = next(iterator)
print(batch.text.shape)
print(batch.text[:,0].shape)

torch.Size([1103, 64])
torch.Size([1103])


In [0]:
model = model.to(device)

output = model(batch.text)

print(output.shape)

torch.Size([64, 1])
