<a href="https://colab.research.google.com/github/ccarpenterg/introNLP/blob/master/04a_NLP_and_sequence_to_sequence_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP and Sequence-to-Sequence RNNs

In [0]:
!wget https://www.manythings.org/anki/spa-eng.zip
!unzip spa-eng.zip

--2020-02-20 00:26:47--  https://www.manythings.org/anki/spa-eng.zip
Resolving www.manythings.org (www.manythings.org)... 104.24.108.196, 104.24.109.196, 2606:4700:3033::6818:6dc4, ...
Connecting to www.manythings.org (www.manythings.org)|104.24.108.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4767708 (4.5M) [application/zip]
Saving to: ‘spa-eng.zip’


2020-02-20 00:26:52 (18.8 MB/s) - ‘spa-eng.zip’ saved [4767708/4767708]

Archive:  spa-eng.zip
  inflating: _about.txt              
  inflating: spa.txt                 


In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.datasets import TranslationDataset
from torchtext import data

import spacy

from io import open

import unicodedata
import string
import re
import random
import math
import os

print("Spacy version:", spacy.__version__)

Spacy version: 2.1.9


## Preprocessing the Natural Language Data

### Text Normalization functions

In [0]:
def unicodeToAscii(s):
    """Return ``s`` with its combining marks (accents, tildes, ...) removed.

    The string is decomposed with NFD normalization and every code point of
    Unicode category ``Mn`` (nonspacing mark) is dropped. Characters that have
    no decomposition are returned unchanged, so the result is not guaranteed
    to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for code_point in decomposed:
        if unicodedata.category(code_point) == 'Mn':
            continue
        kept.append(code_point)
    return ''.join(kept)

def normalizeString(s):
    """Lowercase, de-accent and simplify a sentence for tokenization.

    Steps, in order:
      1. lowercase, trim, and strip combining marks via ``unicodeToAscii``;
      2. put a space in front of every ``.``, ``!`` or ``?`` so punctuation
         becomes its own space-separated token;
      3. collapse every run of characters other than ASCII letters and
         ``.!?`` into a single space.

    The result is stripped again at the end: step 3 turns a leading
    ``¿``/``¡`` (or any trailing digit/symbol) into a space, which would
    otherwise survive as leading/trailing whitespace in the written files.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

### Read Pairs function

In [0]:
def readPairs(pathToFile, slang, tlang):
    """Read a tab-separated sentence file and return normalized pairs.

    Args:
        pathToFile: path of a UTF-8 text file with one example per line,
            fields separated by tabs.
        slang, tlang: language labels; accepted for the caller's convenience
            but not used by this function.

    Returns:
        A list with one entry per line, each a list holding the first two
        tab-separated fields after ``normalizeString`` (any further fields
        on the line are ignored).
    """
    print("Reading lines...")

    # Context manager so the handle is closed even if reading fails.
    with open(pathToFile, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    pairs = [
        [normalizeString(field) for field in line.split('\t')[:2]]
        for line in lines
    ]

    return pairs

### Filter function

In [0]:
# A sentence is kept only if it has fewer than this many space-separated tokens.
MAX_LENGTH = 10

# Openings a first-language sentence must have to be kept. The contracted
# variants ("i m ", "he s ", ...) are written with spaces instead of
# apostrophes, with a trailing space.
eng_prefixes = (
    "i am",
    "i m ",
    "he is",
    "he s ",
    "she is",
    "she s ",
    "you are",
    "you re ",
    "we are",
    "we re ",
    "they are",
    "they re ",
    "would",
    "what",
    "when",
    "how",
)

def filterPair(p):
    """Tell whether the pair ``p`` passes the length and prefix filter.

    ``p[0]`` and ``p[1]`` must each split on single spaces into fewer than
    ``MAX_LENGTH`` pieces, and ``p[0]`` must start with one of
    ``eng_prefixes``.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    if len(p[1].split(' ')) >= MAX_LENGTH:
        return False
    return p[0].startswith(eng_prefixes)

def filterPairs(pairs):
    """Return a new list with only the pairs that ``filterPair`` accepts."""
    return list(filter(filterPair, pairs))

### Prepare Data function

In [0]:
def prepareData(pathToFile, slang, tlang, filter_pairs=False):
    """Read and normalize the sentence pairs, optionally filtering them.

    Args:
        pathToFile: path handed to ``readPairs``.
        slang, tlang: language labels, forwarded to ``readPairs``.
        filter_pairs: when true, keep only the pairs accepted by
            ``filterPairs``. Defaults to False, i.e. the full corpus is
            returned, which is what this notebook trains on.

    Returns:
        The list of normalized sentence pairs.
    """
    pairs = readPairs(pathToFile, slang, tlang)
    print("Read {} sentence pairs".format(len(pairs)))
    if filter_pairs:
        pairs = filterPairs(pairs)
    # Printed in both cases; without filtering the count equals the one above.
    print("Trimmed to {} sentence pairs".format(len(pairs)))
    return pairs

### Dataset Indexes (Splits) function

In [0]:
def datasetIndexes(pairs, train=0.8, val=0.1, seed=None):
    """Randomly partition the positions of ``pairs`` into three index lists.

    Args:
        pairs: any sized collection; only ``len(pairs)`` is used.
        train: fraction of the examples assigned to the training split.
        val: fraction assigned to the validation split; whatever remains
            goes to the test split.
        seed: optional seed for a private random generator, making the split
            reproducible without touching the global ``random`` state. With
            the default ``None`` the global generator is used.

    Returns:
        ``(train_idxs, val_idxs, test_idxs)``, three disjoint lists that
        together contain every index in ``range(len(pairs))``.
    """
    num_examples = len(pairs)
    indexes = list(range(num_examples))
    last_train_idx = round(train * num_examples)
    last_valid_idx = last_train_idx + round(val * num_examples)

    if seed is None:
        random.shuffle(indexes)
    else:
        random.Random(seed).shuffle(indexes)

    train_idxs = indexes[:last_train_idx]
    val_idxs = indexes[last_train_idx:last_valid_idx]
    test_idxs = indexes[last_valid_idx:]
    return train_idxs, val_idxs, test_idxs


### Spanish-English Translation Dataset class

In [0]:
class SpanishEnglishDataset(TranslationDataset):
    """English to Spanish Dataset"""

    # Folder name looked up under ``root`` when no explicit ``path`` is given.
    # The original class never defined it, so omitting ``path`` failed with
    # an AttributeError on ``cls.name``.
    name = 'spa-eng'

    @classmethod
    def splits(cls, exts, fields, root='.data/',
               train='train', validation='val', test='test', **kwargs):
        """Build the train/validation/test datasets from local files.

        Args:
            exts: pair of file extensions, one per language.
            fields: pair of fields, one per language.
            root: directory under which ``<root>/<cls.name>`` is searched
                when ``path`` is not supplied.
            train, validation, test: file-name prefixes of the three splits.
            **kwargs: forwarded to ``TranslationDataset.splits``; an optional
                ``path`` entry names the directory holding the split files.

        Raises:
            FileNotFoundError: if ``path`` is not given and
                ``<root>/<cls.name>`` does not exist. This class declares no
                download URLs, so the parent class could not fetch the data
                (NOTE(review): assumes torchtext's download-on-missing-path
                behaviour; confirm against the installed version).
        """
        path = kwargs.pop('path', None)

        if path is None:
            expected_folder = os.path.join(root, cls.name)
            if not os.path.exists(expected_folder):
                raise FileNotFoundError(
                    "No 'path' was given and the default folder {!r} does "
                    "not exist".format(expected_folder)
                )
            path = expected_folder

        return super(SpanishEnglishDataset, cls).splits(
            exts, fields, path, root, train, validation, test, **kwargs
        )

## Spanish-English Dataset files (splits)

In [0]:
# Load and normalize every sentence pair, then draw the random
# train/validation/test index split and show one example pair.
pairs = prepareData(pathToFile='spa.txt', slang='eng', tlang='spa')
train_idxs, val_idxs, test_idxs = datasetIndexes(pairs)
print(len(train_idxs), len(val_idxs), len(test_idxs))
print(random.choice(pairs))

Reading lines...
Read 123335 sentence pairs
Trimmed to 123335 sentence pairs
98668 12334 12333
['the management finally succumbed to the demand of the workers and gave them a raise .', 'la gestion finalmente sucumbio a la demanda de los trabajadores y les dio un aumento .']


In [0]:
# Write each split as two line-aligned files: <split>.en holds the first
# sentence of every pair and <split>.es the second, one example per line.
for split_name, split_idxs in (('train', train_idxs),
                               ('val', val_idxs),
                               ('test', test_idxs)):
    with open(split_name + '.en', 'w') as slang_file, \
            open(split_name + '.es', 'w') as tlang_file:
        for i in split_idxs:
            slang_file.write(pairs[i][0] + '\n')
            tlang_file.write(pairs[i][1] + '\n')

## Loading the files (train, val, test) and Creating a SpanishEnglishDataset instance

In [0]:
# Install the spaCy English and Spanish models. The `en` / `es` shortcut
# names are the ones passed to `spacy.load` further down.
!python -m spacy download en
!python -m spacy download es

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.6/dist-packages/es_core_news_sm -->
/usr/local/lib/python3.6/dist-packages/spacy/data/es
You can now load the model via spacy.load('es')


In [0]:
# Load the Spanish and English spaCy pipelines; the tokenize_* helpers below
# use only their `.tokenizer`.
spacy_es = spacy.load('es')
spacy_en = spacy.load('en')

def tokenize_es(text):
    """
    Split a Spanish string into a list of token strings with the spaCy
    Spanish tokenizer
    """
    tokens = []
    for tok in spacy_es.tokenizer(text):
        tokens.append(tok.text)
    return tokens

def tokenize_en(text):
    """
    Split an English string into a list of token strings with the spaCy
    English tokenizer
    """
    tokens = []
    for tok in spacy_en.tokenizer(text):
        tokens.append(tok.text)
    return tokens

In [0]:
# Source (English) and target (Spanish) fields: each uses its spaCy tokenizer,
# lowercases the text and wraps every sentence in <SOS> ... <EOS>.
SRC = data.Field(tokenize=tokenize_en, init_token='<SOS>', eos_token='<EOS>', lower=True)
TRG = data.Field(tokenize=tokenize_es, init_token='<SOS>', eos_token='<EOS>', lower=True)

# Load the train/val/test files written earlier. path='' together with the
# default prefixes and these extensions presumably resolves to train.en,
# train.es, ... in the working directory (verify against torchtext).
train_data, valid_data, test_data = SpanishEnglishDataset.splits(
    path='',
    exts=('.en', '.es'),
    fields=(SRC, TRG)
)

In [0]:
# Vocabularies are built from the training split only. Per torchtext's
# `min_freq`, tokens seen fewer than 5 times are left out of the vocabulary.
SRC.build_vocab(train_data, min_freq=5)
TRG.build_vocab(train_data, min_freq=5)

In [0]:
# Training configuration.
BATCH_SIZE = 128  # examples per batch for all three iterators
N_EPOCHS = 10     # passes over the training set
CLIP = 1          # max gradient norm passed to clip_grad_norm_ in train()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; batches are created on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

In [0]:
# Number of examples in the training, validation and test splits.
for split in (train_data, valid_data, test_data):
    print(len(split))

98668
12334
12333


## Vocabulary and Training Examples

In [0]:
# First ten entries of the source index-to-token table.
SRC.vocab.itos[:10]

['<unk>', '<pad>', '<SOS>', '<EOS>', '.', 'i', 'the', 'to', 'you', 'tom']

In [0]:
# Take one training batch and turn the first column (one sentence pair)
# back into tokens through the index-to-token tables.
batch = next(iter(train_iterator))

src_example = batch.src[:, 0]
trg_example = batch.trg[:, 0]

src = ' '.join(SRC.vocab.itos[i] for i in src_example)
trg = ' '.join(TRG.vocab.itos[i] for i in trg_example)

print("English sentence:", src)
print("Spanish sentence:", trg)


English sentence: <SOS> you have until midnight . <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
Spanish sentence: <SOS> tenes hasta medianoche . <EOS> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [0]:
# Ten most frequent source-side tokens with their counts.
print(SRC.vocab.freqs.most_common(10))

[('.', 85805), ('i', 28905), ('the', 23337), ('to', 22119), ('you', 20269), ('tom', 16949), ('a', 14944), ('?', 13157), ('t', 12197), ('is', 12022)]


In [0]:
# Ten most frequent target-side tokens with their counts.
print(TRG.vocab.freqs.most_common(10))

[('.', 85461), ('que', 21836), ('de', 18965), ('el', 18433), ('no', 17094), ('a', 16918), ('tom', 16127), ('la', 14933), ('?', 13174), ('es', 10342)]


## Sequence-to-Sequence (Seq2Seq) Model (without Attention)

### Encoder

In [0]:
class Encoder(nn.Module):
    """Embeds a batch of source sequences and runs them through an LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, num_layers, dropout):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)

        self.rnn = nn.LSTM(input_size=embedding_dim,
                           hidden_size=hidden_dim,
                           num_layers=num_layers,
                           dropout=dropout)

        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src_sequences):
        """Encode ``src_sequences`` and return ``(hidden, cell)``."""

        # src_sequences: (max sequences length, batch size)
        embedded = self.embedding(src_sequences)
        embedded = self.dropout(embedded)

        # embedded: (max sequences length, batch size, embedding dimension)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state

        # hidden: (num layers * num directions, batch size, hidden dim)
        # cell:   (num layers * num directions, batch size, hidden dim)
        return hidden, cell

### Decoder

The Decoder is also built using an RNN and it represents a Language Model. As we saw in the first notebook, language modeling is the task of predicting the next word given a sequence of previous words:


$$ \Large P(y^{(t+1)}|y^{(1)}, ...,y^{(t)}) $$

In this case, our Decoder represents a Conditional Language Model, i.e. a language model conditioned on the source sentence $x$:

$$ \Large P(y|x) = P(y_1|x)P(y_2|y_1,x)P(y_3|y_1,y_2,x)...P(y_T|y_1,...,y_{T-1},x) $$

And we want to find a translation $y$ that maximizes

$$ \Large P(y|x) =  \displaystyle \prod_{t=1}^{T} P(y_t|y_1,...,y_{t-1}, x) $$


#### Negative log likelihood

During training, at each step $t$ the model minimizes the negative log-likelihood (cross-entropy loss) of the ground-truth next word, where $s_{t+1}$ denotes the index of that word in the vocabulary:

$$ \large L_{CE} (\hat{y}^{(t)},y^{(t)}) = -\log \hat{y}_{s_{t+1}}^{(t)}$$

In [0]:
class Decoder(nn.Module):
    """Single-step LSTM decoder that scores the next target token.

    Each call consumes one token per batch element together with the previous
    hidden and cell states, and returns unnormalized scores over the target
    vocabulary plus the updated states.
    """

    def __init__(self, output_dim, embedding_dim,
                 hidden_dim, num_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        # Was misspelled ``hidden_di``, leaving ``hidden_dim`` undefined on
        # the decoder (the Encoder exposes ``hidden_dim``).
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_dim, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim, hidden_dim,
                           num_layers, dropout=dropout)

        self.fc = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, seq_at_t, hidden, cell):
        """Run one decoding step.

        Args:
            seq_at_t: (batch size) token indices fed at this time step.
            hidden, cell: LSTM states from the previous step (or the encoder).

        Returns:
            ``(pred_scores, hidden, cell)`` where ``pred_scores`` has shape
            (batch size, output dim).
        """

        # seq_at_t: (batch size) -> vector of sequences at time t

        # The LSTM expects a leading time dimension, here of length 1.
        seq_at_t = seq_at_t.unsqueeze(0)

        # seq_at_t: (1, batch size)

        seq_at_t_embedded = self.dropout(self.embedding(seq_at_t))

        # seq_at_t_embedded: (1, batch size, embedding dim)
        # hidden = (num layers * num directions, batch size, hidden dim)
        # cell = (num layers * num directions, batch size, hidden dim)

        output, (hidden, cell) = self.rnn(seq_at_t_embedded, (hidden, cell))

        # num directions = 1
        # output: (1, batch size, hidden dim)

        pred_scores = self.fc(output.squeeze(0))

        # pred_scores: (batch size, output dim)
        # hidden = (num layers , batch size, hidden dim)
        # cell = (num layers , batch size, hidden dim)

        return pred_scores, hidden, cell

### Seq2Seq Model

In [0]:
class Seq2Seq(nn.Module):
    """Ties an encoder and a decoder together and decodes step by step.

    The encoder's final states initialize the decoder, which is then unrolled
    over the target length. At every step the next decoder input is either
    the ground-truth token (teacher forcing) or the decoder's own highest
    scoring token.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return decoder scores of shape (trg length, batch size, vocab size).

        Row 0 of the result is left at zero: the first target token is only
        ever used as decoder input, never predicted.
        """

        # src: (max src length, batch size)
        # trg: (max trg length, batch size)
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # buffer collecting the decoder scores of every time step
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # the encoder's last states seed the decoder
        hidden, cell = self.encoder(src)

        # decoding starts from the first target row (the <sos> tokens)
        decoder_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = step_scores

            # with probability teacher_forcing_ratio feed the ground truth,
            # otherwise feed back the model's own best guess
            if random.random() < teacher_forcing_ratio:
                decoder_input = trg[t]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

### Model: 2 LSTM layers, no pre-trained Embeddings

In [0]:
# Model hyperparameters; the vocabulary sizes come from the fields built above.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab) # output_dim is both the output dim and target vocab size
ENCODER_EMB_DIM = 256 # length of the encoder's embedding vectors
DECODER_EMB_DIM = 256 # length of the decoder's embedding vectors
HIDDEN_DIM = 512 # RNN's hidden units
NUM_LAYERS = 2 # number of stacked LSTM layers
ENCODER_DROPOUT = 0.5 # encoder's dropout probability
DECODER_DROPOUT = 0.5 # decoder's dropout probability

# Encoder and decoder share HIDDEN_DIM and NUM_LAYERS because the encoder's
# final states are passed directly to the decoder's LSTM.
encoder = Encoder(INPUT_DIM, ENCODER_EMB_DIM, HIDDEN_DIM, NUM_LAYERS, ENCODER_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DECODER_EMB_DIM, HIDDEN_DIM, NUM_LAYERS, DECODER_DROPOUT)

model = Seq2Seq(encoder, decoder, device)

In [0]:
def count_parameters(model):
    """Return how many scalar values ``model`` holds in parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count with thousands separators.
print("The model has {:,} trainable parameters".format(count_parameters(model)))

The model has 13,780,836 trainable parameters


In [0]:
def init_weights(model):
    """Overwrite every parameter of ``model`` in place with draws from U(-0.08, 0.08)."""
    low, high = -0.08, 0.08
    for param in model.parameters():
        nn.init.uniform_(param.data, low, high)

# Module.apply calls init_weights on the model and on each of its submodules.
# init_weights itself walks all nested parameters, so inner parameters are
# re-drawn more than once; the resulting distribution is unaffected.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4801, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6756, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=6756, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [0]:
# Show the module tree of the assembled model.
print(model)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4801, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(6756, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=512, out_features=6756, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


### Loss Function and Optimizer

In [0]:
# Adam with its default hyperparameters over all model parameters.
optimizer = optim.Adam(model.parameters())

# Index of the padding token in the target vocabulary.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Positions whose target is padding are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

### Train function

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: callable as ``model(src, trg)`` returning scores of shape
            (max trg length, batch size, output dim); must expose
            ``model.decoder.output_dim``.
        iterator: sized iterable of batches with ``.src`` and ``.trg``
            tensors of shape (length, batch size).
        optimizer: optimizer over the model's parameters.
        criterion: loss taking (scores, targets).
        clip: maximum gradient norm applied before each optimizer step.

    Returns:
        The mean of the per-batch losses.
    """

    model.train()

    epoch_loss = 0

    for batch in iterator:

        src = batch.src
        # Fix: the target must come from batch.trg. The original read
        # batch.src here, so the model was trained to copy the source.
        trg = batch.trg

        optimizer.zero_grad()

        pred_scores_tensor = model(src, trg)

        # trg: (max trg length, batch size)
        # pred_scores_tensor: (max trg length, batch size, prob. distrib. size)

        output_dim = model.decoder.output_dim

        # <sos> is the first input and doesn't have ground truth (it's not
        # predicted), so row 0 is dropped from both scores and targets
        pred_scores_matrix = pred_scores_tensor[1:].view(-1, output_dim)
        trg_flat = trg[1:].view(-1)

        # trg_flat: ( [max trg length - 1] * batch size)
        # pred_scores_matrix: ( [max trg length - 1] * batch size, prob.dist. size)

        loss = criterion(pred_scores_matrix, trg_flat)

        loss.backward()

        # cap the gradient norm before updating the weights
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

### Evaluation function

In [0]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode, gradients are disabled, and the model is
    called with ``0`` as its third argument (no teacher forcing).
    """

    model.eval()

    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # scores: (max trg length, batch size, prob. distrib. size)
            scores = model(source, target, 0)

            vocab_size = model.decoder.output_dim

            # drop row 0 (<sos> is never predicted) and flatten time x batch
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            # flat_target: ( [max trg length - 1] * batch size)
            # flat_scores: ( [max trg length - 1] * batch size, prob.dist. size)
            total_loss += criterion(flat_scores, flat_target).item()

    return total_loss / len(iterator)

### Epoch Timer

In [0]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (seconds) into whole minutes and seconds.

    The elapsed time is truncated to an integer number of seconds first.
    """
    total_seconds = int(end_time - start_time)
    return total_seconds // 60, total_seconds % 60

### Training-Evaluation function

In [0]:
def train_evaluate(model, optimizer, criterion, n_epochs, clip):
    """Train for ``n_epochs`` epochs, validating after each one.

    Uses the notebook-level ``device``, ``train_iterator`` and
    ``valid_iterator``. After every epoch the elapsed time, loss and
    perplexity (``exp(loss)``) are printed for both splits.

    Args:
        model: the model to train; it is moved to ``device`` first.
        optimizer, criterion, clip: forwarded to ``train`` / ``evaluate``.
        n_epochs: number of passes over the training iterator.

    Returns:
        A dict with the per-epoch lists ``train_loss``, ``train_pplx``,
        ``valid_loss`` and ``valid_pplx``.
    """

    metrics = {
        "train_loss": [],
        "train_pplx": [],
        "valid_loss": [],
        "valid_pplx": [],
    }

    model = model.to(device)

    for epoch in range(n_epochs):

        start_time = time.time()

        train_loss = train(model, train_iterator, optimizer, criterion, clip)
        valid_loss = evaluate(model, valid_iterator, criterion)

        train_pplx = math.exp(train_loss)
        valid_pplx = math.exp(valid_loss)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # The second column is perplexity; it was previously mislabeled "Acc".
        print("Epoch: {:02} | Epoch Time: {}m {}s".format(epoch+1, epoch_mins, epoch_secs))
        print("\tTrain Loss: {:.3f} | Train PPL: {:7.3f}".format(train_loss, train_pplx))
        print("\t Val. Loss: {:.3f} |  Val. PPL: {:7.3f}".format(valid_loss, valid_pplx))

        metrics["train_loss"].append(train_loss)
        metrics["train_pplx"].append(train_pplx)
        metrics["valid_loss"].append(valid_loss)
        metrics["valid_pplx"].append(valid_pplx)

    return metrics

### Plot Metrics function

In [0]:
import matplotlib.pyplot as plt

def plot_metrics(metrics):
    """Plot training/validation loss and perplexity side by side.

    Args:
        metrics: dict with the equally long lists ``train_loss``,
            ``valid_loss``, ``train_pplx`` and ``valid_pplx``, one value per
            epoch (the structure returned by ``train_evaluate``).
    """

    train_loss = metrics['train_loss']
    valid_loss = metrics['valid_loss']
    train_pplx = metrics['train_pplx']
    valid_pplx = metrics['valid_pplx']

    # Take the x axis from the recorded data rather than the global N_EPOCHS,
    # so metrics from a run with a different epoch count still plot.
    epochs = range(1, len(train_loss) + 1)

    plt.figure(figsize=(14, 5))

    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_loss, 'b', label='Training loss')
    plt.plot(epochs, valid_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, train_pplx, 'b', label='Training perplexity')
    plt.plot(epochs, valid_pplx, 'r', label='Validation perplexity')
    plt.title('Training and validation perplexity')
    plt.xlabel('Epochs')
    plt.ylabel('Perplexity')
    plt.legend()

    plt.show()

### Training a Seq2Seq Model from Scratch

In [0]:
# Run the full training loop; `results` holds the per-epoch loss and
# perplexity lists in the format plot_metrics expects.
results = train_evaluate(model, optimizer, criterion, N_EPOCHS, CLIP)

In [0]:
# Final evaluation on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

test_pplx = math.exp(test_loss)

# These are test-set loss and perplexity; the original label said
# "Train ... Acc ...%" although neither value is an accuracy or a percentage.
print("Test Loss: {:.3f} | Test PPL: {:7.3f}".format(test_loss, test_pplx))