<a href="https://colab.research.google.com/github/ccarpenterg/introNLP/blob/master/04a_NLP_and_sequence_to_sequence_RNNs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP and Sequence-to-Sequence RNNs

In [0]:
!wget https://www.manythings.org/anki/spa-eng.zip
!unzip spa-eng.zip

In [0]:
import torch

import torchtext
from torchtext.datasets import TranslationDataset
from torchtext import data

import spacy

from io import open

import unicodedata
import string
import re
import random
import os

print("Spacy version:", spacy.__version__)

Spacy version: 2.1.9


## Preprocessing the Natural Language Data

### Text Normalization functions

In [0]:
def unicodeToAscii(s):
    return ''.join(
        char for char in unicodedata.normalize('NFD', s)
        if unicodedata.category(char) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

### Read Pairs function

In [0]:
def readPairs(pathToFile, slang, tlang):
    print("Reading lines...")

    f = open(pathToFile, encoding='utf-8')
    lines = f.read().strip().split('\n')

    pairs = [[normalizeString(s) for s in line.split('\t')[:2]] for line in lines]

    return pairs

### Filter function

In [0]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[0].startswith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

### Prepare Data function

In [0]:
def prepareData(pathToFile, slang, tlang):
    pairs = readPairs(pathToFile, slang, tlang)
    print("Read {} sentence pairs".format(len(pairs)))
    pairs = filterPairs(pairs)
    print("Trimmed to {} sentence pairs".format(len(pairs)))
    return pairs

### Dataset Indexes (Splits) function

In [0]:
def datasetIndexes(pairs, train=0.8, val=0.1):
    num_examples = len(pairs)
    indexes = list(range(num_examples))
    last_train_idx = round(train*num_examples)
    last_valid_idx = last_train_idx + round(val*num_examples)
    random.shuffle(indexes)
    train_idxs = indexes[:last_train_idx]
    val_idxs = indexes[last_train_idx:last_valid_idx]
    test_idxs = indexes[last_valid_idx:]
    return train_idxs, val_idxs, test_idxs


### Spanish-English Translation Dataset class

In [0]:
class SpanishEnglishDataset(TranslationDataset):
    """English to Spanish Dataset"""

    @classmethod
    def splits(cls, exts, fields, root='.data/',
               train='train', validation='val', test='test', **kwargs):
        
        if 'path' not in kwargs:
            expected_folder = os.path.join(root, cls.name)
            path = expected_folder if os.path.exists(expected_folder) else None
        else:
            path = kwargs['path']
            del kwargs['path']
        
        return super(SpanishEnglishDataset, cls).splits(
            exts, fields, path, root, train, validation, test, **kwargs
        )

## Spanish-English Dataset files (splits)

In [0]:
pairs = prepareData('spa.txt', 'eng', 'spa')
train_idxs, val_idxs, test_idxs = datasetIndexes(pairs)
print("{} {} {}".format(len(train_idxs), len(val_idxs), len(test_idxs) ))
print(random.choice(pairs))

Reading lines...
Read 123335 sentence pairs
Trimmed to 7588 sentence pairs
6070 759 759
['he s your friend .', 'el es tu amigo .']


In [0]:
with open('train.en', 'w') as slang_file, open('train.es', 'w') as tlang_file:
    for i in train_idxs:
        slang_file.write(pairs[i][0] + '\n')
        tlang_file.write(pairs[i][1] + '\n')

with open('val.en', 'w') as slang_file, open('val.es', 'w') as tlang_file:
    for i in val_idxs:
        slang_file.write(pairs[i][0] + '\n')
        tlang_file.write(pairs[i][1] + '\n')

with open('test.en', 'w') as slang_file, open('test.es', 'w') as tlang_file:
    for i in test_idxs:
        slang_file.write(pairs[i][0] + '\n')
        tlang_file.write(pairs[i][1] + '\n')

## Loading the files (train, val, test) and Creating a SpanishDatatset instance

In [0]:
!python -m spacy download en
!python -m spacy download es

In [0]:
spacy_es = spacy.load('es')
spacy_en = spacy.load('en')

def tokenize_es(text):
    """
    Tokenizes Spanish text from a string into a list of strings
    """
    return [token.text for token in spacy_es.tokenizer(text)]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [token.text for token in spacy_en.tokenizer(text)]

In [0]:
SRC = data.Field(tokenize=tokenize_en, init_token='<SOS>', eos_token='<EOS>', lower=True)
TRG = data.Field(tokenize=tokenize_es, init_token='<SOS>', eos_token='<EOS>', lower=True)

train_data, valid_data, test_data = SpanishEnglishDataset.splits(
    path='',
    exts=('.en', '.es'),
    fields=(SRC, TRG)
)

In [0]:
SRC.build_vocab(train_data, min_freq=5)
TRG.build_vocab(train_data, min_freq=5)

In [0]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

In [0]:
print(len(train_data))
print(len(valid_data))
print(len(test_data))

6070
759
759


## Vocabulary and Training Examples

In [0]:
SRC.vocab.itos[:10]

['<unk>', '<pad>', '<SOS>', '<EOS>', '.', 'i', 'm', 're', 'you', 'he']

In [0]:
batch = next(iter(train_iterator))

src_example = batch.src[:, 0]
src = ' '.join(map(lambda i: SRC.vocab.itos[i], src_example))
trg_example = batch.trg[:, 0]
trg = ' '.join(map(lambda i: TRG.vocab.itos[i], trg_example))

print("English sentence:", src)
print("Spanish sentence:", trg)


English sentence: <SOS> i m not good at lying . <EOS> <pad> <pad>
Spanish sentence: <SOS> no soy bueno mintiendo . <EOS> <pad> <pad> <pad> <pad>


In [0]:
SRC.vocab.freqs.most_common(10)

[('.', 5981),
 ('i', 2777),
 ('m', 2166),
 ('re', 1449),
 ('you', 1389),
 ('he', 1087),
 ('is', 975),
 ('to', 937),
 ('a', 910),
 ('s', 666)]

In [0]:
TRG.vocab.freqs.most_common(10)

[('.', 5966),
 ('estoy', 1283),
 ('el', 1080),
 ('de', 961),
 ('es', 899),
 ('no', 793),
 ('a', 678),
 ('un', 560),
 ('esta', 547),
 ('soy', 525)]

## Sequence-to-Sequence (Seq2Seq) Model (without Attention)

### Encoder

In [0]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim,
                 hidden_dim, num_layers, dropout):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim, hidden_dim,
                           num_layers, dropout=dropout)
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, src_sequences):

        # src_sequences: (max sequences length, batch size)

        seq_embeddings = self.dropout(self.embedding(src_sequences))

        # src_seq_embeddings: (max sequences length, batch size, embedding dimension)

        seq_hidden, (hidden, cell) = self.rnn(embedded)

        # hidden = (num layers * num directions, batch size, hidden dim)
        # cell = (num layers * num directions, batch size, hidden dim)

        return hidden, cell

### Decoder

In [0]:
class Decoder(nn.Module):
    def __init__(self, embedding_dim, hidden_dim,
                 output_dim, num_layers, dropout):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_di = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(output_dim, embedding_dim)

        self.rnn = nn.LSTM(embedding_dim, hidden_num,
                           num_layers, dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, seq_at_t, hidden, cell):

        # seq_at_t: (batch size) -> vector of sequences at time t

        seq_at_t = seq_at_t.unsqueeze(0)

        # seq_at_t: (1, batch size)

        seq_at_t_embedded = self.dropout(self.embedding(seq_at_t))

        # seq_at_t_embedded: (1, batch size, embedding dim)
        # hidden = (num layers * num directions, batch size, hidden dim)
        # cell = (num layers * num directions, batch size, hidden dim)

        output, (hidden, cell) = self.rnn(seq_at_t_embedded, (hidden, cell))

        # num directions = 1
        # output: (1, batch size, hidden dim)

        pred_scores = self.fc(output.squeeze(0))

        # pred_scores: (batch size, output dim)
        # hidden = (num layers , batch size, hidden dim)
        # cell = (num layers , batch size, hidden dim)

        return pred_scores, hidden, cell

### Seq2Seq Model

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):

        # src: (max src length, batch size)
        # trg: (max trg length, batch size)

        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # outputs: tensor to store the decoder output
        # outputs: (target max length, batch size, target vocab size)
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        # we'll plug the last hidden states of the encoder in the decoder
        hidden, cell = self.encoder(src)

        # the first target sequence token: <sos>
        seq_at_t = trg[0, :]

        for t in range(1, trg_len):

            pred_scores, hidden, cell = self.decoder(seq_at_t, hidden, cell)

            outputs[t] = pred_scores

            # teacher_force: Boolean
            teacher_force = random.random() < teacher_forcing_ratio

            # argmax_token: Token
            argmax_token = output.argmax(1)

            # depending on teacher_forcing value, use the next token or argmax_token
            seq_at_t = trg[t] if teacher_force else argmax_token

        return outputs