We want to train a neural network that can reverse a sentence. 
We will first use an encoder/decoder without attention and then try with attention. 

Mostly trying to re-implement https://towardsdatascience.com/attention-seq2seq-with-pytorch-learning-to-invert-a-sequence-34faf4133e53

When trying to understand Attention and Transformers, I found the following posts to be useful:
 - https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/
 - https://towardsdatascience.com/attn-illustrated-attention-5ec4ad276ee3
 - https://jalammar.github.io/illustrated-transformer/

In [132]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import random
from collections import defaultdict

class ToyDataset(Dataset):
    """
    Synthetic dataset of variable-length sentences paired with their reverses.

    Every sample is a ``(sentence, reverse)`` tuple of float one-hot tensors,
    each of shape ``(max_len + 2, vocab_size)``. Both sequences are wrapped in
    ``<sos>`` / ``<eos>`` and right-padded with ``<pad>`` to a common length.

    Args:
        words: vocabulary to sample from. When omitted or empty, the six
            characters ``'a'``..``'f'`` are used.
        min_len: minimum number of words in a sentence (inclusive).
        max_len: maximum number of words in a sentence (inclusive).
        type: ``'train'`` generates 3000 samples; any other value generates 300.
            (The name shadows the builtin but is kept for keyword callers.)
    """
    def __init__(self, words=None, min_len=5, max_len=10, type='train'):
        self.min_len = min_len
        self.max_len = max_len
        # Fall back to simple characters when no words are passed. Previously
        # self.words was only set in that fallback case, so supplying a custom
        # vocabulary crashed with AttributeError. The list is copied so later
        # changes to the caller's sequence cannot alter the vocabulary.
        self.words = list(words) if words else list('abcdef')

        # special tokens: end of sentence, start of sentence, padding
        self.eos = '<eos>'
        self.sos = '<sos>'
        self.pad = '<pad>'

        # complete vocab; <pad> is index 0, <sos> is 1, <eos> is 2
        self.vocab = [self.pad, self.sos, self.eos] + self.words
        self.vocab_size = len(self.vocab)

        # word <-> index lookup tables
        self.w2i = {word: idx for idx, word in enumerate(self.vocab)}
        self.i2w = dict(enumerate(self.vocab))

        # all samples are generated eagerly, once, at construction time
        n_samples = 3000 if type == 'train' else 300
        self.set = [self._sample() for _ in range(n_samples)]

    def _sample(self):
        """Return one random ``(sentence, reverse)`` pair of one-hot float tensors."""
        size = random.randint(self.min_len, self.max_len)

        # self.words holds only regular words, so no special token is ever sampled
        sentence = random.choices(self.words, k=size)
        reverse = sentence[::-1]

        # add the sos and eos
        sentence = [self.sos] + sentence + [self.eos]
        reverse = [self.sos] + reverse + [self.eos]
        # example sentence: ['<sos>', 'd', 'f', 'b', 'd', 'd', 'd', '<eos>']
        # example reverse:  ['<sos>', 'd', 'd', 'd', 'b', 'f', 'd', '<eos>']

        # padding. Add 2 as we added sos and eos
        padded_len = self.max_len + 2
        sentence += [self.pad] * (padded_len - len(sentence))
        reverse += [self.pad] * (padded_len - len(reverse))
        # example sentence: ['<sos>', 'd', 'f', 'b', 'd', 'd', 'd', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
        # example reverse:  ['<sos>', 'd', 'd', 'd', 'b', 'f', 'd', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']

        # convert the words to integers
        sentence_tensor = torch.LongTensor([self.w2i[word] for word in sentence])
        reverse_tensor = torch.LongTensor([self.w2i[word] for word in reverse])
        # example sentence: tensor([1, 6, 8, 4, 6, 6, 6, 2, 0, 0, 0, 0])
        # example reverse: tensor([1, 6, 6, 6, 4, 8, 6, 2, 0, 0, 0, 0])

        # return one hot encoded tensors (float, so they can feed an LSTM directly)
        return (
            F.one_hot(sentence_tensor, self.vocab_size).float(),
            F.one_hot(reverse_tensor, self.vocab_size).float(),
        )

    def __len__(self):
        return len(self.set)

    def __getitem__(self, item):
        return self.set[item]

Let's look at a sample

In [133]:
ds = ToyDataset()
# randrange excludes len(ds); randint(0, len(ds)) is inclusive on both ends and
# could pick len(ds) itself, which raises IndexError.
idx = random.randrange(len(ds))
ds[idx][0].size()

# here sentence length is 12 (max_len + sos + eos) and vocab size is 9 (3 special tokens + 6 characters)

torch.Size([12, 9])

In [103]:
# Batch the dataset three samples at a time and pull the first batch.
train_dl = DataLoader(ds, batch_size=3)
# batch is the collated (sentence, reverse) pair; batch[0] holds the input sentences
batch = next(iter(train_dl))
# the batch dimension is prepended to the per-sample size
batch[0].size()

torch.Size([3, 12, 9])

### Encoder

In [104]:
"""
Stacked Linear Layers with Relu
"""
class LinearRelu(nn.Module):
    """
    A stack of fully connected layers, each followed by a ReLU.

    Args:
        input_size: number of features of the input.
        hidden_sizes: output width of each successive linear layer; the last
            entry is the feature size of the module's output.
    """
    def __init__(self, input_size, hidden_sizes):
        super().__init__()
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.relu = nn.ReLU()

        # Each layer's input width is the previous layer's output width;
        # the first layer consumes the raw input.
        in_widths = [input_size, *hidden_sizes[:-1]]
        self.layers = nn.ModuleList()
        for n_in, n_out in zip(in_widths, hidden_sizes):
            self.layers.append(nn.Linear(n_in, n_out))

    def forward(self, x):
        """Apply every linear layer in order, with a ReLU after each one."""
        for linear in self.layers:
            x = linear(x)
            x = self.relu(x)
        return x
    
# Smoke test: push a random batch through the stack and confirm that the
# feature dimension goes from input_size to the last hidden size.
input_size, bs = 15, 50
hidden_sizes = [12, 8, 5]
x = torch.rand((bs, input_size))
linrel = LinearRelu(input_size, hidden_sizes)
out = linrel(x)
assert x.shape[1] == input_size
assert out.shape[1] == hidden_sizes[-1]
print(f'input has size {x.size()} and output has size {out.size()}')

input has size torch.Size([50, 15]) and output has size torch.Size([50, 5])


In [134]:
"""
We will use a bidirectional LSTM to encode the input
"""
class Encoder(nn.Module):
    """
    Bidirectional, batch-first LSTM encoder.

    Args:
        input_size: feature size of each input time step.
        lstm_size: hidden size of the LSTM, per direction.
    """
    def __init__(self, input_size, lstm_size):
        super().__init__()
        # An optional LinearRelu front-end was sketched here but is disabled.
        self.lstm = nn.LSTM(
            input_size,
            lstm_size,
            bidirectional=True,
            batch_first=True,
        )

    def forward(self, x):
        """Run the LSTM over ``x`` and return ``(outputs, hidden, cell)``."""
        outputs, state = self.lstm(x)
        hidden, cell = state
        return outputs, hidden, cell
# class Decoder(nn.Module):
    

In [119]:
# Encode one batch and inspect the shapes the bidirectional LSTM returns.
batch = next(iter(train_dl))
lstm_size = 5
# pass the variable rather than repeating the literal, so the two cannot drift apart
encoder = Encoder(ds.vocab_size, lstm_size=lstm_size)
out, h, c = encoder(batch[0])
print(f'input.size: {batch[0].size()}, h.size: {h.size()}, out.size: {out.size()}, cell.size: {c.size()}')

input.size: torch.Size([3, 12, 9]), h.size: torch.Size([2, 3, 5]), out.size: torch.Size([3, 12, 10]), cell.size: torch.Size([2, 3, 5])


In [81]:
# NOTE(review): the int64 recorded below comes from an earlier execution
# (cell In [81]); ToyDataset._sample now returns .float() one-hot tensors, so
# this presumably reports torch.float32 when re-run — confirm by re-executing.
batch[0].dtype

torch.int64