This code was heavily influenced by the following:
5.4-EXE-seq2seq-digits : Jupyter notebook that was provided in week 5 
https://github.com/bentrevett/pytorch-seq2seq
https://github.com/bastings/annotated_encoder_decoder

In [1]:
#!pip install --upgrade torch numpy matplotlib sacrebleu

In [2]:
import copy
import math
import time
import matplotlib.pyplot as plt

In [3]:
import numpy as np
#import sacrebleu
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.core.debugger import set_trace
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [4]:
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda:0")  # or set to 'cpu'
print("CUDA:", USE_CUDA)
print(DEVICE)

CUDA: False
cuda:0


In [5]:
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

### MODEL


In [6]:
class EncoderDecoder(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, trg_embed, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.trg_embed = trg_embed
        self.generator = generator
    def forward(self, src, trg, src_mask, trg_mask, src_lengths, trg_lengths):
        """Take in and process masked src and target sequences."""
        encoder_hidden, encoder_final = self.encode(src, src_mask, src_lengths)
        return self.decode(
            encoder_hidden, encoder_final, src_mask, trg, trg_mask
        )
    def encode(self, src, src_mask, src_lengths):
        return self.encoder(self.src_embed(src), src_mask, src_lengths)
    def decode(
        self,
        encoder_hidden,
        encoder_final,
        src_mask,
        trg,
        trg_mask,
        decoder_hidden=None,
        max_len=None,
    ):
        return self.decoder(
            self.trg_embed(trg),
            encoder_hidden,
            encoder_final,
            src_mask,
            trg_mask,
            hidden=decoder_hidden,
            max_len=max_len,
        )


Projecting the pre-output layer ($x$ in the `forward` function below) to obtain the output layer, so that the final dimension is the target vocabulary size.<br>


In [7]:
class Generator(nn.Module):
    """Define standard linear + softmax generation step."""
    def __init__(self, hidden_size, vocab_size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(hidden_size, vocab_size, bias=False)
    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)


## Encoder


In [8]:
class Encoder(nn.Module):
    """Encodes a sequence of word embeddings"""
    def __init__(self, input_size, hidden_size, num_layers=1, dropout=0.0):
        super(Encoder, self).__init__()
        self.num_layers = num_layers
        self.rnn = nn.GRU(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout,
        )
    def forward(self, x, mask, lengths):
        """
        Applies a bidirectional GRU to sequence of embeddings x.
        The input mini-batch x needs to be sorted by length.
        x should have dimensions [batch, time, dim].
        """
        packed = pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        output, final = self.rnn(packed)
        output, _ = pad_packed_sequence(
            output, batch_first=True, padding_value=0
        )

        # we need to manually concatenate the final states for both directions
        fwd_final = final[0 : final.size(0) : 2]
        bwd_final = final[1 : final.size(0) : 2]
        final = torch.cat(
            [fwd_final, bwd_final], dim=2
        )  # [num_layers, batch, 2*dim]
        return output, final


### Decoder<br>


In [9]:
class Decoder(nn.Module):
    """A conditional RNN decoder with attention."""
    def __init__(
        self,
        emb_size,
        hidden_size,
        num_layers=1,
        dropout=0.5,
        bridge=True,
    ):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.rnn = nn.GRU(
            emb_size ,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
        )

        # to initialize from the final encoder state
        self.bridge = (
            nn.Linear(2 * hidden_size, hidden_size, bias=True)
            if bridge
            else None
        )
        self.dropout_layer = nn.Dropout(p=dropout)
    def forward_step(
        self, prev_embed, encoder_hidden, src_mask, hidden
    ):
        """Perform a single decoder step (1 word)"""


        # update rnn hidden state
        #print(prev_embed.shape,hidden.shape)
        output, hidden = self.rnn(prev_embed, hidden)
        pre_output = self.dropout_layer(output)
        return output, hidden, pre_output
    def forward(
        self,
        trg_embed,
        encoder_hidden,
        encoder_final,
        src_mask,
        trg_mask,
        hidden=None,
        max_len=None,
    ):
        """Unroll the decoder one step at a time."""

        # the maximum number of steps to unroll the RNN
        if max_len is None:
            max_len = trg_mask.size(-1)

        # initialize decoder hidden state
        if hidden is None:
            hidden = self.init_hidden(encoder_final)


        # here we store all intermediate hidden states and pre-output vectors
        decoder_states = []
        pre_output_vectors = []

        # unroll the decoder RNN for max_len steps
        for i in range(max_len):
            prev_embed = trg_embed[:, i].unsqueeze(1)
            output, hidden, pre_output = self.forward_step(
                prev_embed, encoder_hidden, src_mask, hidden
            )
            decoder_states.append(output)
            pre_output_vectors.append(pre_output)
        decoder_states = torch.cat(decoder_states, dim=1)
        pre_output_vectors = torch.cat(pre_output_vectors, dim=1)
        return decoder_states, hidden, pre_output_vectors  # [B, N, D]
    def init_hidden(self, encoder_final):
        """Returns the initial decoder state,
        conditioned on the final encoder state."""
        if encoder_final is None:
            return None  # start with zeros
        return torch.tanh(self.bridge(encoder_final))


## Full Model<br>
Here we define a function from hyperparameters to a full model.<br>


In [10]:
def make_model(
    src_vocab,
    tgt_vocab,
    emb_size=64,
    hidden_size=128,
    num_layers=1,
    dropout=0.1,
):
    "Helper: Construct a model from hyperparameters."
    model = EncoderDecoder(
        Encoder(emb_size, hidden_size, num_layers=num_layers, dropout=dropout),
        Decoder(
            emb_size,
            hidden_size,
            num_layers=num_layers,
            dropout=dropout,
        ),
        nn.Embedding(src_vocab, emb_size, padding_idx=0),
        nn.Embedding(tgt_vocab, emb_size, padding_idx=0),
        Generator(hidden_size, tgt_vocab),
    )
    return model.cuda() if USE_CUDA else model


# Training<br>

## Batches and Masking<br>


In [11]:
class Batch:
    """Object for holding a batch of data with mask during training.
    Input is a batch from a torch text iterator.
    """
    def __init__(self, src, trg, pad_index=0):
        src, src_lengths = src
        self.src = src
        self.src_lengths = src_lengths
        self.src_mask = (src != pad_index).unsqueeze(-2)
        self.nseqs = src.size(0)
        self.trg = None
        self.trg_y = None
        self.trg_mask = None
        self.trg_lengths = None
        self.ntokens = None
        if trg is not None:
            trg, trg_lengths = trg
            self.trg = trg[:, :-1]
            self.trg_lengths = trg_lengths
            self.trg_y = trg[:, 1:]
            self.trg_mask = self.trg_y != pad_index
            self.ntokens = (self.trg_y != pad_index).data.sum().item()
        if USE_CUDA:
            self.src = self.src.cuda()
            self.src_mask = self.src_mask.cuda()
            if trg is not None:
                self.trg = self.trg.cuda()
                self.trg_y = self.trg_y.cuda()
                self.trg_mask = self.trg_mask.cuda()


## Training Loop<br>
The code below trains the model for 1 epoch (=1 pass through the training data).<br>


In [12]:
def run_epoch(data_iter, model, loss_compute, print_every=50):
    """Standard Training and Logging Function"""
    start = time.time()
    total_tokens = 0
    total_loss = 0
    print_tokens = 0
    for i, batch in enumerate(data_iter, 1):
        out, _, pre_output = model.forward(
            batch.src,
            batch.trg,
            batch.src_mask,
            batch.trg_mask,
            batch.src_lengths,
            batch.trg_lengths,
        )
        loss = loss_compute(pre_output, batch.trg_y, batch.nseqs)
        total_loss += loss
        total_tokens += batch.ntokens
        print_tokens += batch.ntokens
        if model.training and i % print_every == 0:
            elapsed = time.time() - start
            print(
                "Epoch Step: %d Loss: %f Tokens per Sec: %f"
                % (i, loss / batch.nseqs, print_tokens / elapsed)
            )
            start = time.time()
            print_tokens = 0
    return math.exp(total_loss / float(total_tokens))


## Synthetic Data<br>


In [13]:
target_to_text = {
    "0": "0",
    "1": "1",
    "2": "two",
    "3": "three",
    "4": "four",
    "5": "five",
    "6": "six",
    "7": "seven",
    "8": "eight",
    "9": "nine",
}
input_characters = " ".join(target_to_text.values())
valid_characters = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9",] + list(
    set(input_characters)
)

In [14]:
src_vocab_len = len(valid_characters)
trg_vocab_len = len(target_to_text.keys())

In [15]:
def data_gen(
    num_words=9,
    batch_size=16,
    num_batches=100,
    min_length=3,
    max_length=8,
    pad_index=0,
    eos_index=1,
    sos_index=1,
):
    """Generate random data for a src-tgt copy task."""
    for i in range(num_batches):
        data = [
            np.random.randint(
                2,
                num_words,
                size=(np.random.randint(min_length, max_length + 1)),
            )
            for i in range(batch_size)
        ]
        for arr in data:
            arr[-1] = eos_index
            arr[0] = sos_index
        trg_max_length = max([len(i) for i in data])
        tmp = np.zeros((batch_size, trg_max_length), dtype="int64")
        trg_lengths = []
        for i, arr in enumerate(data):
            cur_len = len(arr)
            trg_lengths.append(cur_len)
            tmp[i, :cur_len] = arr
        data = tmp
        src = [
            [
                target_to_text[str(x)]
                for x in i
                if x not in (pad_index, eos_index, sos_index)
            ]
            for i in data
        ]
        src = [[valid_characters.index(el) for el in " ".join(y)] for y in src]
        src_max_len = max([len(i) for i in src])
        src_lengths = []
        tmp = np.zeros((batch_size, src_max_len), dtype="int64")
        for i, arr in enumerate(src):
            cur_len = len(arr)
            src_lengths.append(cur_len)
            tmp[i, :cur_len] = arr
        src = torch.from_numpy(tmp)
        data = torch.from_numpy(data)
        data = data.cuda() if USE_CUDA else data
        trg = data
        yield Batch(
            (torch.LongTensor(src), src_lengths),
            (torch.LongTensor(trg), trg_lengths),
            pad_index=pad_index,
        )


## Loss Computation
  
A simple loss compute and train function.


In [16]:
class SimpleLossCompute:
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
    def __call__(self, x, y, norm):
        x = self.generator(x)
        loss = self.criterion(
            x.contiguous().view(-1, x.size(-1)), y.contiguous().view(-1)
        )
        loss = loss / norm
        if self.opt is not None:
            loss.backward()
            self.opt.step()
            self.opt.zero_grad()
        return loss.data.item() * norm


### Printing examples<br>

We use greedy decoding for simplicity; that is, at each time step, starting at the first token, we choose the one with that maximum probability, and we never revisit that choice.<br>


In [17]:
def greedy_decode(
    model, src, src_mask, src_lengths, max_len=10, sos_index=1, eos_index=1
):
    """Greedily decode a sentence."""
    with torch.no_grad():
        encoder_hidden, encoder_final = model.encode(src, src_mask, src_lengths)
        prev_y = torch.ones(1, 1).fill_(sos_index).type_as(src)
        trg_mask = torch.ones_like(prev_y)
    output = []
    hidden = None
    for i in range(max_len):
        with torch.no_grad():
            out, hidden, pre_output = model.decode(
                encoder_hidden,
                encoder_final,
                src_mask,
                prev_y,
                trg_mask,
                hidden,
            )

            # we predict from the pre-output layer, which is
            # a combination of Decoder state, prev emb, and context
            prob = model.generator(pre_output[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.data.item()
        output.append(next_word)
        prev_y = torch.ones(1, 1).type_as(src).fill_(next_word)
    output = np.array(output)

    # cut off everything starting from </s>
    # (only when eos_index provided)
    if eos_index is not None:
        first_eos = np.where(output == eos_index)[0]
        if len(first_eos) > 0:
            output = output[: first_eos[0]]
    return output

In [18]:
def lookup_words(x, vocab=None):
    if vocab is not None:
        x = [vocab.itos[i] for i in x]
    return [str(t) for t in x]

In [19]:
def turn_num_to_text(nums):
    return [valid_characters[num] for num in nums]

In [20]:
def print_examples(
    example_iter,
    model,
    n=2,
    max_len=10,
    sos_index=1,
    src_eos_index=None,
    trg_eos_index=1,
    src_vocab=None,
    trg_vocab=None,
):
    """Prints N examples. Assumes batch size of 1."""
    model.eval()
    count = 0
    print()
    if src_vocab is not None and trg_vocab is not None:
        src_eos_index = src_vocab.stoi[EOS_TOKEN]
        trg_sos_index = trg_vocab.stoi[SOS_TOKEN]
        trg_eos_index = trg_vocab.stoi[EOS_TOKEN]
    else:
        src_eos_index = None
        trg_sos_index = 1
        trg_eos_index = 1
    for i, batch in enumerate(example_iter):
        src = batch.src.cpu().numpy()[0, :]
        trg = batch.trg_y.cpu().numpy()[0, :]

        # remove </s> (if it is there)
        src = src[:-1] if src[-1] == src_eos_index else src
        trg = trg[:-1] if trg[-1] == trg_eos_index else trg
        result = greedy_decode(
            model,
            batch.src,
            batch.src_mask,
            batch.src_lengths,
            sos_index=trg_sos_index,
            eos_index=trg_eos_index,
        )
        match = 0
        print("Example #%d" % (i + 1))
        print("Src : ", "".join(turn_num_to_text(src)))
        print("Trg : ", " ".join(lookup_words(trg, vocab=trg_vocab)))
        print("Pred: ", " ".join(lookup_words(result, vocab=trg_vocab)))
        print()
        count += 1
        print()
        if count == n:
            break
    return src,trg


## Training the "translating" task
  



In [None]:
def train_trans_task():
    num_words = 10
    criterion = nn.NLLLoss(reduction="sum", ignore_index=0)
    model = make_model(
        src_vocab_len, trg_vocab_len, emb_size=32, hidden_size=64
    )
    optim = torch.optim.Adam(model.parameters(), lr=0.0003)
    min_length = 4
    max_length = 150
    batch_size = 32
    num_batches = 150
    eval_data = list(
        data_gen(
            num_words=num_words,
            batch_size=1,
            num_batches=num_batches,
            min_length=min_length,
            max_length=max_length,
        )
    )
    dev_perplexities = []
    if USE_CUDA:
        model.cuda()
    for epoch in range(10):
        print("Epoch %d" % epoch)
        data = data_gen(
            num_words=num_words,
            batch_size=batch_size,
            num_batches=num_batches,
            min_length=min_length,
            max_length=max_length,
        )
        # train
        model.train()
        run_epoch(
            data, model, SimpleLossCompute(model.generator, criterion, optim)
        )

        # evaluate
        model.eval()
        with torch.no_grad():
            perplexity = run_epoch(
                eval_data,
                model,
                SimpleLossCompute(model.generator, criterion, None),
            )
            print("Evaluation perplexity: %f" % perplexity)
            dev_perplexities.append(perplexity)
            #src_ex,trg_ex=print_examples(eval_data, model, n=2, max_len=max_length)
    return dev_perplexities#,src_ex,trg_ex


In [None]:
dev_perplexities = train_trans_task()

  "num_layers={}".format(dropout, num_layers))


Epoch 0
Epoch Step: 50 Loss: 164.726929 Tokens per Sec: 151.965372
Epoch Step: 100 Loss: 203.113098 Tokens per Sec: 164.072132
Epoch Step: 150 Loss: 162.904739 Tokens per Sec: 166.993928
Evaluation perplexity: 8.300728
Epoch 1
Epoch Step: 50 Loss: 154.496826 Tokens per Sec: 157.744379
Epoch Step: 100 Loss: 151.661118 Tokens per Sec: 161.732837
Epoch Step: 150 Loss: 142.145813 Tokens per Sec: 166.023368
Evaluation perplexity: 8.199388
Epoch 2
Epoch Step: 50 Loss: 152.990280 Tokens per Sec: 162.938273
Epoch Step: 100 Loss: 139.293213 Tokens per Sec: 160.848474
Epoch Step: 150 Loss: 154.508865 Tokens per Sec: 159.663959
Evaluation perplexity: 8.129913
Epoch 3
Epoch Step: 50 Loss: 154.930649 Tokens per Sec: 165.481599
Epoch Step: 100 Loss: 179.833450 Tokens per Sec: 158.378223
Epoch Step: 150 Loss: 168.300095 Tokens per Sec: 152.126946
Evaluation perplexity: 8.082966
Epoch 4
Epoch Step: 50 Loss: 148.695267 Tokens per Sec: 141.859177
Epoch Step: 100 Loss: 149.059250 Tokens per Sec: 146.7487

In [None]:
def plot_perplexity(perplexities):
    """plot perplexities"""
    plt.title("Perplexity per Epoch")
    plt.xlabel("Epoch")
    plt.ylabel("Perplexity")
    plt.plot(perplexities)
    plt.show()

In [None]:
plot_perplexity(dev_perplexities)