# Simplest Encoder-Decoder Model


In [1]:
# import libraries
%matplotlib inline
import torch
import torch.nn as nn
import torch.nn.functional as F
import re
import os
import unicodedata
import numpy as np
import math
import copy
import time
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from IPython.core.debugger import set_trace

In [27]:
# use CUDA if it is available
USE_CUDA = torch.cuda.is_available()
DEVICE=torch.device('cuda:0') # or set to 'cpu'
print("CUDA:", USE_CUDA)
print(DEVICE)

CUDA: False
cuda:0


In [28]:
MAX_LENGTH = 10  # Maximum sentence length

# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token

In [29]:
# set seeds
seed = 1111
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# 1. Sequence-to-Sequence Model Architecture

<img src="seq2seq_model.png">

# 2. Record the mapping in `Voc`

class `Voc` records the mapping between word and its index, and records the vocabulary size.


In [4]:
class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3  # Count SOS, EOS, PAD

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        self.num_words = 3 # Count default tokens
        for word in keep_words:
            self.addWord(word)

In [5]:
# Lowercase and remove non-letter characters
def normalizeString(s):
    s = s.lower()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [6]:
# Takes string sentence, returns sentence of word indexes
def indexesFromSentence(voc, sentence):
    return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]

# 3. Encoder

In encoder, it iterates a sentence each timestep (i.e. word) by timestep. At each timestep, it outputs a hidden state vector and an output vector. The hidden state vector is passed to the next timestep. The output vector is recorded.

The last hidden state vector is stated as **"Encoder state"**, also known as **"Context Vector"**.

The first hidden state vector fed to the first timestep is all zeros.

The sentence is pad with *PAD_token* to a certain length.


In [30]:
class Encoder(nn.Module):
    def __init__(self, hidden_dim, vocab_size, embedding_dim, n_layers=1, dropout=0):
        """
        constructor
        @param hidden_dim     hidden dimension
        @param vocab_size     input vocabulary size
        @param embedding_dim  embedding dimension
        @param n_layers       number of recurrent layers
        @param dropout        dropout rate
        """
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        # embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim=embedding_dim)
        
        # recurrent layer
        self.rnn = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=(0 if n_layers == 1 else dropout),
            batch_first=True,
            bidirectional=True)
        
    def forward(self, input_seq, input_mask, input_lengths, hidden=None):
        """
        Applies a bi-directional GRU to input sequence input_seq.
        @param  input_seq      input sequence, [batch_size, input_lengths]
        @param  input_mask     input mask
        @param  input_lengths  input length
        @param  hidden         previous hidden state vector
        """
        # convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # embedded should have dimensions [batch, input_lengths, self.embedding_dim]
        
        # pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths, batch_first=True)
        # packed should have dimension [batch, input_lengths, self.embedding_dim]
        
        # forward pass through recurrent layer
        output, hidden = self.rnn(packed, hidden)
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        
        # manually concatenate the final state for both directions
        forward_hidden = hidden[0:hidden.size(0):2]
        backward_hidden = hidden[1:hidden.size(0):2]
        final_hidden = torch.cat([forward_hidden, backward_hidden], dim=2)  # shape [self.n_layers, batch_size, 2*self.embedding_dim]
        
        # return output and final hidden state
        return output, final_hidden

# 4. Decoder

In decoder, it tries to reproduce the sentence each timestep (i.e. word) by timestep. At each timestep, it also outputs a hidden state vector and an output vector. The first initial hidden state at the first timestep is same as the "Context Vector" of the Encoder.


In [8]:
class Decoder(nn.Module):
    def __init__(self, hidden_dim, target_vocab_size, embedding_dim, output_dim, n_layers=1, dropout=0.1, bridge=True):
        """
        constructor
        @param hidden_dim         hidden dimension
        @param target_vocab_size  target vocabulary size
        @param embedding_dim      embedding dimension
        @param output_dim         output dimension
        @param n_layers           number of recurrent layers
        @param dropout            dropout rate
        """
        super(Decoder, self).__init__()
        
        # keep for reference
        self.hidden_dim = hidden_dim
        self.vocab_size = target_vocab_size
        self.embedding_dim = embedding_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        # embedding layer
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, 
            embedding_dim=embedding_dim)
        
        # embedding dropout
        self.embedding_dropout = nn.Dropout(p=dropout)
        
        # recurrent layer
        # QUESTION: 为什么要 +2*hidden_dim ?????
        self.rnn = nn.GRU(
            input_size=embedding_dim + 2*hidden_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            dropout=(0 if n_layers == 1 else dropout),
            batch_first=True,
            bidirectional=False)
        
        # to initialize from the final encoder state
        # QUESTION: self.bridge 是做什么的 ？？
        self.bridge = nn.Linear(2*hidden_dim, hidden_dim, bias=True) if bridge else None
        
        # dropout layer
        self.dropout = nn.Dropout(p=dropout)
        
        # fc layer
        self.out = nn.Linear(hidden_dim, output_dim)
    
    def forward_at_one_timestep(self, word, encoder_hidden, src_mask, )
    def forward(self, input_step, last_hidden, encoder_outputs):
        # get embedding of current input word
        embedded = self.embedding(input_step)
        embedded = self.embedding_dropout(embedded)
        
        # forward through recurrent layer
        rnn_output, hidden = self.rnn(embedded, last_hidden)
        rnn_output = rnn_output.squeeze(0)
        
        # predict next word
        output = self.out(rnn_output)
        output = F.softmax(output, dim=1)
        
        # return output and final hidden state
        return output, hidden

# 5. Wrapper

class Wrapper wraps one encoder and one decoder.

In [9]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, target, src_mask, target_mask, src_length, target_length):
        encoder_output, encoder_hidden = self.encode(src, src_mask, src_length)
        return self.decode(encoder_output, encoder_hidden, src_mask, target, target_mask)
    
    def encode(self, src, src_mask, src_length):
        return self.encoder(src, src_mask, src_length)
    
    def decode(self, encoder_output, encoder_hidden, src_mask, target, target_mask, decoder_hidden=None):
        return self.decoder(target, encoder_output, encoder_hidden, src_mask, target_mask, hidden=decoder_hidden)


# 5. Evaluate

In [10]:
def evaluate(searcher, voc, sentence, max_length=MAX_LENGTH):
    # format input sentence as a batch
    # words -> indexes
    indexes_batch = [indexesFromSentence(voc, sentence)]
    
    # create lengths tensor
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    
    # transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    
    # use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    
    # decoder sentence with searcher
    tokens, scores = searcher(input_batch, lengths, max_length)
    
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    
    return decoded_words

In [11]:
# evaluate inputs from user input (stdin)
def evaluateInput(searcher, voc):
    input_sentence = ''
    while True:
        try:
            # get input sentence
            input_sentence = input('> ')
            
            # check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit':
                break
            
            # normalize sentence
            input_sentence = normalizeString(input_sentence)
            
            # evaluate sentence
            output_words = evaluate(searcher, voc, input_sentence)
            
            # format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
        except KeyError:
            print("Error: Encoutered unknown word.")

In [12]:
# Normalize input sentence and call evaluate()
def evaluateExample(sentence, searcher, voc):
    print("> " + sentence)
    
    # Normalize sentence
    input_sentence = normalizeString(sentence)
    
    # Evaluate sentence
    output_words = evaluate(searcher, voc, input_sentence)
    output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
    print('Bot:', ' '.join(output_words))

In [17]:
# set configurations
model_name = 'cb_model'

encoder_embedding_dim = 500
decoder_embedding_dim = 500

encoder_hidden_dim = 500
decoder_hidden_dim = 500

encoder_n_layers = 2
decoder_n_layers = 2

encoder_dropout_rate = 0.1
decoder_dropout_rate = 0.1

batch_size = 64

iterations = 4000

In [14]:
# read corpus and create Voc instance
corpus_name = 'cornell movie-dialogs corpus'
voc = Voc(corpus_name)

In [25]:
# initialize models
print('Building encoder and decoder ...')
vocab_size = voc.num_words
encoder = Encoder(encoder_hidden_dim, vocab_size, encoder_embedding_dim, n_layers=1, dropout=0)
decoder = Decoder(decoder_hidden_dim, vocab_size, decoder_embedding_dim, voc.num_words, n_layers=1, dropout=0.1)

# use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)

# set models to train mode
encoder.train()
decoder.train()
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


# 6. Convert Module to TorchScript


In [26]:
### Compile the whole greedy search model to TorchScript model
# Create artificial inputs
test_seq = torch.LongTensor(MAX_LENGTH, 1).random_(0, voc.num_words).to(device)
test_seq_length = torch.LongTensor([test_seq.size()[0]]).to(device)
# Trace the model
traced_encoder = torch.jit.trace(encoder, (test_seq, test_seq_length))

TypeError: tuple indices must be integers or slices, not tuple