In [2]:
from IPython.display import Image
from IPython.core.display import HTML
import getpass

from traitlets.config.manager import BaseJSONConfigManager
path = "/Users/{}/anaconda3/envs/rise_latest/etc/jupyter/nbconfig".format(getpass.getuser())
cm = BaseJSONConfigManager(config_dir=path)
o = cm.update("livereveal", {
              "theme": "sky",
              "transition": "fade",
              "start_slideshow_at": "selected",
})

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# Sequence to Sequence Modeling

## Topics
* Encoder-Decoder Architecture
* Neural Machine Translation

## Variable Length Sequence to Sequence
<br>
<br>
<center>
<img src="src/Shape_of_NLP_Problems_8.png?" alt="perceptron" style="width:968px">
</center>   

## Sequence-to-Sequence Overview
<br>
<br>
<center>
<img src="src/0_encoder_decoder.png?" alt="perceptron" style="width:968px">
</center>   

## Encoder-Decoder Architecture
<br>
<br>
<center>
<img src="src/1_encoder_decoder.png?" alt="perceptron" style="width:968px">
</center>   

## Sequence-to-Sequence Training: Input
<br>
<br>
<center>
<img src="src/2_encoder_decoder.png?" alt="perceptron" style="width:968px">
</center>   

## Sequence-to-Sequence Training: Loss
<br>
<br>
<center>
<img src="src/3_encoder_decoder.png?" alt="perceptron" style="width:968px">
</center>   

## Sequence-to-Sequence Training: Teacher Forcing
<br>
<br>
<center>
<img src="src/3a_encoder_decoder.png?" alt="perceptron" style="width:968px">
</center>   

## Sequence-to-Sequence Inference
<br>
<br>
<center>
<img src="src/4_encoder_decoder.png?" alt="perceptron" style="width:968px">
</center>   

In [3]:
# Required imports
import torch
import numpy as np
import pandas as pd
import pickle
from torch.nn import Linear, Embedding, RNN, GRU, LSTM
from torch.nn import Sigmoid, LogSoftmax
from torch.optim import SGD, Adam
from torch.nn import BCELoss, NLLLoss, CrossEntropyLoss
from string import punctuation
import itertools
from tqdm import tqdm
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

In [4]:
data = pd.read_pickle('../data/4_europarl_en_sp.pkl')
data['text'] = data['english'].map(lambda x: "".join([i for i in x.lower() if i not in string.punctuation]).split())
data['label'] = data['spanish'].map(lambda x: "".join([i for i in x.lower() if i not in string.punctuation]).split())

data['text'] = data['text'].map(lambda x: ['<SOS>'] + x + ['<EOS>'])
data['label'] = data['label'].map(lambda x: ['<SOS>'] + x + ['<EOS>'])
print(data.shape)
data.head()


(11618, 4)


Unnamed: 0,english,spanish,text,label
600702,Allow us to state that the opening of the inqu...,Permita que le digamos que las investigaciones...,"[<SOS>, allow, us, to, state, that, the, openi...","[<SOS>, permita, que, le, digamos, que, las, i..."
3733,"Since June of last year, the OLAF regulation h...",Desde junio del año pasado el Reglamento de la...,"[<SOS>, since, june, of, last, year, the, olaf...","[<SOS>, desde, junio, del, año, pasado, el, re..."
1180999,I am fully aware of the views expressed here o...,Soy plenamente consciente de las opiniones exp...,"[<SOS>, i, am, fully, aware, of, the, views, e...","[<SOS>, soy, plenamente, consciente, de, las, ..."
109251,I will have clarification of that situation la...,Hoy mismo recibiré información sobre dicha sit...,"[<SOS>, i, will, have, clarification, of, that...","[<SOS>, hoy, mismo, recibiré, información, sob..."
1401751,"Let us not forget, the Roma were the first to ...",No olvidemos que la población gitana fue la pr...,"[<SOS>, let, us, not, forget, the, roma, were,...","[<SOS>, no, olvidemos, que, la, población, git..."


In [5]:
input_words = set(itertools.chain.from_iterable(data['text']))
output_words = set(itertools.chain.from_iterable(data['label']))

input2idx = {word: idx for idx, word in enumerate(input_words)}
idx2input = {idx: word for word, idx in input2idx.items()}

output2idx = {word: idx for idx, word in enumerate(output_words)}
idx2output = {idx: word for word, idx in output2idx.items()}

input_size = len(input_words)
output_size = len(output_words)

In [6]:
input_seqs = data['text'].map(lambda x: [input2idx[i] for i in x]).tolist()
output_seqs = data['label'].map(lambda x: [output2idx[i] for i in x]).tolist()

data = list(zip(input_seqs, output_seqs))

train_data, test_data = train_test_split(data)

In [8]:
# First we'll want to create an embedding layer for our encoder
enc_embedding = Embedding(num_embeddings=input_size, embedding_dim=100)

# Next we'll want to create an LSTM for our encoder
enc_rnn = LSTM(input_size=100, hidden_size=50)

# Next we'll want to create an embedding layer for our decoder
dec_embedding = Embedding(num_embeddings=output_size, embedding_dim=100)

# Next, create an LSTM for the decoder
dec_rnn = LSTM(input_size=100, hidden_size=50)

# When we read the output from the decoder network we'll
# want to classify it as one of the words from the output corpus
# to do this, we'll need a linear layer
dec_linear = Linear(50, output_size)

# Lastly, we'll need an instance of LogSoftmax to convert the
# output to a softmax distribution for feeding into NLLLoss
# Hint: Set dim=0 on initialization here
softmax = LogSoftmax(dim=1)

# Create an instance of the NLLLoss
criterion = NLLLoss()

In [9]:
# grab an example input and output sequence:
input_seq = input_seqs[0]
output_seq = output_seqs[0]

# convert these to torch tensors
input_seq_tensor = torch.LongTensor(input_seq)
output_seq_tensor = torch.LongTensor(output_seq)
print("Input Sequence Tensor Shape:", input_seq_tensor.shape)
print("Output Sequence Tensor Shape:", output_seq_tensor.shape)

# pass the input sequence through the encoder embedding
enc_embedded = enc_embedding.forward(input_seq_tensor)
print("Encoder Embedded Sequence Shape:", enc_embedded.shape)

# unsqueeze the embedding tensor to have a batch size of 1
enc_embedded_unsqueezed = enc_embedded.unsqueeze(1)
print("Encoder Embedded Sequence Shape (1 batch):", enc_embedded.shape)

# create initial hidden states for the encoder LSTM
h0 = torch.zeros(1, 1, 50)
c0 = torch.zeros(1, 1, 50)
enc_hidden = (h0, c0)
    

# pass the embedded input sequence through the encoder LSTM
enc_out, enc_hidden = enc_rnn(enc_embedded_unsqueezed, enc_hidden)
print("Encoder Output Shape:", enc_out.shape)
print("Encoder Hidden Shape(s):", enc_hidden[0].shape, enc_hidden[1].shape)

# set the decoder rnn initial hidden state 
# to the last hidden state of the encoder rnn
dec_hidden = enc_hidden

# pass the output sequence through the decoder embedding layer
dec_embedded = dec_embedding(output_seq_tensor)
print("Decoder Embedded Sequence Shape:", dec_embedded.shape)

# unsqueeze the embedding tensor to have a batch size of 1
dec_embedded_unsqueezed = dec_embedded.unsqueeze(1)
print("Decoder Embedded Sequence Shape (1 batch):", dec_embedded.shape)

# Assuming only teacher forcing (running true answer as input to the decoder)
# Run the output sequence through the dec LSTM
# Note: we want to pass ALL BUT THE LAST elemebt of the output 
# sequence through the LSTM

dec_lstm_in = dec_embedded_unsqueezed[:-1]
print("Decoder LSTM Input Shape:", dec_lstm_in.shape)

dec_out, dec_hidden = dec_rnn(dec_lstm_in, dec_hidden)
print("Decoder Output Shape:", dec_out.shape)
print("Decoder Hidden Shape(s):", dec_hidden[0].shape, dec_hidden[1].shape)

# Now we want to run the decoder output through our decoder linear layer
# Also, squeeze the batch dimension (dim=1) of the linear output to get rid of it
dec_linear_output = dec_linear(dec_out).squeeze(1)
print("Decoder Linear Output Shape:", dec_linear_output.shape)

# pass the decoder linear output through a softmax
dec_softmax_output = softmax(dec_linear_output)
print("Decoder Softmax Output Shape:", dec_softmax_output.shape)

# verify that the decoder output distributions is _actually_ a softmax
dec_softmax_norms = torch.exp(dec_softmax_output).sum(dim=1)
print("Decoder Softmax Norms Shape:", dec_softmax_norms.shape)
print("Decoder Softmax Norms:", dec_softmax_norms)

# the targets for the loss function should be
# ALL BUT THE FIRST element of the output sequence
dec_loss_target = output_seq_tensor[1:]
print("Decoder Loss Target Shape:", dec_loss_target.shape)

# Calculate the loss using the decoder softmax 
# output and the decoder loss target
loss = criterion(dec_softmax_output, dec_loss_target)
print("Loss:", loss.data)

Input Sequence Tensor Shape: torch.Size([16])
Output Sequence Tensor Shape: torch.Size([13])
Encoder Embedded Sequence Shape: torch.Size([16, 100])
Encoder Embedded Sequence Shape (1 batch): torch.Size([16, 100])
Encoder Output Shape: torch.Size([16, 1, 50])
Encoder Hidden Shape(s): torch.Size([1, 1, 50]) torch.Size([1, 1, 50])
Decoder Embedded Sequence Shape: torch.Size([13, 100])
Decoder Embedded Sequence Shape (1 batch): torch.Size([13, 100])
Decoder LSTM Input Shape: torch.Size([12, 1, 100])
Decoder Output Shape: torch.Size([12, 1, 50])
Decoder Hidden Shape(s): torch.Size([1, 1, 50]) torch.Size([1, 1, 50])
Decoder Linear Output Shape: torch.Size([12, 15608])
Decoder Softmax Output Shape: torch.Size([12, 15608])
Decoder Softmax Norms Shape: torch.Size([12])
Decoder Softmax Norms: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000], grad_fn=<SumBackward2>)
Decoder Loss Target Shape: torch.Size([12])
Loss: tensor(9.7073)


In [10]:
class encoder(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, batch_size):
        super(encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = Embedding(num_embeddings=vocab_size, 
                                   embedding_dim=embedding_dim)
        self.rnn = LSTM(input_size=embedding_dim, 
                       hidden_size=hidden_dim)
        self.batch_size = batch_size
        self.softmax = LogSoftmax()
        self.hidden = self.init_hidden()
                
    def forward(self, x):
        e = self.embedding(x)
        e = e.view(len(x), self.batch_size, -1)
        out, self.hidden = self.rnn(e, self.hidden)
        return out, self.hidden
                  
    def init_hidden(self):
        h0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        c0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)
    
class decoder(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, batch_size):
        super(decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = Embedding(num_embeddings=vocab_size, 
                                   embedding_dim=embedding_dim)
        self.rnn = LSTM(input_size=embedding_dim, 
                       hidden_size=hidden_dim)
        self.linear = Linear(hidden_dim, output_dim)
        self.batch_size = batch_size
        self.softmax = LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
                
    def forward(self, input, hidden):
        self.hidden = hidden
        e = self.embedding(input)
        e = e.view(len(input), self.batch_size, -1)
        out, self.hidden = self.rnn(e, self.hidden)
        self.out = out
        output = self.linear(out[0])
        so = self.softmax(output)
        return so, self.hidden
                  
    def init_hidden(self):
        h0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        c0 = torch.autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
        return (h0, c0)
    
class seq2seq(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(seq2seq, self).__init__()
        self.enc = encoder
        self.dec = decoder
                
    def forward(self, input_seq, output_seq, p_tf=0):
        outputs = []
        
        self.enc.hidden = self.enc.init_hidden()
        self.dec.hidden = self.dec.init_hidden()        
        
        enc_output, enc_hidden = enc.forward(torch.LongTensor(input_seq))
        dec_hidden = enc_hidden
        tf_cnt = 0
        for i in range(output_seq.shape[0]):  
            
            if (np.random.uniform()) > p_tf and (i != 0):
                dec_input = torch.LongTensor([torch.argmax(dec_output).data])
            else:
                dec_input = torch.LongTensor([output_seq[i]])
                
            dec_output, dec_hidden = self.dec.forward(dec_input, dec_hidden) 
            outputs.append(dec_output)
            
        return torch.stack(outputs).squeeze(1)
    
    def predict(self, input_seq, sos_idx, eos_idx, max_len=20):
        outputs = []
        self.enc.hidden = self.enc.init_hidden()
        self.dec.hidden = self.dec.init_hidden()   
        
        enc_output, enc_hidden = enc.forward(torch.LongTensor(input_seq))
        dec_hidden = enc_hidden
        
        cnt = 0
        dec_input = torch.LongTensor([sos_idx])
        
        dec_output, dec_hidden = self.dec.forward(dec_input, dec_hidden) 
        
        output_idx = torch.argmax(dec_output).data
        
        while (int(output_idx) != eos_idx) and (cnt <= max_len): 
            cnt += 1
            dec_input = torch.LongTensor([output_idx])        
            dec_output, dec_hidden = self.dec.forward(dec_input, dec_hidden) 
            output_idx = torch.argmax(dec_output).data
            outputs.append(int(output_idx))
            
            
        return outputs  

In [11]:
enc_vocab_size = input_size
enc_embedding_dim = 100
enc_hidden_dim = 50

dec_vocab_size = output_size
dec_embedding_dim = 100
dec_hidden_dim = 50
dec_output_dim = output_size

enc = encoder(enc_vocab_size, enc_embedding_dim, enc_hidden_dim, batch_size=1)
dec = decoder(dec_vocab_size, dec_embedding_dim, dec_hidden_dim, dec_output_dim, batch_size=1)
s2s = seq2seq(enc, dec)


optim = SGD(params=s2s.parameters(), lr=0.01)
criterion = NLLLoss()

In [12]:
epochs = 2
for epoch in range(epochs):
    s2s.train()
    total_loss = 0
    s2s.train()
    for it, example in enumerate(train_data):

        if (it % 100 == 0) and (it != 0):
            print("Epoch|it: {}|{}, Total Loss: {:.2f}".format(epoch, it, total_loss / it))
        input_seq, output_seq = example
        optim.zero_grad()

        input_seq = torch.LongTensor(input_seq)
        output_seq = torch.LongTensor(output_seq)

        res = s2s.forward(input_seq, output_seq[:-1], p_tf=0.5)
        loss = criterion(res, output_seq[1:])
        loss.backward()
        total_loss += loss.data.numpy()

        optim.step()


Epoch|it: 0|100, Total Loss: 9.65


KeyboardInterrupt: 

In [13]:
sos_idx = output2idx['<SOS>']
eos_idx = output2idx['<EOS>']
pred_idxs = s2s.predict(input_seq, sos_idx, eos_idx)
[idx2output[i] for i in pred_idxs]


['empresariales',
 'quedan',
 'cancelación',
 'rübig',
 'entrevistas',
 'países',
 'alimentaria',
 'ocupar',
 'culturas',
 'hora',
 'nombramientos',
 'colosal',
 'paulatina',
 'constituyen',
 'sombra',
 'interferían',
 'venganza',
 'democratacristiana',
 'cero',
 'hiv',
 'reintroducir']