## Sequence to Sequence translation practice

What I will do will like below.
1. Build a baseline translation system.
    1. One-directional RNN as encoder and decoder.
    2. Have Attention architecture.
2. Using word_embdding as input.
3. Using bi-directional RNN
4. Try more architecture.  

About the language, at first try to reimplement system using German to English.  
Then try to change it to Chinese to English.

### Preprocessing
- [ ] Establish a language class, it has word2index, index2word, and wordsdict.
- [ ] Split corpus into language pairs.
- [ ] Encode function


In [1]:
# Import cell
import re
import unicodedata
import string
import os
import random
import time

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

#### Index words

In [2]:
SOS_token = 0
EOS_token = 1

class Lang:
    """Language class. Can index words for input sentences."""
    def __init__(self, name):
        self.name = name
        self.word2ind = {'SOS':0, 'EOS':1}
        self.word2cnt = {}
        self.ind2word = {0:'SOS', 1:'EOS'}
        self.nwords = 2
    
    def index_words(self, sentence):
        for word in sentence.split():
            self.index_word(word.lower())
    
    def index_word(self, word):
        if word not in self.word2ind:
            self.word2ind[word] = self.nwords
            self.word2cnt[word] = 1
            self.ind2word[self.nwords] = word
            self.nwords += 1
        else:
            self.word2cnt[word] += 1
            
    def __str__(self):
        return 'This is %s, it has %d words' % (self.name, self.nwords)

#### Reading and decoding language
First write function to decode and process the punctuation.  
*How to normalize Chinese?*

In [3]:
def unicode2ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_string(s):
    s = unicode2ascii(s.lower().strip())
    s = re.sub(r'([.!?])', r' \1', s)
    s = re.sub(r'[^a-zA-Z0-9.!?\t]', r' ', s)
    return s

#### Read Data

In [4]:
def read_langs(path, lang1_n, lang2_n):
    lang1 = Lang(lang1_n)
    lang2 = Lang(lang2_n)
    data = open(os.path.join(path,'%s-%s.txt'%(lang1_n, lang2_n)))
    pairs = []
    
    for line in data:
        pair = normalize_string(line).strip().split('\t')
        pairs.append(pair)
    return lang1, lang2, pairs

In [5]:
MAX_LENGTH = 20

good_prefixes = (
    "i",
    "he", 
    "she", 
    "you", 
    "they",
    "we"
)

def filter(p):
    return(len(p[0]) < MAX_LENGTH and p[0].startswith(good_prefixes))

def filter_pairs(pairs):
    return[pair for pair in pairs if filter(pair)]

In [6]:
def prepare_data(path, lang1_name, lang2_name):
    input_lang, output_lang, pairs = read_langs(path, lang1_name, lang2_name)
    print("Read %d sentence pairs" % len(pairs))
    
#     print(pairs[0:100])
    pairs = filter_pairs(pairs)
    print("Trimming to %d pairs." % len(pairs))
    print("Indexing words...")
    for pair in pairs:
        input_lang.index_words(pair[0])
        output_lang.index_words(pair[1])
    
    return input_lang, output_lang, pairs

In [7]:
path = '/home/andy/data/lang_pairs/'
lang1_name = 'eng'
lang2_name = 'deu'
input_lang, output_lang, pairs = prepare_data(path, lang1_name, lang2_name)

Read 152557 sentence pairs
Trimming to 7388 pairs.
Indexing words...


#### Turning data to variables

In [8]:
def index_sentence(sentence, lang):
    return [lang.word2ind[word] for word in sentence.strip().split()]

def variable_sentence(sentence, lang):
    indexs = index_sentence(sentence, lang)
    indexs = indexs + [EOS_token]
    var = Variable(torch.LongTensor(indexs).view(-1, 1)).cuda()
    return var

def variable_pair(pair):
    input_var = variable_sentence(pair[0], input_lang)
    output_var = variable_sentence(pair[1], output_lang)
    return (input_var, output_var)

## Building Model

### Encoder
We can use normal RNN layer to input input pair.
Get the final state.

In [9]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
    
    def forward(self, words_input, hidden):
        seq_len = len(words_input)
        embeded = self.embedding(words_input).view(seq_len, 1, -1)
        output, hidden = self.gru(embeded, hidden)
        return output, hidden
    
    def ini_hidden(self):
        hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size)).cuda()
        return hidden

### Attention Decoder
$$
p(y_i \mid \{y_1,...,y_{i-1}\}, x) = g(y_{i-1}, s_i, c_i)  
$$
$$
s_i = f(s_{i-1}, y_{i-1}, c_i)  
$$
$$
c_i = \sum_{j=1}^{T_x}a_{ij}h_i  
$$
$$
a_{ij} = \dfrac{exp(e_{ij})}{\sum_{k=1}^Texp(e_{ik})}
$$
$$
e_{ij} = a(s_{i-1}, h_{j})
$$

The general form of the attention calculation relies on the target (decoder) side hidden state and corresponding source (encoder) side state, normalized over all states to get values summing to 1:

$$
a_t(s) = align(h_t, \bar h_s)  = \dfrac{exp(score(h_t, \bar h_s))}{\sum_{s'} exp(score(h_t, \bar h_{s'}))}
$$

The specific "score" function that compares two states is either *dot*, a simple dot product between the states; *general*, a a dot product between the decoder hidden state and a linear transform of the encoder state; or *concat*, a dot product between a new parameter $v_a$ and a linear transform of the states concatenated together.

$$
score(h_t, \bar h_s) =
\begin{cases}
h_t ^\top \bar h_s & dot \\
h_t ^\top \textbf{W}_a \bar h_s & general \\
v_a ^\top \textbf{W}_a [ h_t ; \bar h_s ] & concat
\end{cases}
$$

The modular definition of these scoring functions gives us an opportunity to build specific attention module that can switch between the different score methods. The input to this module is always the hidden state (of the decoder RNN) and set of encoder outputs.

In [14]:
class Attn(nn.Module):
    def __init__(self, method, hidden_size):
        super().__init__()
        
        self.method = method
        self.hidden_size = hidden_size
        
        if method == 'general':
            self.attn = nn.Linear(hidden_size, hidden_size)
        elif method == 'concat':
            self.attn = nn.Linear(hidden_size*2, hidden_size)
            self.other = nn.Parameter(torch.FloatTensor(hidden_size))
    
    def forward(self, hidden, encoder_outputs):
        seq_len = len(encoder_outputs)
        attn_energies = Variable(torch.zeros(seq_len)).cuda()
        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[i])
        return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
    
    def score(self, hidden, encoder_output):
        if self.method == 'dot':
            return hidden.squeeze().dot(encoder_output.squeeze())
        if self.method == 'general':
            energy = self.attn(encoder_output)
            energy = hidden.squeeze().dot(energy.squeeze())
            return  energy
        if self.method == 'concat':
            energy = self.attn(torch.cat(hidden, encoder_output), 1)
            energy = self.other.dot(energy.squeeze())
            return energy

In [11]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, attn_mode, hidden_size, output_size, n_layers=1, dropout=0.1):
        super().__init__()
        # Parameters
        self.attn_mode = attn_mode
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        
        # Layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size * 2, output_size)
        
        #Attn mode
        if attn_mode != None:
            self.attn = Attn(attn_mode, hidden_size)
            
    def forward(self, word_input, last_hidden, last_context, encoder_outputs):
        word_embeded = self.embedding(word_input).view(1, 1, -1)
        
        rnn_input = torch.cat((word_embeded, last_context.unsqueeze(0)), 2)
        rnn_output, hidden = self.gru(rnn_input, last_hidden)
        
        attn_weight = self.attn(rnn_output.squeeze(0), encoder_outputs)
        context = attn_weight.bmm(encoder_outputs.transpose(0, 1))
        
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
        return output, context, hidden, attn_weight
        
        

### Testing Model

In [15]:
encoder_test = EncoderRNN(10, 10, 2).cuda()
decoder_test = AttnDecoderRNN('general', 10, 10, 2).cuda()
print(encoder_test)
print(decoder_test)

encoder_hidden = encoder_test.ini_hidden()
word_input =  Variable(torch.LongTensor([1, 2, 3])).cuda()
encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()

for i in range(3):
    decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_input[i], decoder_hidden,
                                                                               decoder_context, encoder_outputs)
    print(decoder_output.size(), decoder_hidden.size(), decoder_attn.size())
    decoder_attns[0, i] = decoder_attn.squeeze(0).cpu().data

EncoderRNN (
  (embedding): Embedding(10, 10)
  (gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
  (embedding): Embedding(10, 10)
  (gru): GRU(20, 10, num_layers=2, dropout=0.1)
  (out): Linear (20 -> 10)
  (attn): Attn (
    (attn): Linear (10 -> 10)
  )
)
torch.Size([1, 10]) torch.Size([2, 1, 10]) torch.Size([1, 1, 3])
torch.Size([1, 10]) torch.Size([2, 1, 10]) torch.Size([1, 1, 3])
torch.Size([1, 10]) torch.Size([2, 1, 10]) torch.Size([1, 1, 3])


## Training
The process:
1. First, run sentence word by word, and last get the outputs and last hidden state.
2. Feed hidden state and first input word SOS to decoder input and context.init.
3. And get output, hidden state. Do cycle.
4. When training, use teacher forcing.


In [13]:
teacher_force_ratio = 0.5
clip = 5

def training(input_var, target_var, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    
    loss = 0
    
    #Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    #Get length
    input_len = input_var.size()[0]
    target_len = target_var.size()[0]
    
    #Run words through encoder
    encoder_hidden = encoder.ini_hidden()
    encoder_outputs, encoder_hidden = encoder(input_var, encoder_hidden)
    
    #Then go to decoder, prepare input, context, hidden first
    decoder_input = Variable(torch.LongTensor([[SOS_token]])).cuda()
    decoder_hidden = encoder_hidden
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size)).cuda()
    
    #Training decoder, use teacher enforce
    is_teacher_enforce = random.random() < teacher_force_ratio
    if is_teacher_enforce:
        for i in range(target_len):
            decoder_output, decoder_context, decoder_hidden, decoder_atten = decoder(decoder_input,
                                                                                    decoder_hidden,
                                                                                    decoder_context,
                                                                                    encoder_outputs)
            loss += criterion(decoder_output, target_var[i])
            decoder_input = target_var[i]
    else:
        for i in range(target_len):
            decoder_output, decoder_context, decoder_hidden, decoder_atten = decoder(decoder_input,
                                                                                    decoder_hidden,
                                                                                    decoder_context,
                                                                                    encoder_outputs)
            loss += criterion(decoder_output, target_var[i])
            
            _, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            
            decoder_input = Variable(torch.LongTensor(ni)).cuda()
            if ni == EOS_token:
                break
    
    #Backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_len
    

In [14]:

# Time helper
def as_min(s):
    m = s//60
    s = s % 60
    return "%dm %ds" % (m,s)

def since_time(since, percent):
    s = time.time()
    s = s - since
    es = s / percent
    rs = es - s
    return "%s - (%s)" % (as_min(s), as_min(rs))

## Running Training

In [15]:
#hyperparameter, initial model, optimizor, and loss function
attn_mode = "general"
hidden_size = 500
n_layers = 2
dropout = 0.05

#initial model
encoder = EncoderRNN(input_lang.nwords, hidden_size, n_layers).cuda()
decoder = AttnDecoderRNN(attn_mode, hidden_size, output_lang.nwords, n_layers, dropout).cuda()

#optimizor and learning_rate
learning_rate = 1e-4
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [16]:
# plotting and data
n_epochs = 50000
plot_every = 300
print_every = 1000

#history storage
plot_losses = []
plot_loss_total = 0
print_loss_total = 0

In [19]:
start = time.time()
for epoch in range(1, n_epochs+1):
    # Get data
    input_var, output_var = variable_pair(random.choice(pairs))
    
    #Running traning
    loss = training(input_var, output_var, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
#     print(loss)
    #keep track of loss
    plot_loss_total += loss
    print_loss_total += loss
    
    #keep plot
    if epoch % plot_every == 0:
        plot_losses.append(plot_loss_total/plot_every)
        plot_loss_total = 0
    
    #print states
    if epoch % print_every == 0:
        print_loss_ave = print_loss_total / print_every
        since = time.time()
        print("%s (%d %d%%) %.4f" % (since_time(since, epoch/n_epochs), epoch, epoch/n_epochs, print_loss_ave))
        
        
    

RuntimeError: cuda runtime error (59) : device-side assert triggered at /home/andy/tools/pytorch/torch/lib/THC/generic/THCTensorCopy.c:18