## Sequence to sequence learning from TensorFlow

### Model №2. Ngrams to Characters.

In [1]:
# ngram encoding and decoding

def word2ngrams(word, n=3):
    ngrams = [word[i:i+n] for i in range(len(word)-n+1)]
    return ngrams
    
def ngrams2word(ngrams, n=3):
    word = ''.join([ngram[0] for ngram in ngrams])
    word += ngrams[-1][-2] + ngrams[-1][-1]
    return word

In [5]:
import codecs

letters = u"́-_'abcdefghijklmnopqrstuvwxyzáóúíéṡḟæǽβęœī"

# cleaned dictionary 
with codecs.open('LemmaDict.txt', 'r', encoding='utf-8') as f:
    LemmaDict ={}
    for line in f:
        try:
            items = line.strip('\r\n').split('\t')
            LemmaDict[items[0]] = items[1]
        except IndexError:
            pass
                               
ngrams = []
for form, lemma in LemmaDict.items():
    ngrams += word2ngrams(form)
    ngrams += word2ngrams(lemma)
ngrams = set(ngrams)
print(len(ngrams))
print(len(letters))

# converting trigrams and letters to indices
index_to_letter = dict(enumerate(letters))
letter_to_index = dict((v, k) for k,v in index_to_letter.items())
index_to_ngram = dict(enumerate(ngrams))
ngram_to_index = dict((v, k) for k,v in index_to_ngram.items())


EncodedLemmaDict = {}
for form, lemma in LemmaDict.items():
    try:
        EncodedLemmaDict[lemma.lower()] = [ngram_to_index[ngram] for ngram in word2ngrams(form)]
    except IndexError:
        pass
                               
    
max_k = max([len(k) for k,v in EncodedLemmaDict.items()])
max_v = max([len(v) for k,v in EncodedLemmaDict.items()])
for k,v in EncodedLemmaDict.items():
    if len(k) == max_k or  len(v) == max_v:
        print(k)
        print(v)
        
print(max_k)
print(max_v)

8624
43
eschoitchennaighthe
[7940, 8375, 5519, 7324, 815, 7683, 1435, 2370, 4395, 6981, 3952, 3796, 7806, 6555, 5733, 8256, 5657]
dochenélaigidir
[1751, 7805, 7260, 619, 8324, 1675, 2628, 7569, 2370, 4395, 7514, 7412, 2991, 5957, 6960, 7806, 8553, 1745, 2743, 874, 3307, 2502]
triscatail-trénfher
[2195, 7555, 7100, 924, 7879, 2217, 1969, 3202, 8254, 4801, 1355, 7355, 6481, 813, 1594, 4780]
19
22


In [8]:
import numpy as np

pairs = np.random.permutation(list(EncodedLemmaDict.keys()))

input_ = np.zeros((len(pairs), 67)) 
labels_ = np.zeros((len(pairs), 23))
for i, k in enumerate(pairs):
    v = EncodedLemmaDict[k]
    k = k + "_" * (22 - len(k))
    v = v + [0] * (19 - len(v))
    for j, n in enumerate(v):
        input_[i][j] = n
    for j, letter in enumerate(k.split()):
        try:
            labels_[i][j] = letter_to_index[letter]
        except KeyError:
            pass
        
input_ = input_.astype(np.int32)
labels_ = labels_.astype(np.int32)

input_test   = input_[:5000]
input_val    = input_[5000:10000]
input_train  = input_[10000:]
labels_test  = labels_[:5000]
labels_val   = labels_[5000:10000]
labels_train = labels_[10000:]

data_test  = zip(input_test, labels_test)
data_val   = zip(input_val, labels_val)
data_train = zip(input_train, labels_train)

In [10]:
print(list(data_train)[9679])

(array([8120,  643, 1285, 4838, 3794, 2882,   64,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))


In [12]:
# building the model (analogous to grapheme-to-phoneme models)

import tensorflow as tf
import numpy as np
from tensorflow.python.framework import ops
from tensorflow.contrib.rnn import RNNCell, MultiRNNCell, DropoutWrapper, BasicLSTMCell
from tensorflow.contrib.legacy_seq2seq import embedding_rnn_seq2seq, sequence_loss
from tensorflow.contrib import seq2seq

In [7]:
ops.reset_default_graph()
try:
    sess.close()
except:
    
    pass
sess = tf.InteractiveSession()

In [8]:
input_seq_length = 22
output_seq_length = 19
batch_size = 128

input_vocab_size = 8624
output_vocab_size = 43
embedding_dim = 256

In [9]:
encode_input = [tf.placeholder(tf.int32, 
                                shape=(None,),
                                name = "ei_%i" %i)
                                for i in range(input_seq_length)]

labels = [tf.placeholder(tf.int32,
                                shape=(None,),
                                name = "l_%i" %i)
                                for i in range(output_seq_length)]

decode_input = [tf.zeros_like(encode_input[0], dtype=np.int32, name="GO")] + labels[:-1]

In [10]:
keep_prob = tf.placeholder("float")

cells = [DropoutWrapper(
        BasicLSTMCell(embedding_dim), output_keep_prob=keep_prob) for i in range(3)]

stacked_lstm = MultiRNNCell(cells)

with tf.variable_scope("decoders") as scope:
    decode_outputs, decode_state = embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, embedding_dim)
    
    scope.reuse_variables()
    
    decode_outputs_test, decode_state_test = embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, embedding_dim,
    feed_previous=True)

In [11]:
loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in labels]
loss = sequence_loss(decode_outputs, labels, loss_weights, output_vocab_size)
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(loss)

In [12]:
sess.run(tf.initialize_all_variables())

Instructions for updating:
Use `tf.global_variables_initializer` instead.


### Training model

In [13]:
class DataIterator:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.iter = self.make_random_iter()
        
    def next_batch(self):
        try:
            idxs = self.iter.next()
        except StopIteration:
            self.iter = self.make_random_iter()
            idxs = self.iter.next()
        X, Y = zip(*[self.data[i] for i in idxs])
        X = np.array(X).T
        Y = np.array(Y).T
        return X, Y

    def make_random_iter(self):
        splits = np.arange(self.batch_size, len(self.data), self.batch_size)
        it = np.split(np.random.permutation(range(len(self.data))), splits)[:-1]
        return iter(it)
    
train_iter = DataIterator(data_train, 128)
val_iter = DataIterator(data_val, 128)
test_iter = DataIterator(data_test, 128)

In [14]:
import sys

def get_feed(X, Y):
    feed_dict = {encode_input[t]: X[t] for t in range(input_seq_length)}
    feed_dict.update({labels[t]: Y[t] for t in range(output_seq_length)})
    return feed_dict

def train_batch(data_iter):
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 0.5
    _, out = sess.run([train_op, loss], feed_dict)
    return out

def get_eval_batch_data(data_iter):
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 1.
    all_output = sess.run([loss] + decode_outputs_test, feed_dict)
    eval_loss = all_output[0]
    decode_output = np.array(all_output[1:]).transpose([1,0,2])
    return eval_loss, decode_output, X, Y

def eval_batch(data_iter, num_batches):
    losses = []
    predict_loss = []
    for i in range(num_batches):
        eval_loss, output, X, Y = get_eval_batch_data(data_iter)
        losses.append(eval_loss)
        
        for index in range(len(output)):
            real = Y.T[index]
            predict = np.argmax(output, axis = 2)[index]
            predict_loss.append(all(real==predict))
    return np.mean(losses), np.mean(predict_loss)

In [None]:
from datetime import datetime

for i in range(100000):
    try:
        start = datetime.now()
        train_batch(train_iter)
        if i % 1000 == 0:
            print(i)
            val_loss, val_predict = eval_batch(val_iter, 16)
            train_loss, train_predict = eval_batch(train_iter, 16)
            print("val loss   : %f, val predict   = %.1f%%" %(val_loss, val_predict * 100))
            print("train loss : %f, train predict = %.1f%%" %(train_loss, train_predict * 100))
            print('time elapsed: %s'% (datetime.now() - start))
            print()
            sys.stdout.flush()
    except KeyboardInterrupt:
        print("interrupted by user")
        break

0
val loss   : 9.028131, val predict   = 0.0%

train loss : 9.028156, train predict = 0.0%
time elapsed: 0:08:58.397458

1000
val loss   : 1.846467, val predict   = 0.0%

train loss : 1.804592, train predict = 0.0%
time elapsed: 0:08:41.915604

2000
val loss   : 1.703288, val predict   = 0.0%

train loss : 1.646480, train predict = 0.0%
time elapsed: 0:08:51.067687

3000
val loss   : 1.598190, val predict   = 0.0%

train loss : 1.566288, train predict = 0.0%
time elapsed: 0:08:41.543225

4000
val loss   : 1.472756, val predict   = 2.1%

train loss : 1.460001, train predict = 1.5%
time elapsed: 0:08:36.432005

5000
val loss   : 1.394976, val predict   = 1.4%

train loss : 1.331762, train predict = 1.6%
time elapsed: 0:08:39.360730



### Evaluating model

In [41]:
eval_loss, output, X, Y = get_eval_batch_data(test_iter)
test_loss, test_predict = eval_batch(test_iter, 16)

In [None]:
print("form".ljust(40)),
print("real lemma".ljust(17)),
print("predicted lemma".ljust(17)),
print()"is correct"
print

for index in range(len(output)):
    ngrams = " ".join([index_to_ngram[p] for p in X.T[index]]) 
    real = [index_to_ngram[l] for l in Y.T[index]] 
    predict = [index_to_ngram[l] for l in np.argmax(output, axis = 2)[index]]
   
    print ngrams.split(" _")[0].ljust(40),
    print "".join(real).split("_")[0].ljust(17),
    print "".join(predict).split("_")[0].ljust(17),
    print str(real == predict)