## Sequence to sequence learning from TensorFlow

In [1]:
# ngram encoding and decoding

def word2ngrams(word, n=3):
    ngrams = [word[i:i+n] for i in range(len(word)-n+1)]
    return ngrams
    
def ngrams2word(ngrams, n=3):
    word = ''.join([ngram[0] for ngram in ngrams])
    try:
        word += ngrams[-1][-2] + ngrams[-1][-1]
    except IndexError:
        pass
    return word

In [3]:
# converting trigrams and letters to indices
import codecs

# letters = u"́ -_'abcdefghijklmnopqrstuvwxyzáóúíéṡḟōäïāūæēṅǽüöβīḯṁëăęɔĭĕœ"
letters = u"́-_'abcdefghijklmnopqrstuvwxyzáóúíéṡḟæǽβęœī"

# cleaned dictionary 
with codecs.open('LemmaDict.txt', 'r', encoding='utf-8') as f:
    LemmaDict ={}
    for line in f:
        try:
            items = line.strip('\r\n').split('\t')
            LemmaDict[items[0]] = items[1]
        except IndexError:
            pass

index_to_letter = dict(enumerate(letters))
letter_to_index = dict((v, k) for k,v in index_to_letter.items())


EncodedLemmaDict = {}
for form, lemma in LemmaDict.items():
    try:
        EncodedLemmaDict[lemma.lower()] = [letter_to_index[letter] for letter in form]
    except IndexError:
        pass
    
max_k = max([len(k) for k,v in EncodedLemmaDict.items()])
max_v = max([len(v) for k,v in EncodedLemmaDict.items()])
for k,v in EncodedLemmaDict.items():
    if len(k) == max_k or  len(v) == max_v:
        print(k)
        print(v)
        
print max_k
print max_v
print len(letters)

dochenélaigidir
[4, 21, 21, 18, 17, 7, 18, 12, 6, 11, 8, 17, 8, 1, 15, 4, 12, 10, 22, 12, 24, 21, 22, 4]
eschoitchennaighthe
[8, 22, 6, 11, 18, 12, 23, 6, 11, 8, 17, 17, 4, 12, 10, 11, 23, 11, 8]
triscatail-trénfher
[23, 21, 12, 22, 6, 4, 23, 4, 12, 15, 23, 21, 34, 17, 9, 11, 8, 21]
19
24
43


In [4]:
import numpy as np

pairs = np.random.permutation(list(EncodedLemmaDict.keys()))

input_ = np.zeros((len(pairs), 25))
labels_ = np.zeros((len(pairs), 19))

for i, k in enumerate(pairs):
    v = EncodedLemmaDict[k]
    k = k + "_" * (19 - len(k))
    v = v + [0] * (25 - len(v))
    for j, n in enumerate(v):
        input_[i][j] = n
    for j, letter in enumerate(k):
        try:
            labels_[i][j] = letter_to_index[letter]
        except IndexError:
            print k
        
input_ = input_.astype(np.int32)
labels_ = labels_.astype(np.int32)

input_test   = input_[:5000]
input_val    = input_[5000:10000]
input_train  = input_[10000:]
labels_test  = labels_[:5000]
labels_val   = labels_[5000:10000]
labels_train = labels_[10000:]

data_test  = zip(input_test, labels_test)
data_val   = zip(input_val, labels_val)
data_train = zip(input_train, labels_train)

In [5]:
import tensorflow as tf
import numpy as np
from tensorflow.python.framework import ops
from tensorflow.contrib.rnn import RNNCell, MultiRNNCell, DropoutWrapper, BasicLSTMCell
from tensorflow.contrib.legacy_seq2seq import embedding_rnn_seq2seq, sequence_loss
from tensorflow.contrib import seq2seq


tf.__version__

'1.0.0'

In [6]:
ops.reset_default_graph()
try:
    sess.close()
except:
    
    pass
sess = tf.InteractiveSession()

In [7]:
input_seq_length = 25
output_seq_length = 19
batch_size = 128

input_vocab_size = 9686
output_vocab_size = 43
embedding_dim = 256

In [8]:
encode_input = [tf.placeholder(tf.int32, 
                                shape=(None,),
                                name = "ei_%i" %i)
                                for i in range(input_seq_length)]

labels = [tf.placeholder(tf.int32,
                                shape=(None,),
                                name = "l_%i" %i)
                                for i in range(output_seq_length)]

decode_input = [tf.zeros_like(encode_input[0], dtype=np.int32, name="GO")] + labels[:-1]

In [9]:
keep_prob = tf.placeholder("float")

cells = [DropoutWrapper(
        BasicLSTMCell(embedding_dim), output_keep_prob=keep_prob) for i in range(3)]

stacked_lstm = MultiRNNCell(cells)

with tf.variable_scope("decoders") as scope:
    decode_outputs, decode_state = embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, embedding_dim)
    
    scope.reuse_variables()
    
    decode_outputs_test, decode_state_test = embedding_rnn_seq2seq(
        encode_input, decode_input, stacked_lstm, input_vocab_size, output_vocab_size, embedding_dim,
    feed_previous=True)


In [10]:
loss_weights = [tf.ones_like(l, dtype=tf.float32) for l in labels]
loss = sequence_loss(decode_outputs, labels, loss_weights, output_vocab_size)
optimizer = tf.train.AdamOptimizer(1e-4)
train_op = optimizer.minimize(loss)

In [11]:
sess.run(tf.initialize_all_variables())

Instructions for updating:
Use `tf.global_variables_initializer` instead.


### Training model

In [12]:
print data_train[58]

(array([ 6, 18, 16, 23, 11, 30, 12, 17, 22, 12, 10, 12,  7,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0], dtype=int32), array([ 6, 18, 16, 23, 11, 30, 12, 17, 22, 12, 10, 12,  7,  2,  2,  2,  2,
        2,  2], dtype=int32))


In [13]:
class DataIterator:
    def __init__(self, data, batch_size):
        self.data = data
        self.batch_size = batch_size
        self.iter = self.make_random_iter()
        
    def next_batch(self):
        try:
            idxs = self.iter.next()
        except StopIteration:
            self.iter = self.make_random_iter()
            idxs = self.iter.next()
        X, Y = zip(*[self.data[i] for i in idxs])
        X = np.array(X).T
        Y = np.array(Y).T
        return X, Y

    def make_random_iter(self):
        splits = np.arange(self.batch_size, len(self.data), self.batch_size)
        it = np.split(np.random.permutation(range(len(self.data))), splits)[:-1]
        return iter(it)
    
train_iter = DataIterator(data_train, 128)
val_iter = DataIterator(data_val, 128)
test_iter = DataIterator(data_test, 128)

In [14]:
import sys

def get_feed(X, Y):
    feed_dict = {encode_input[t]: X[t] for t in range(input_seq_length)}
    feed_dict.update({labels[t]: Y[t] for t in range(output_seq_length)})
    return feed_dict

def train_batch(data_iter):
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 0.5
    _, out = sess.run([train_op, loss], feed_dict)
    return out

def get_eval_batch_data(data_iter):
    X, Y = data_iter.next_batch()
    feed_dict = get_feed(X, Y)
    feed_dict[keep_prob] = 1.
    all_output = sess.run([loss] + decode_outputs_test, feed_dict)
    eval_loss = all_output[0]
    decode_output = np.array(all_output[1:]).transpose([1,0,2])
    return eval_loss, decode_output, X, Y

def eval_batch(data_iter, num_batches):
    losses = []
    predict_loss = []
    for i in range(num_batches):
        eval_loss, output, X, Y = get_eval_batch_data(data_iter)
        losses.append(eval_loss)
        
        for index in range(len(output)):
            real = Y.T[index]
            predict = np.argmax(output, axis = 2)[index]
            predict_loss.append(all(real==predict))
    return np.mean(losses), np.mean(predict_loss)

In [15]:
from datetime import datetime

for i in range(100000):
    try:
        start = datetime.now()
        train_batch(train_iter)
        if i % 1000 == 0:
            val_loss, val_predict = eval_batch(val_iter, 16)
            train_loss, train_predict = eval_batch(train_iter, 16)
            print "val loss   : %f, val predict   = %.1f%%" %(val_loss, val_predict * 100)
            print "train loss : %f, train predict = %.1f%%" %(train_loss, train_predict * 100)
            print
            print 'time elapsed: %s'% (datetime.now() - start)
            print 
            sys.stdout.flush()
    except KeyboardInterrupt:
        print "interrupted by user"
        break

val loss   : 3.695979, val predict   = 0.0%
train loss : 3.695721, train predict = 0.0%

time elapsed: 0:00:33.940537

val loss   : 0.727289, val predict   = 0.6%
train loss : 0.761805, train predict = 0.6%

time elapsed: 0:00:31.475351

val loss   : 0.423291, val predict   = 16.5%
train loss : 0.423748, train predict = 18.0%

time elapsed: 0:00:30.732873

val loss   : 0.269698, val predict   = 40.3%
train loss : 0.280837, train predict = 38.8%

time elapsed: 0:00:30.683988

val loss   : 0.222835, val predict   = 56.0%
train loss : 0.215502, train predict = 56.0%

time elapsed: 0:00:30.744834

val loss   : 0.195200, val predict   = 61.8%
train loss : 0.189281, train predict = 62.6%

time elapsed: 0:00:30.834877

val loss   : 0.189888, val predict   = 65.3%
train loss : 0.154252, train predict = 68.3%

time elapsed: 0:00:30.887578

val loss   : 0.172446, val predict   = 67.3%
train loss : 0.138152, train predict = 71.0%

time elapsed: 0:00:30.950986

val loss   : 0.172786, val predict  

In [29]:
eval_loss, output, X, Y = get_eval_batch_data(test_iter)
test_loss, test_predict = eval_batch(test_iter, 16)

In [30]:
print "form".ljust(70),
print "real lemma".ljust(25),
print "predicted lemma".ljust(25),
print "is correct"
print

for index in range(len(output)):
    ngrams = " ".join([index_to_letter[p] for p in X.T[index]]) 
    real = [index_to_letter[l] for l in Y.T[index]] 
    predict = [index_to_letter[l] for l in np.argmax(output, axis = 2)[index]]
   
    print ngrams.split(" _")[0].ljust(70),
    print "".join(real).split("_")[0].ljust(25),
    print "".join(predict).split("_")[0].ljust(25),
    print str(real == predict)
    
print
print test_loss
print test_predict


form                                                                   real lemma                predicted lemma           is correct

b e i l l i t u s ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      belletus                  beillitus                 False
u i s c e m l a c h t ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      uiscemlacht               uiscemlacht               True
c o m c a i s i u ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      comcaisiu                 comcaisiu                 True
a s a í t e c h ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      asaítech                  asaítech                  True
l á n a m a n d a ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      lánamanda                 lánamanda                 True
a b n a i r e ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      abnaire                   anbaire                   False
d e c l a m ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́ ́                      declam                  

In [31]:
saver = tf.train.Saver()
saver.save(sess, 'char2char',global_step=1000)

'char2char-1000'

In [None]:
predictions = sess.run(feed_dict={X: image})
print(predictions)