In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import time
import collections

np.set_printoptions(precision=4, linewidth=200)

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from utils.reader import europarl_raw_data

In [3]:
from utils.nmt_graph import NMTModel

In [4]:
def show_dict_contents(d):
    """Print a one-line summary for every entry of a two-level dict.

    Each printed line has the form ``outer.inner: type=<type>`` followed by
    ``shape=...`` for NumPy arrays, or ``len=..., contents type=...`` for
    lists (the contents type is taken from the first element).

    Args:
        d: mapping whose values are themselves mappings (anything with
            ``.items()``).
    """
    for outer_key, inner in d.items():
        for inner_key, value in inner.items():
            shape_info = ''
            if isinstance(value, np.ndarray):
                shape_info = ' shape={0}'.format(value.shape)

            list_info = ''
            if isinstance(value, list):
                # An empty list has no first element to inspect; indexing it
                # would raise IndexError, so report a placeholder instead.
                contents_type = type(value[0]) if value else 'n/a'
                list_info = ' len={0}, contents type={1}'.format(
                    len(value),
                    contents_type,
                )

            print('{0: <30}: type={1: <30}{2}{3}'.format(
                '{0}.{1}'.format(outer_key, inner_key),
                str(type(value)),
                shape_info,
                list_info,
            ))

In [5]:
def unvectorize_sentence(sentence, idx2word):
    """Turn a sequence of token indices back into a space-separated string.

    Args:
        sentence: iterable of indices into ``idx2word``.
        idx2word: lookup from index to token (indexed with ``idx2word[i]``).

    Returns:
        The looked-up tokens joined by single spaces.
    """
    tokens = (idx2word[token_id] for token_id in sentence)
    return ' '.join(tokens)

In [6]:
def sample_group(data, group_num, num_samples=10):
    """Print randomly drawn source/target sentence pairs from one training group.

    Row indices are drawn with ``np.random.choice`` using its defaults, i.e.
    with replacement, so the same pair can be printed more than once.

    Args:
        data: dict providing ``data['train']['X']``, ``data['train']['y']``
            (both indexed by ``group_num``) and the ``data['vocab']`` lookups
            ``lang1_idx2word`` / ``lang2_idx2word``.
        group_num: index of the training group to sample from.
        num_samples: number of pairs to print.
    """
    # (training-set key, vocabulary key) for the source and target side.
    sides = (
        ('X', 'lang1_idx2word'),
        ('y', 'lang2_idx2word'),
    )
    source_rows = data['train']['X'][group_num]
    chosen_rows = np.random.choice(len(source_rows), size=num_samples)
    for row in chosen_rows:
        for split_key, vocab_key in sides:
            print(unvectorize_sentence(data['train'][split_key][group_num][row], data['vocab'][vocab_key]))
        # Blank line between consecutive pairs.
        print()

In [7]:
# Load the dataset and print a per-entry summary of what it contains.
data = europarl_raw_data()
show_dict_contents(data)

# List (source shape, target shape) for every training group, ordered by the
# width of the source array and skipping groups whose source width is <= 2.
paired_groups = zip(data['train']['X'], data['train']['y'])
groups_by_source_width = sorted(paired_groups, key=lambda pair: pair[0].shape[1])
print(
    [
        (source.shape, target.shape)
        for source, target in groups_by_source_width
        if source.shape[1] > 2
    ]
)

vocab.lang1_idx2word          : type=<class 'list'>                 len=93800, contents type=<class 'str'>
vocab.lang1_word2idx          : type=<class 'dict'>                
vocab.lang2_idx2word          : type=<class 'list'>                 len=44248, contents type=<class 'str'>
vocab.lang2_word2idx          : type=<class 'dict'>                
train.X                       : type=<class 'list'>                 len=4, contents type=<class 'numpy.ndarray'>
train.y                       : type=<class 'list'>                 len=4, contents type=<class 'numpy.ndarray'>
val.X                         : type=<class 'numpy.ndarray'>        shape=(160000, 604)
val.y                         : type=<class 'list'>                 len=160000, contents type=<class 'str'>
test.X                        : type=<class 'numpy.ndarray'>        shape=(160209, 640)
test.y                        : type=<class 'list'>                 len=160209, contents type=<class 'str'>
[((63624, 8), (63624, 32)), ((29

In [8]:
# Print 5 randomly chosen source/target sentence pairs from training group 0.
sample_group(data, 0, num_samples=5)

<bos> <eos> <pad> <pad> <pad> <pad> <pad> <pad>
<bos> It is an ambitious project for 2010 . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<bos> ( Beifall ) <eos> <pad> <pad> <pad>
<bos> ( Applause ) <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<bos> Vielen Dank . <eos> <pad> <pad> <pad>
<bos> Thank you very much . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<bos> Die Niederlande waren stets proeuropäisch . <eos>
<bos> The Netherlands has always been a pro-European country . <eos> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

<bos> Daran besteht kein Zweifel . <eos> <pad>
<bos> Very 

In [9]:
# Shape of the source array of training group 0.
data['train']['X'][0].shape

(63624, 8)

In [25]:
# Start from an empty default graph so that re-running this cell rebuilds the
# model instead of adding to whatever a previous run left behind.
tf.reset_default_graph()

# Batch size the graphs are built with. The training cell feeds 16-row slices,
# so the two must stay in sync.
GRAPH_BATCH_SIZE = 16

# NOTE(review): 128 and 512 are presumably the embedding and hidden sizes —
# confirm against NMTModel's signature and give them names.
model = NMTModel(data['vocab']['lang1_idx2word'], data['vocab']['lang2_idx2word'], 128, 512)
all_graphs = model.make_all_graphs(GRAPH_BATCH_SIZE, data['train']['X'], data['train']['y'])

# The eval graph takes source rows two columns narrower than group 0's source
# array, matching the [:, 1:-1] slice applied to the inputs in the training cell.
# NOTE(review): 32 presumably is the target length of group 0 — confirm.
eval_graph = model.make_eval_graph(
    GRAPH_BATCH_SIZE,
    data['train']['X'][0].shape[1] - 2,
    32,
    data['vocab']['lang2_word2idx']['<bos>'],
)

# Dump the graph definition for TensorBoard. close() flushes pending events
# and releases the file handle, which a bare flush() left open on every run.
writer = tf.summary.FileWriter(logdir='logs', graph=tf.get_default_graph())
writer.close()

In [26]:
# Summarise the 'inputs_and_outputs' dict of the first entry in all_graphs.
show_dict_contents(all_graphs[0]['inputs_and_outputs'])

placeholders.inputs           : type=<class 'tensorflow.python.framework.ops.Tensor'>
placeholders.targets          : type=<class 'tensorflow.python.framework.ops.Tensor'>
placeholders.learning_rate    : type=<class 'tensorflow.python.framework.ops.Tensor'>
placeholders.max_norm         : type=<class 'tensorflow.python.framework.ops.Tensor'>
outputs.loss                  : type=<class 'tensorflow.python.framework.ops.Tensor'>
outputs.num_correct_predictions: type=<class 'tensorflow.python.framework.ops.Tensor'>
train_ops.train_op            : type=<class 'tensorflow.python.framework.ops.Operation'>
train_ops.gradient_global_norm: type=<class 'tensorflow.python.framework.ops.Tensor'>


In [None]:
INITIAL_LR = 5e0
MAX_NORM = 1
# Rows in the batch that is fed on every step.
# NOTE(review): presumably has to equal the batch size the graphs were built
# with (16 in the graph-building cell) — confirm.
FIXED_BATCH_ROWS = 16
NUM_ROUNDS = 50       # outer iterations; one evaluation + printout per round
STEPS_PER_ROUND = 10  # training steps run between two evaluations

with tf.Session() as sess:
    run_id = time.time()
    writer = tf.summary.FileWriter('logs/{0}'.format(run_id), sess.graph)
    coord = tf.train.Coordinator()
    threads = []
    try:
        sess.run(tf.global_variables_initializer())
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        learning_rate = INITIAL_LR
        max_norm = MAX_NORM
        graph_io = all_graphs[0]['inputs_and_outputs']
        train_placeholders = graph_io['placeholders']
        # Everything fetched per training step: the outputs plus the train ops.
        training_outputs = {
            **graph_io['outputs'],
            **graph_io['train_ops'],
        }

        # The same first FIXED_BATCH_ROWS rows of group 0 are used on every
        # step; the batch never advances.
        # NOTE(review): looks like an overfit-one-batch sanity check — confirm.
        start_idx = 0 * FIXED_BATCH_ROWS
        end_idx = (0 + 1) * FIXED_BATCH_ROWS
        # Source rows are fed without their first and last column.
        inputs = data['train']['X'][0][start_idx:end_idx, 1:-1]
        targets = data['train']['y'][0][start_idx:end_idx]

        for i in range(NUM_ROUNDS):
            # The learning rate shrinks with the round index:
            # lr * 10 / (10 + sqrt(i)).
            train_feed = {
                train_placeholders['inputs']: inputs,
                train_placeholders['targets']: targets,
                train_placeholders['learning_rate']: learning_rate * (10.0 / (10.0 + np.sqrt(i))),
                train_placeholders['max_norm']: max_norm,
            }
            for _ in range(STEPS_PER_ROUND):
                outputs = sess.run(training_outputs, feed_dict=train_feed)

            eval_outputs = sess.run(
                eval_graph['outputs']['outputs'],
                feed_dict={
                    eval_graph['placeholders']['inputs']: inputs,
                }
            )
            print('-' * 40)
            print('i = {0}'.format(i))
            print(outputs)
            # For each row of the batch: source, reference target, model output.
            for offset, sample in enumerate(range(start_idx, end_idx)):
                print(unvectorize_sentence(data['train']['X'][0][sample], data['vocab']['lang1_idx2word']))
                print(unvectorize_sentence(data['train']['y'][0][sample], data['vocab']['lang2_idx2word']))
                print(unvectorize_sentence(eval_outputs[offset], data['vocab']['lang2_idx2word']))
                print()
            print('-' * 40)
    finally:
        # Bookkeeping: runs even if training raised, so the queue-runner
        # threads are stopped and the event file is closed either way.
        try:
            coord.request_stop()
            coord.join(threads)
        finally:
            writer.close()

print(outputs)

In [14]:
# NOTE(review): scratch arithmetic left in the notebook; presumably the batch
# size (16) times a sequence length — confirm the intent or delete this cell.
16 * 31

496

In [None]:
# NOTE(review): this cell reads names that are not defined anywhere in the
# visible notebook (generate_epoch, X_train, X_val, X_test, BATCH_SIZE,
# NUM_STEPS, placeholders, summary_nodes), so it fails with NameError on a
# fresh kernel. INITIAL_LR, MAX_NORM and training_outputs come from the
# previous training cell. Define the missing names or delete the cell.
with tf.Session() as sess:
    run_id = time.time()
    writer = tf.summary.FileWriter('logs/{0}'.format(run_id), sess.graph)
    coord = tf.train.Coordinator()
    threads = []
    try:
        sess.run(tf.global_variables_initializer())
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        learning_rate = INITIAL_LR
        max_norm = MAX_NORM
        for i in range(1):  # TODO: range(NUM_EPOCHS_TOTAL)
            # TODO: learning-rate decay, disabled for now:
            #     if i >= NUM_EPOCHS_INIT_LR:
            #         learning_rate *= LR_DECAY_RATE
            for batch_idx, (inputs, targets) in enumerate(generate_epoch(X_train, BATCH_SIZE, NUM_STEPS)):
                outputs = sess.run(
                    training_outputs,
                    feed_dict={
                        placeholders['inputs']: inputs,
                        placeholders['targets']: targets,
                        placeholders['learning_rate']: learning_rate,
                        placeholders['max_norm']: max_norm,
                    }
                )
                # Report progress once every 64 batches.
                if batch_idx % 64 == 63:
                    print('step: {0}    loss: {1}    gradient norm: {2}     correct words: {3}'.format(
                        batch_idx + 1,
                        outputs['loss'],
                        outputs['gradient_global_norm'],
                        outputs['num_correct_predictions'],
                    ))

            # Perplexity = exp(mean per-batch loss), reported for the
            # validation split and then the test split.
            for split_name, split_data in (('validation', X_val), ('test', X_test)):
                total_loss, total_batches = 0, 0
                for inputs, targets in generate_epoch(split_data, BATCH_SIZE, NUM_STEPS):
                    outputs = sess.run(
                        summary_nodes,
                        feed_dict={
                            placeholders['inputs']: inputs,
                            placeholders['targets']: targets
                        },
                    )
                    total_loss += outputs['loss']
                    total_batches += 1
                print('{0} perplexity:'.format(split_name), np.exp(total_loss / total_batches))
    finally:
        # Bookkeeping: runs even if the epoch raised, so the queue-runner
        # threads are stopped and the event file is closed either way.
        try:
            coord.request_stop()
            coord.join(threads)
        finally:
            writer.close()