In [1]:
import tensorflow as tf
import numpy as np
import time

Configuration

In [2]:
# Training
dropout       = 0.5
batch_size    = 20

# Embedding
embedding_dim = 200

# RNN
hidden_dim    = 200
num_layers    = 2

Dataset

In [3]:
from english_data_provider import *

In [4]:
eigo = EnglishDataProvider()

In [5]:
my_x, my_y = eigo.get_word_pairs('train')

Vocabulary

In [6]:
vocab_size = len(eigo.get_vocabulary())
print 'Vocabulary size:', vocab_size

Vocabulary size: 10000


In [7]:
train_x, train_y = eigo.get_word_pairs('train')
valid_x, valid_y = eigo.get_word_pairs('valid')
test_x , test_y  = eigo.get_word_pairs('test')

In [8]:
input_seq_length = len(train_x[0])
print' Sequence length:', input_seq_length

 Sequence length: 35


Placeholders

In [9]:
input_ = tf.placeholder(tf.int32, shape=[batch_size, input_seq_length], name="input")

In [10]:
targets = tf.placeholder(tf.int64, [batch_size, input_seq_length], name='targets')

In [11]:
keep_prob = tf.placeholder(tf.float32)

Embeddings

In [12]:
embeddings = tf.get_variable('word_embedding', [vocab_size, embedding_dim])
input_embedded = tf.nn.embedding_lookup(embeddings, input_)

LSTM

In [13]:
if num_layers == 1:
    cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_dim, state_is_tuple=True, forget_bias=0.0)
else:
    cell = tf.nn.rnn_cell.MultiRNNCell([
                tf.nn.rnn_cell.BasicLSTMCell(hidden_dim, state_is_tuple=True, forget_bias=0.0)
                for _ in range(num_layers)
            ])

In [14]:
initial_state = cell.zero_state(batch_size, dtype=tf.float32)

In [15]:
rnn_inputs = tf.reshape(input_embedded, [batch_size, input_seq_length, -1])
rnn_inputs = [tf.squeeze(x, [1]) for x in tf.split(rnn_inputs, input_seq_length, 1)]

In [16]:
outputs, final_rnn_state = tf.contrib.rnn.static_rnn(cell, rnn_inputs, initial_state)

Output Projection

In [17]:
output_W = tf.get_variable('outW', shape=[hidden_dim, vocab_size])
output_b = tf.get_variable('outB', shape=[vocab_size])

In [18]:
logits = []

for idx, output in enumerate(outputs):
    logits += [ tf.matmul(output, output_W) + output_b ]

Loss

In [19]:
tf.squeeze(tf.split(targets, input_seq_length, 1)[0], [1])

<tf.Tensor 'Squeeze_35:0' shape=(20,) dtype=int64>

In [20]:
labels = [tf.squeeze(x, [1]) for x in tf.split(targets, input_seq_length, 1)]
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels))

In [21]:
''' Builds training graph. '''
global_step = tf.Variable(0, name='global_step', trainable=False)
lr = 1.0
max_grad_norm = 5.0

with tf.variable_scope('SGD_Training'):
    # SGD learning parameter
    learning_rate = tf.Variable(lr, trainable=False, name='learning_rate')

    # collect all trainable variables
    tvars = tf.trainable_variables()
    grads, global_norm = tf.clip_by_global_norm(tf.gradients(loss * input_seq_length, tvars), max_grad_norm)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

In [22]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
session = tf.Session(config=config)
session.run(tf.assign(learning_rate, lr))

1.0

In [23]:
def model_size():

    params = tf.trainable_variables()
    size = 0
    for x in params:
        sz = 1
        for dim in x.get_shape():
            sz *= dim.value
        size += sz
    return size

In [26]:
clear_char_embedding_padding = tf.scatter_update(embeddings, [0], tf.constant(0.0, shape=[1, embedding_dim]))

In [27]:
session.run(tf.global_variables_initializer())
session.run(clear_char_embedding_padding)
print('Created and initialized fresh model. Size:', model_size())

('Created and initialized fresh model. Size:', 4651600)


In [28]:
''' training starts here '''
best_valid_loss = None
rnn_state = session.run(initial_state)

In [29]:
saver = tf.train.Saver(max_to_keep=50)

In [None]:
for epoch in range(25):
    epoch_start_time = time.time()
    avg_train_loss = 0.0
    count = 0
    
    for i in range(0, len(train_x) - batch_size, batch_size):
        count += 1
        start_time = time.time()

        t_loss, _, rnn_state, gradient_norm, step, _ = session.run([
            loss,
            train_op,
            final_rnn_state,
            global_norm,
            global_step,
            clear_char_embedding_padding
        ], {
            input_ : train_x[i:i+batch_size],
            targets: train_y[i:i+batch_size],
            initial_state: rnn_state,
            keep_prob: 1.0 - dropout
        })

        avg_train_loss += 0.05 * (t_loss - avg_train_loss)

        time_elapsed = time.time() - start_time

        if count % 500 == 0:
            print('%6d: %d [%5d/%5d], train_loss/perplexity = %6.8f/%6.7f secs/batch = %.4fs, grad.norm=%6.8f' % (step,
                                                    epoch, count,
                                                    len(train_x)/batch_size,
                                                    t_loss, np.exp(t_loss),
                                                    time_elapsed,
                                                    gradient_norm))

    print('Epoch training time:', time.time()-epoch_start_time)
    
    # epoch done: time to evaluate
    avg_valid_loss = 0.0
    count = 0
    rnn_state = session.run(initial_state)
    for i in range(0, len(valid_x) - batch_size, batch_size):
        count += 1
        start_time = time.time()

        t_loss, rnn_state = session.run([
            loss,
            final_rnn_state
        ], {
            input_ : valid_x[i:i+batch_size],
            targets: valid_y[i:i+batch_size],
            initial_state: rnn_state,
            keep_prob: 1.0
        })
        
        avg_valid_loss += t_loss / (len(valid_x)/batch_size)

    print("at the end of epoch:", epoch)
    print("train loss = %6.8f, perplexity = %6.8f" % (avg_train_loss, np.exp(avg_train_loss)))
    print("validation loss = %6.8f, perplexity = %6.8f" % (avg_valid_loss, np.exp(avg_valid_loss)))

    save_as = '%s/w_epoch%03d_%.4f.model' % ('cv', epoch, avg_valid_loss)
    #saver.save(session, save_as)
    print('Saved model', save_as)

    # learning rate update
    if best_valid_loss is not None and np.exp(avg_valid_loss) > np.exp(best_valid_loss) - 1.0:
        current_learning_rate = session.run(learning_rate)
        current_learning_rate *= 0.5
        
        if current_learning_rate < 1.e-5:
            break
        
        session.run(learning_rate.assign(current_learning_rate))
        print('new learning rate is:', current_learning_rate)
    else:
        best_valid_loss = avg_valid_loss

   500: 0 [  500/ 1327], train_loss/perplexity = 5.74982834/314.1367188 secs/batch = 0.0505s, grad.norm=5.22597599
  1000: 0 [ 1000/ 1327], train_loss/perplexity = 5.56640863/261.4932861 secs/batch = 0.0540s, grad.norm=5.82978010
('Epoch training time:', 70.69475293159485)
('at the end of epoch:', 0)
train loss = 5.43245171, perplexity = 228.70928667
validation loss = 5.43693614, perplexity = 229.73722087
('Saved model', 'cv/w_epoch000_5.4369.model')
  1827: 1 [  500/ 1327], train_loss/perplexity = 4.89489269/133.6056671 secs/batch = 0.0542s, grad.norm=6.75933123
  2327: 1 [ 1000/ 1327], train_loss/perplexity = 5.13991594/170.7014160 secs/batch = 0.0547s, grad.norm=6.55329752
('Epoch training time:', 70.20807313919067)
('at the end of epoch:', 1)
train loss = 5.03276399, perplexity = 153.35630210
validation loss = 5.15711841, perplexity = 173.66330807
('Saved model', 'cv/w_epoch001_5.1571.model')
  3154: 2 [  500/ 1327], train_loss/perplexity = 4.60654688/100.1377640 secs/batch = 0.054

In [None]:
rnn_state = session.run(initial_rnn_state)

count = 0
avg_loss = 0
start_time = time.time()
for x, y in test_reader.iter():
    count += 1
    t_loss, rnn_state = session.run([
        loss,
        final_rnn_state
    ], {
        input_ : x,
        targets: y,
        initial_rnn_state: rnn_state,
        keep_prob: 1.0
    })

    avg_loss += t_loss

avg_loss /= count

print("test loss = %6.8f, perplexity = %6.8f" % (avg_loss, np.exp(avg_loss)))
print("test samples:", count*batch_size)

In [None]:
session.close()