In [1]:
import tensorflow as tf
import numpy as np
import tsp_env

In [2]:
def attention(W_ref, W_q, v, enc_outputs, query):
    """Return an attention-weighted summary of the encoder outputs.

    Each time step of `enc_outputs` gets a scalar score
    ``v . tanh(W_ref @ enc_output_t + W_q @ query)``; the scores are turned
    into weights by a softmax over the time axis and used to average
    `enc_outputs` over time.

    Args:
        W_ref: matrix contracted with the last axis of `enc_outputs`.
        W_q: matrix contracted with the last axis of `query`.
        v: vector contracted with the tanh-activated sum (one score per
            batch element and time step).
        enc_outputs: rank-3 tensor, indexed (batch, time, features) by the
            einsum subscripts.
        query: rank-2 tensor, indexed (batch, features).

    Returns:
        Rank-2 tensor indexed (batch, features): the weighted sum of
        `enc_outputs` over the time axis.
    """
    with tf.variable_scope("attention_mask"):
        projected_refs = tf.einsum('kl,itl->itk', W_ref, enc_outputs)
        projected_query = tf.einsum('kl,il->ik', W_q, query)
        # Add a time axis so the query term broadcasts over every time step.
        broadcast_query = tf.expand_dims(projected_query, 1)
        activations = tf.tanh(projected_refs + broadcast_query)
        scores = tf.einsum('k,itk->it', v, activations)
        # Softmax acts on the last axis of `scores`, i.e. across time steps.
        weights = tf.nn.softmax(scores)
        return tf.einsum('itk,it->ik', enc_outputs, weights)

In [3]:
def critic_network(enc_inputs,
                   hidden_size = 128, embedding_size = 128,
                   max_time_steps = 5, input_size = 2,
                   batch_size = 128,
                   initialization_stddev = 0.1,
                   n_processing_steps = 5, d = 128):
    """Build the critic graph mapping a batch of input sequences to one scalar each.

    Pipeline: linear embedding -> LSTM encoder -> `n_processing_steps`
    iterations of an LSTM "process block" whose next input is an attention
    glimpse over the encoder outputs -> dense ReLU layer -> linear layer
    with a single output unit.

    Args:
        enc_inputs: rank-3 tensor indexed (batch, time, input_size).
        hidden_size: number of units of the encoder and process LSTM cells.
        embedding_size: width of the input embedding; also used as the
            inner dimension of the attention scores.
        max_time_steps: unused by this function; kept so existing callers
            keep working.
        input_size: size of the last axis of `enc_inputs`.
        batch_size: number of sequences per batch; used to tile the learned
            first input of the process block.
        initialization_stddev: standard deviation of the normal initialiser
            of the embedding and attention variables.
        n_processing_steps: number of process-block iterations (>= 1).
        d: number of units in the hidden dense layer of the decoder.

    Returns:
        Rank-1 tensor with one predicted value per batch element.

    Raises:
        ValueError: if `n_processing_steps` is smaller than 1.
    """
    if n_processing_steps < 1:
        # Without at least one step there is no process-cell output to decode.
        raise ValueError("n_processing_steps must be at least 1, got %r"
                         % (n_processing_steps,))

    # Embed inputs in larger dimensional tensors
    W_embed = tf.Variable(tf.random_normal([embedding_size, input_size],
                                           stddev=initialization_stddev))
    embedded_inputs = tf.einsum('kl,itl->itk', W_embed, enc_inputs)

    # Define encoder
    with tf.variable_scope("encoder"):
        enc_rnn_cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
        enc_outputs, enc_final_state = tf.nn.dynamic_rnn(cell=enc_rnn_cell,
                                                         inputs=embedded_inputs,
                                                         dtype=tf.float32)
    # Define process block
    with tf.variable_scope("process_block"):
        process_cell = tf.nn.rnn_cell.LSTMCell(hidden_size)
        # The learned first input must be `hidden_size` wide: every later
        # input is an attention glimpse, i.e. a weighted sum of encoder
        # outputs, and the cell needs the same input width on every call.
        first_process_block_input = tf.tile(tf.Variable(tf.random_normal([1, hidden_size]),
                                                        name='first_process_block_input'),
                                            [batch_size, 1])
        # Define attention weights. W_ref and W_q are contracted with
        # encoder outputs / process-cell outputs, so their second dimension
        # is `hidden_size`; `embedding_size` is the width of the score space.
        with tf.variable_scope("attention_weights"):
            W_ref = tf.Variable(tf.random_normal([embedding_size, hidden_size],
                                                 stddev=initialization_stddev),
                                name='W_ref')
            W_q = tf.Variable(tf.random_normal([embedding_size, hidden_size],
                                               stddev=initialization_stddev),
                              name='W_q')
            v = tf.Variable(tf.random_normal([embedding_size], stddev=initialization_stddev),
                            name='v')

        # Processing chain: start from the encoder's final state and feed
        # each step's attention glimpse back in as the next input.
        processing_state = enc_final_state
        processing_input = first_process_block_input
        for t in range(n_processing_steps):
            processing_cell_output, processing_state = process_cell(inputs=processing_input,
                                                                   state=processing_state)
            processing_input = attention(W_ref, W_q, v,
                                         enc_outputs=enc_outputs, query=processing_cell_output)

    # Decode the processed state: one ReLU layer followed by a linear layer
    # with a single output unit.
    decoder_hidden = tf.layers.dense(inputs=processing_cell_output,
                                     units=d, activation=tf.nn.relu)
    value = tf.layers.dense(inputs=decoder_hidden, units=1, activation=None)
    # Drop only the trailing unit axis so a batch of size 1 keeps its batch axis.
    return tf.squeeze(value, axis=1)

In [4]:
# Problem dimensions. Every node below is sized from these three names, so
# changing one of them here is enough to resize the whole graph.
batch_size = 128
max_time_steps = 5
input_size = 2

# Input batch, indexed (batch, time step, input feature).
enc_inputs = tf.placeholder(tf.float32, [batch_size, max_time_steps, input_size])
# Critic prediction: one value per batch element.
bsln_value = critic_network(enc_inputs,
                            hidden_size=128, embedding_size=128,
                            max_time_steps=max_time_steps, input_size=input_size,
                            batch_size=batch_size,
                            initialization_stddev=0.1,
                            n_processing_steps=5, d=128)
# Regression targets fed at training time, one per batch element.
tours_rewards_ph = tf.placeholder(tf.float32, [batch_size])
loss = tf.losses.mean_squared_error(labels=tours_rewards_ph,
                                    predictions=bsln_value)
train_op = tf.train.AdamOptimizer(1e-2).minimize(loss)

In [5]:
##############################################################################
# Trying it out: can we learn the reward of the optimal policy for the TSP5? #
##############################################################################
def generate_batch(n_cities, batch_size):
    """Generate one batch of network inputs and regression labels from `tsp_env`.

    For each sample the environment is reset once, the returned state is
    reshaped to ``(4, n_cities)`` and its first two rows, transposed, are
    used as the input; the label is the first element of
    ``env.optimal_solution()``.

    Args:
        n_cities: number of cities passed to `tsp_env.TSP_env`.
        batch_size: number of samples to generate.

    Returns:
        Tuple ``(inputs, labels)`` of numpy arrays. `inputs` stacks
        `batch_size` arrays of shape ``(n_cities, 2)``; `labels` stacks the
        `batch_size` label values.
    """
    inputs_list = []
    labels_list = []
    env = tsp_env.TSP_env(n_cities, use_alternative_state=True)
    for _ in range(batch_size):
        # Reset once per sample; a second reset whose result is thrown away
        # only costs time without changing what is collected.
        s = env.reset()
        # NOTE(review): presumably rows 0-1 of the reshaped state are the city
        # coordinates -- confirm against tsp_env's alternative state layout.
        coords = s.reshape([4, n_cities])[:2, ].T
        inputs_list.append(coords)
        labels_list.append(env.optimal_solution()[0])
    return np.array(inputs_list), np.array(labels_list)
# Open an interactive session and run the initialiser for all graph variables
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
# Optimisation loop: one freshly generated batch per step
loss_vals = []
for step in range(10000):
    inputs_batch, labels_batch = generate_batch(max_time_steps, batch_size)
    feed = {enc_inputs: inputs_batch, tours_rewards_ph: labels_batch}
    # Evaluate the loss and apply one optimiser update in the same run call
    loss_val = sess.run([loss, train_op], feed_dict=feed)[0]
    loss_vals.append(loss_val)
    # Report the loss on every 50th step, starting with step 0
    if not step % 50:
        print(loss_val)

4.34178
0.154837
0.139899
0.0980297


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Plot the log of the losses collected in `loss_vals` by the training loop.
# `loss_vals_slow_lr` is not defined anywhere in the visible notebook, so
# referring to it fails with NameError on a fresh kernel.
plt.plot(np.log(loss_vals))
plt.xlabel('Number of iterations')
plt.ylabel('Log of mean squared error');

In [None]:
# Number of loss values recorded so far (one is appended per training iteration).
len(loss_vals)