# Understanding LSTM Networks

## 1. Datenaufbereitung

In [1]:
import numpy as np
from pprint import pprint
import datetime

import trainer
reload(trainer)

sequence_length = 6

reference_input_data, reference_output_data = trainer.getSequences(sequence_length)

# trainer.getSequences(sequence_length) generates all possible combinations of
# the characters '+-0I', so for a sequence length of 6 characters there are a
# a total of 4^6 = 4096 possible combinations. Some Examples:
# '+-+-+-' = 0
# '------' = -6
# '0++000' = 2
# 'I++000' = -2
#
# Those sequences are encoded: Every character is representated by a vector, so the actual
# return value from trainer.getSequences looks like this:
pprint(reference_input_data[0])

# There is a helper to decode that again:
pprint(trainer.decodeSequence(reference_input_data[0]))

# The solution for that sequence is:
pprint(reference_output_data[0])

instruction_count = np.array(reference_input_data).shape[2]

array([[0, 0, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 1, 0],
       [0, 0, 1]])
'0I+--I'
1


In [2]:
NUM_EXAMPLES = len(reference_input_data) / 4 # we use 1/4 of the data for the training

test_input = reference_input_data[NUM_EXAMPLES:]
test_output = reference_output_data[NUM_EXAMPLES:] # everything beyond NUM_EXAMPLES

train_input = reference_input_data[:NUM_EXAMPLES]
train_output = reference_output_data[:NUM_EXAMPLES]

print("We'll train using " + str(NUM_EXAMPLES) + "/" + str(len(reference_input_data)) + " Examples")

We'll train using 1024/4096 Examples


In [3]:
import tensorflow as tf

data = tf.placeholder(tf.float32, [None, sequence_length, instruction_count], name='data')
target = tf.transpose(tf.placeholder(tf.float32, [None], name='target'))

## 2. Die LSTM Schicht

In [4]:
LSTM_SETTINGS = {
    'num_cells': 24,
    'feature_size': 3
}

# All of the LSTM's weights and biases (should?!) have the same dimensions, so we'd rather don't repeat ourselves
def default_weights_and_bias():
    weights = tf.Variable(tf.truncated_normal([LSTM_SETTINGS['num_cells'], LSTM_SETTINGS['num_cells'] + LSTM_SETTINGS['feature_size']]))
#   Alternative?!:    weights = tf.transpose(tf.Variable(tf.truncated_normal([LSTM_SETTINGS['num_cells'], LSTM_SETTINGS['num_cells'] + LSTM_SETTINGS['feature_size']])))
    bias = tf.transpose(tf.Variable(tf.constant(0.1, shape = [LSTM_SETTINGS['num_cells']])))
#   Alternative?!:    bias = tf.Variable(tf.constant(0.1, shape = [LSTM_SETTINGS['num_cells']]))
    return weights, bias

### 2.1 Forget Layer

![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-f.png)

In [5]:
W_forget_layer, b_forget_layer = default_weights_and_bias()

def forget_layer(ht_minus_1_and_xt):
    print("ft: W", str(W_forget_layer.get_shape()))
    print("ft: ht_minus_1_and_xt", str(ht_minus_1_and_xt.get_shape()))
    print("ft: b_forget_layer", str(b_forget_layer.get_shape()))
    ft = tf.sigmoid(tf.transpose(tf.matmul(W_forget_layer, tf.transpose(ht_minus_1_and_xt))) + b_forget_layer)
#   Alternative?!:   ft = tf.sigmoid(tf.matmul(ht_minus_1_and_xt, W_forget_layer) + b_forget_layer)
    print("ft: ft", str(ft.get_shape()))

    return(ft)

### 2.2 Input Layer
![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-i.png)

In [6]:
W_input_layer, b_input_layer = default_weights_and_bias()

def input_gate_layer(ht_minus_1_and_xt):
    it = tf.sigmoid(tf.transpose(tf.matmul(W_input_layer, tf.transpose(ht_minus_1_and_xt))) + b_input_layer)
#   Alternative?!:    it = tf.sigmoid(tf.matmul(ht_minus_1_and_xt, W_input_layer) + b_input_layer)
    return it

W_candiate_layer, b_candiate_layer = default_weights_and_bias()

def new_candidate_values_layer(ht_minus_1_and_xt):
    C_candidate = tf.tanh(tf.transpose(tf.matmul(W_candiate_layer, tf.transpose(ht_minus_1_and_xt))) + b_candiate_layer)
#   Alternative?!:    C_candidate = tf.tanh(tf.matmul(ht_minus_1_and_xt, W_candiate_layer) + b_candiate_layer)
    
    return C_candidate

### 2.3 Update Layer

![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-C.png)

In [7]:
def update_conveyor(ft, it, Conveyor, CandidateConveyor):
    print("############")
    print("update_conveyor: ft", str(ft.get_shape()))
    print("update_conveyor: Conveyor", str(Conveyor.get_shape()))
    print("update_conveyor: it", str(it.get_shape()))
    print("update_conveyor: CandidateConveyor", str(CandidateConveyor.get_shape()))
    new_Conveyor = ft * Conveyor + it * CandidateConveyor
    return(new_Conveyor)

### 2.4 Output Layer

![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-focus-o.png)

In [8]:
W_prediction_layer, b_prediction_layer = default_weights_and_bias()
    
def output_layer(ht_minus_1_and_xt, new_Conveyor):
    ot = tf.sigmoid(tf.transpose(tf.matmul(W_prediction_layer, tf.transpose(ht_minus_1_and_xt))) + b_prediction_layer)
#   Alternative?!: ot = tf.sigmoid(tf.matmul(ht_minus_1_and_xt, W_prediction_layer) + b_prediction_layer)
    prediction = ot * tf.tanh(new_Conveyor)
    return(prediction)

### 2.5 The LSTM Cell – Putting it all together
![](https://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

In [9]:
def lstm_cell(ht_minus_1_and_Conveyor, xt):
    ht_minus_1, Conveyor = ht_minus_1_and_Conveyor
    
    ht_minus_1_and_xt = tf.concat([ht_minus_1, xt], 1)
    
    ft                = forget_layer(ht_minus_1_and_xt)
    it                = input_gate_layer(ht_minus_1_and_xt)
    CandidateConveyor = new_candidate_values_layer(ht_minus_1_and_xt) # CandidateConveyor entspricht ~C_t
    
    new_Conveyor = update_conveyor(ft, it, Conveyor, CandidateConveyor)
    
    lstm_prediction = output_layer(ht_minus_1_and_xt, new_Conveyor)
    
    return(lstm_prediction, new_Conveyor)

In [10]:
data_length = tf.shape(data)[0]

initial_Conveyor = tf.zeros([data_length, LSTM_SETTINGS['num_cells']])
initial_prediction = tf.zeros([data_length, LSTM_SETTINGS['num_cells']])

# This loop gets called once for every "timestep" and gets one column of the input data
def lstm_loop(last_lstm_prediction, last_state, step):
    lstm_prediction, state = lstm_cell([last_lstm_prediction, last_state], data[:, step, :])
    return lstm_prediction, state, tf.add(step, 1)


timesteps = sequence_length

for_each_time_step = lambda a, b, step: tf.less(step, timesteps)

lstm_prediction, lstm_state, _ = tf.while_loop(for_each_time_step, lstm_loop, (initial_prediction, initial_Conveyor, 0), back_prop = True, parallel_iterations=6)

('ft: W', '(24, 27)')
('ft: ht_minus_1_and_xt', '(?, 27)')
('ft: b_forget_layer', '(24,)')
('ft: ft', '(?, 24)')
############
('update_conveyor: ft', '(?, 24)')
('update_conveyor: Conveyor', '(?, 24)')
('update_conveyor: it', '(?, 24)')
('update_conveyor: CandidateConveyor', '(?, 24)')


In [11]:
weight = tf.Variable(tf.truncated_normal([LSTM_SETTINGS['num_cells'], 1]))
bias = tf.Variable(tf.constant(0.1, shape=[1]))

prediction = tf.matmul(lstm_prediction, weight) + bias

## 3. Cost & Optimizing

In [12]:
with tf.name_scope('mean_square_error'):
    mean_square_error = tf.reduce_sum(tf.square(tf.subtract(target, tf.unstack(prediction, axis = 1))))
tf.summary.scalar('mean_square_error', mean_square_error)

<tf.Tensor 'mean_square_error_1:0' shape=() dtype=string>

In [13]:
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(mean_square_error)

In [14]:
with tf.name_scope('error'):
    with tf.name_scope('mistakes'):
        mistakes = tf.not_equal(target, tf.round(tf.unstack(prediction, axis = 1)))
    with tf.name_scope('error'):
        error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
tf.summary.scalar('error', error)

<tf.Tensor 'error_1:0' shape=() dtype=string>

## 4. Training

In [None]:
sess = tf.InteractiveSession()
merged = tf.summary.merge_all()

date = str(datetime.datetime.now())
train_writer = tf.summary.FileWriter('logs/selfmade_lstm/' + date + '/train', sess.graph)
test_writer = tf.summary.FileWriter('logs/selfmade_lstm/' + date + 'test')

model_checkpoint = 'lstm_self_built.chkpt'

tf_saver = tf.train.Saver(tf.global_variables())

init_op = tf.global_variables_initializer()
sess.run(init_op)

In [None]:
epoch = 4000

for i in range(epoch):
    if (i + 1) % 20 == 0:
        summary, incorrect, mean_squ_err = sess.run([merged, error, mean_square_error], {data: test_input, target: test_output})
        test_writer.add_summary(summary, i)
        
        print('Epoch {:4d} | incorrect {: 3.1f}% | mean squ error {: 3.1f}'.format(i + 1, incorrect * 100, mean_squ_err))
    else:
        summary, acc = sess.run([merged, error], {data: train_input, target: train_output})
        train_writer.add_summary(summary, i)
    
    sess.run(minimize,{data: train_input, target: train_output})
    
    if i % 100:
        tf_saver.save(sess, model_checkpoint)

Epoch   20 | incorrect  77.3% | mean squ error  9712.2
Epoch   40 | incorrect  73.8% | mean squ error  7820.1
Epoch   60 | incorrect  72.4% | mean squ error  6820.1
Epoch   80 | incorrect  70.9% | mean squ error  6157.0
Epoch  100 | incorrect  70.0% | mean squ error  5650.4
Epoch  120 | incorrect  69.0% | mean squ error  5258.0
Epoch  140 | incorrect  68.2% | mean squ error  4933.2
Epoch  160 | incorrect  67.0% | mean squ error  4647.6
Epoch  180 | incorrect  66.2% | mean squ error  4392.8
Epoch  200 | incorrect  65.6% | mean squ error  4163.2
Epoch  220 | incorrect  64.6% | mean squ error  3954.8
Epoch  240 | incorrect  64.0% | mean squ error  3762.8
Epoch  260 | incorrect  63.1% | mean squ error  3582.6
Epoch  280 | incorrect  62.0% | mean squ error  3410.8
Epoch  300 | incorrect  60.6% | mean squ error  3245.3
Epoch  320 | incorrect  59.7% | mean squ error  3086.0
Epoch  340 | incorrect  58.7% | mean squ error  2933.5
Epoch  360 | incorrect  57.1% | mean squ error  2788.9
Epoch  380

In [None]:
# reload(trainer)
sess.run(prediction, {data: [trainer.encodeSequence("II++++")]})

In [None]:
# sess.close()
# train_writer.close()
# test_writer.close()
