## Setup

In [1]:
import tensorflow as tf
import numpy as np
from keras.utils.data_utils import get_file
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
# Fetch the Nietzsche corpus through Keras' get_file helper; the value it
# returns is used below as a local file path.
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Read inside a context manager so the file handle is closed as soon as the
# text is in memory instead of lingering until garbage collection.
with open(path) as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [3]:
# Vocabulary: every distinct character of the corpus in sorted order, with a
# NUL character placed in front so that it occupies index 0.
chars = ["\0"] + sorted(set(text))
vocab_size = len(chars)

In [4]:
# Lookup tables in both directions: character -> index and index -> character.
char_indices = dict(zip(chars, range(len(chars))))
indices_char = dict(enumerate(chars))

In [5]:
# Encode the whole corpus as a list of vocabulary indices, one per character.
idx = list(map(char_indices.__getitem__, text))

In [6]:
# Number of characters per training sequence; it is also the stride between
# consecutive windows and the leading dimension of the input/target placeholders.
sequence_len = 8

In [7]:
# c_in_dat[offset] collects the character index found `offset` positions into
# each non-overlapping window of `sequence_len` characters (windows start at 0).
c_in_dat = [
    [idx[start + offset]
     for start in range(0, len(idx) - sequence_len - 1, sequence_len)]
    for offset in range(sequence_len)
]

In [8]:
# Targets: the windows start at position 1 instead of 0, so every target is
# the character that immediately follows the input character at the same slot.
c_out_dat = [
    [idx[start + offset]
     for start in range(1, len(idx) - sequence_len, sequence_len)]
    for offset in range(sequence_len)
]

In [9]:
# Turn each per-position input list into a NumPy array, dropping its last two entries.
xs = [np.stack(column[:-2]) for column in c_in_dat]

In [10]:
# Turn each per-position target list into a NumPy array, dropping its last two entries.
ys = [np.stack(column[:-2]) for column in c_out_dat]

In [11]:
# One-hot encode every per-position index array, then stack along axis 1 so
# the result is laid out as (num_sequences, sequence_len, vocab_size).
oh_xs = [to_categorical(column, vocab_size) for column in xs]
oh_x_rnn = np.stack(oh_xs, axis=1)

oh_ys = [to_categorical(column, vocab_size) for column in ys]
oh_y_rnn = np.stack(oh_ys, axis=1)

# Show both shapes as the cell output; inputs and targets must match.
oh_x_rnn.shape, oh_y_rnn.shape

((75109, 8, 85), (75109, 8, 85))

## Model parameters

We define the weight matrices to act on a single character at a time, and then use TensorFlow's `tf.scan` (which is kind of like a for loop, but more easily parallelizable) to apply them across a whole sequence.

In [12]:
# The network reads and predicts one-hot characters, so the input and output
# widths both equal the vocabulary size (85 for this corpus).
n_in = n_out = vocab_size
# Width of the recurrent hidden state.
n_hidden = 256

In [13]:
# Hidden-to-hidden weights start as the identity matrix, with a zero bias.
W_h = tf.Variable(tf.eye(n_hidden))
B_h = tf.Variable(tf.zeros(shape=[1, n_hidden]))

# Input-to-hidden weights start as Gaussian noise, with a zero bias.
W_x = tf.Variable(tf.random_normal(shape=[n_in, n_hidden], stddev=0.35))
B_x = tf.Variable(tf.zeros(shape=[1, n_hidden]))

# Stack the two weight matrices row-wise so a single matmul against the
# concatenated [h, x] row vector applies both; the two biases simply add.
W = tf.concat(values=[W_h, W_x], axis=0)
B = tf.add(B_h, B_x)

# Hidden-to-output projection.
W_y = tf.Variable(tf.random_normal(shape=[n_hidden, n_out], stddev=0.35))
# NOTE(review): this bias has one row per time step (leading dim is
# sequence_len) rather than a single shared row - confirm that is intended.
B_y = tf.Variable(tf.zeros(shape=[sequence_len, 1, n_out]))

## Model placeholders

In [14]:
# Initial hidden state fed to the scan (a single row vector).
h = tf.placeholder(dtype=tf.float32, shape=(1, n_hidden))
# One training sequence: a one-hot row per character for inputs and targets.
t_inp = tf.placeholder(dtype=tf.float32, shape=(sequence_len, n_in))
t_out = tf.placeholder(dtype=tf.float32, shape=(sequence_len, n_out))
# Learning rate, fed at run time so it can change between steps.
lr = tf.placeholder(dtype=tf.float32)

In [15]:
def step(h, x):
    """Advance the recurrent hidden state by one input character.

    Intended as the ``fn`` argument of ``tf.scan``: ``h`` is the accumulated
    state and ``x`` is the current element of the scanned input sequence.
    Only the hidden state is produced here; the output projection is applied
    to the stacked hidden states after the scan.

    Args:
        h: previous hidden state, a (1, n_hidden) row vector.
        x: current input element, a rank-1 vector of length n_in.

    Returns:
        The new hidden state, with the same shape as ``h``.
    """
    # Lift x to a (1, n_in) row and append it to h so that one matmul against
    # the stacked weight matrix W covers both the recurrent and input terms.
    a = tf.concat([h, tf.expand_dims(x, 0)], 1)
    return tf.nn.relu(tf.matmul(a, W) + B)

In [16]:
# Thread the hidden state through `step` for each row of t_inp, starting
# from h; the result stacks the hidden state produced at every time step.
rnn_out = tf.scan(fn=step, elems=t_inp, initializer=h)

In [17]:
# Display the symbolic tensor to check its static shape.
rnn_out

<tf.Tensor 'scan/TensorArrayStack/TensorArrayGatherV3:0' shape=(8, 1, 256) dtype=float32>

In [18]:
# Project every hidden state to output logits by contracting the hidden axis
# of rnn_out (axis 2) with the rows of W_y, then add the output bias.
y = tf.tensordot(rnn_out, W_y, [[2],[0]]) + B_y
# Remove only the singleton middle axis: (sequence_len, 1, n_out) ->
# (sequence_len, n_out). Naming the axis keeps the time dimension intact even
# when it has length 1, which a bare squeeze would also strip.
y = tf.squeeze(y, axis=[1])
y

<tf.Tensor 'Squeeze:0' shape=(8, 85) dtype=float32>

In [19]:
# Mean over the sequence of the per-character softmax cross-entropy between
# the logits y and the one-hot targets t_out.
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(
        logits=y,
        labels=t_out,
    )
)

In [20]:
# Plain gradient descent on the cross-entropy; the step size comes from the
# lr placeholder so it can be changed from the training loop.
train_step = tf.train.GradientDescentOptimizer(
    learning_rate=lr
).minimize(loss=cross_entropy)

In [36]:
# Train for one pass over the data, one sequence per gradient step.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    learning = 0.1
    err = 0.0
    # Every sequence starts from the same all-zero hidden state, so build it
    # once instead of allocating a fresh array on each iteration.
    initial_state = np.zeros([1, n_hidden])
    for i in range(len(oh_x_rnn)):
        feed_dict = {
            h:initial_state,
            t_inp:oh_x_rnn[i],
            t_out:oh_y_rnn[i],
            lr:learning
        }
        # Fetch the update op and the loss in a single run so the forward
        # pass is executed once per sample; the loss reported is the one the
        # gradient step was computed from.
        step_loss = sess.run([train_step, cross_entropy], feed_dict=feed_dict)[1]
        err += step_loss
        # Report the mean loss over the last 100 sequences.
        if i % 100 == 99:
            print('train loss %g' % (err/100))
            err = 0.0
        # NOTE(review): dividing by 10 every 1000 sequences shrinks the rate
        # very quickly (1e-05 after 5000 sequences) - confirm this schedule
        # is intended for a full pass over the data.
        if i % 1000 == 999:
            print('lowering learning rate to %g' % (learning/10))
            learning = learning/10

train loss 9.16008
train loss 2.9369
train loss 2.77286
train loss 3.02166
train loss 2.87255
train loss 2.96434
train loss 2.66227
train loss 2.60069
train loss 2.58672
train loss 2.37206
lowering learning rate to 0.01
train loss 2.59689
train loss 2.59232
train loss 2.77964
train loss 2.62102
train loss 2.63537
train loss 2.63374
train loss 2.66401
train loss 2.6121
train loss 2.71464
train loss 2.69566
lowering learning rate to 0.001
train loss 2.88342
train loss 2.71474
train loss 2.69608
train loss 2.65896
train loss 2.57386
train loss 2.72585
train loss 2.68262
train loss 2.83088
train loss 2.61341
train loss 2.87793
lowering learning rate to 0.0001
train loss 2.73624
train loss 2.61864
train loss 2.66631
train loss 2.58107
train loss 2.84807
train loss 2.74861
train loss 2.67749
train loss 2.60221
train loss 2.76105
train loss 2.66755
lowering learning rate to 1e-05


KeyboardInterrupt: 