In [27]:
from __future__ import print_function
from collections import OrderedDict

%load_ext autoreload
%autoreload 1

import sys
import os
import time

import numpy as np
import theano
import theano.tensor as T
import lasagne
%aimport qrnn

theano.config.exception_verbosity='high'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Hold options as static element in the opts class
class opts:
    """Namespace of experiment settings, read as class attributes (never instantiated)."""

    # Mini-batch size: number of sequences fed to the network per training step.
    batch_size = 64

    # Data sequence length (number of time steps per input sequence).
    seq_len = 100

    # Offset for every new input sequence: how far each batch window shifts
    # through the corpus relative to the previous batch.
    data_offset = 15

    # Number of hidden units used by each QRNN layer.
    hidden_size = 128

    # Filter size to be used by QRNN layer.
    filter_width = 3

In [3]:
dataset = 'shakespeare'

# Read the corpus inside a context manager so the file handle is closed as
# soon as the lines are in memory (the bare open() call leaked it).
with open('data/' + dataset + '-compiled.txt', 'r') as corpus_file:
    data = corpus_file.readlines()

if dataset != 'music':
    # Character-level corpus: one long string, each character is a symbol.
    data = ''.join(data)
else:
    # Token-level corpus: every line is split on single spaces and terminated
    # by an explicit '\n' token; all lines are flattened into one token list.
    data = [token for line in data for token in line.strip().split(' ') + ['\n']]

# Sort the vocabulary so that the symbol -> index mapping is the same on every
# run; plain set iteration order is not guaranteed to be stable across
# interpreter processes.
chars = sorted(set(data))
data_size, vocab_size = len(data), len(chars)
print ('Vocabulary size = ' + str(vocab_size) + '; total data size = ' + str(data_size))

# Lookup tables between symbols and their one-hot indices.
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

Vocabulary size = 85; total data size = 6347705


In [4]:
def get_batch(data, b, b_size, seq_len, offset):
    """Build the b-th one-hot encoded mini-batch from a symbol sequence.

    The corpus is cut into ``b_size`` equally spaced streams; row ``i`` of the
    batch starts at ``(len(data) // b_size) * i + offset * b`` and covers
    ``seq_len`` consecutive symbols. The target for every position is the
    symbol that follows it.

    Relies on the notebook globals ``vocab_size`` and ``char_to_ix``.

    Parameters
    ----------
    data : str or list
        Sequence of symbols; every symbol must be a key of ``char_to_ix``.
    b : int
        Index of the batch to build.
    b_size : int
        Number of sequences in the batch.
    seq_len : int
        Number of time steps per sequence.
    offset : int
        Shift (in symbols) between the windows of consecutive batches.

    Returns
    -------
    (X, y) : tuple
        ``X`` has shape ``(b_size, seq_len, vocab_size)`` and dtype
        ``theano.config.floatX``; ``y`` has shape
        ``(b_size * seq_len, vocab_size)`` and dtype ``int8``.
        ``(None, None)`` is returned when the last row of the batch (including
        its final target symbol) would run past the end of ``data``.
    """
    # Distance between the starting points of two neighbouring rows.
    stride = len(data) // b_size

    # The last row needs indices up to stride*(b_size-1) + offset*b + seq_len
    # (the target of its final time step), which must still be inside data.
    if stride * (b_size - 1) + offset * b + seq_len >= len(data):
        return None, None

    X = np.zeros((b_size, seq_len, vocab_size), dtype=theano.config.floatX)
    y = np.zeros((b_size, seq_len, vocab_size), dtype=np.int8)

    steps = np.arange(seq_len)
    for i in range(b_size):
        first = stride * i + offset * b
        # seq_len inputs plus one extra symbol: the target of the last step.
        window = data[first:first + seq_len + 1]
        ids = np.array([char_to_ix[symbol] for symbol in window], dtype=np.intp)
        X[i, steps, ids[:-1]] = 1.0
        y[i, steps, ids[1:]] = 1

    # Targets are flattened to one row per (sequence, time step) pair.
    return X, y.reshape((b_size*seq_len, vocab_size))

In [25]:
def build_rnn_1(input_var, seq_len, dim, filter_width):
    """Assemble the network: input -> 3 stacked QRNN layers -> 2 dense layers.

    Parameters
    ----------
    input_var : Theano variable
        Symbolic input attached to the input layer, declared with shape
        ``(None, seq_len, dim)``.
    seq_len : int
        Sequence length, forwarded to every QRNN layer and used for the
        reshape in front of the dense layers.
    dim : int
        Size of the last input axis; also the number of softmax output units.
    filter_width : int
        Forwarded unchanged to every ``qrnn.QRNNLayer``.

    Returns
    -------
    The final Lasagne ``DenseLayer`` (softmax over ``dim`` units). Its input is
    reshaped to ``(batch_size * seq_len, opts.hidden_size)``, so it yields one
    row per (sequence, time step) pair.
    """
    # ----- INPUT LAYER -----
    net = lasagne.layers.InputLayer(shape=(None, seq_len, dim), input_var=input_var)
    # Symbolic batch size, taken from the 3-d input variable's shape.
    n_sequences, _, _ = net.input_var.shape

    # ----- QRNN LAYERS -----
    # Three stacked layers with 'fo' pooling. The third positional argument is
    # `dim` for the first layer and opts.hidden_size for the following ones
    # (presumably the layer's input width -- confirm against qrnn.QRNNLayer).
    in_width = dim
    for _ in range(3):
        net = qrnn.QRNNLayer(net, seq_len, in_width, opts.hidden_size, filter_width, pooling='fo')
        in_width = opts.hidden_size

    # ----- FC LAYERS -----
    # Collapse the batch and time axes so the dense layers see one row per step.
    flat = lasagne.layers.ReshapeLayer(net, (n_sequences * seq_len, opts.hidden_size))
    hidden = lasagne.layers.DenseLayer(
        flat,
        num_units=256,
        nonlinearity=lasagne.nonlinearities.tanh,
    )
    output = lasagne.layers.DenseLayer(
        hidden,
        num_units=dim,
        nonlinearity=lasagne.nonlinearities.softmax,
    )

    return output

In [None]:
# Seed NumPy's global RNG and hand the numpy.random module to Lasagne as its
# random source. This happens before the network is built below.
np.random.seed(3333)
lasagne.random.set_rng(np.random)

# Create network and compiling functions
print('Network creation and function compiling...')
# Symbolic inputs: a 3-d tensor of network inputs, an int8 matrix of targets
# and an integer scalar used as the row stride in the perplexity function.
input_var = T.tensor3('inputs')
output_var = T.bmatrix('outputs')
s_var = T.iscalar('svar')

# network_output has one row per (sequence, time step) pair because
# build_rnn_1 reshapes to (batch_size * seq_len, ...) before its dense layers.
network = build_rnn_1(input_var, opts.seq_len, vocab_size, opts.filter_width)
network_output = lasagne.layers.get_output(network)
# Mean categorical cross-entropy over all rows, minimised with Adam.
loss = lasagne.objectives.categorical_crossentropy(network_output, output_var).mean()
all_params = lasagne.layers.get_all_params(network)
updates = lasagne.updates.adam(loss, all_params, learning_rate=0.001)

# Perplexity
# Keep every s_var-th output row starting at row s_var-1; when s_var equals the
# sequence length this is the prediction at the last time step of each sequence.
# The compiled function returns the summed negative log of the probability
# selected by output_var in each kept row, so the caller must pass targets
# that are already reduced to those same rows.
no = network_output[s_var-1::s_var, :]
theano_perplexity = theano.function([input_var, output_var, s_var], 
                                    T.sum(-T.log(T.sum(no * output_var, axis=1))), on_unused_input='ignore')

# One optimisation step: returns the loss and applies the Adam updates.
train = theano.function(
    [input_var, output_var],
    loss, updates=updates, on_unused_input='ignore')

# Returns only the last output row of the network for the given input.
sample = theano.function(
    [input_var], network_output[-1, :], on_unused_input='ignore')

In [None]:
pre_params = None
c = 0

# Hold out the tail of the corpus for evaluation; the rest is training data.
test_samples = 11200
train_data = data[0:-test_samples]
test_data = data[-test_samples:]

# Train procedure
print("Start training RNN...")
counter = 0  # number of training batches processed so far, across all passes
# Batch counts at which the held-out perplexity is evaluated and printed.
# Training stops once the largest of them has been reached.
printouts = [1, 3, 6, 13, 26, 51, 101, 201, 401, 801, 1601, 3201, 6401, 12801]
max_counter = max(printouts)

while True:
    # Cyclic permutation of train set
    splitpoint = np.random.randint(0, len(train_data))
    train_data = train_data[splitpoint:] + train_data[:splitpoint]

    cost = 0.0
    b = 0  # index of the next batch within the current pass
    while True:
        X, y = get_batch(train_data, b, opts.batch_size, opts.seq_len, opts.data_offset)
        if X is None or y is None:
            # get_batch signals the end of the data with (None, None).
            break
        counter += 1
        cost += train(X, y)

        b += 1

        if counter in printouts:
            # THEANO PERPLEXITY
            # Slide over the test set one symbol at a time (offset 1). The
            # sequence length must be the one the network was compiled with,
            # so it is read from opts instead of being hard-coded.
            num, den = 0.0, 0.0
            tb = 0
            while True:
                Xt, yt = get_batch(test_data, tb, opts.batch_size, opts.seq_len, 1)
                if Xt is None or yt is None:
                    break
                # Score only the last time step of every sequence: keep the
                # matching target rows and pass the same stride to the function.
                last_step_targets = yt[opts.seq_len-1::opts.seq_len]
                num += theano_perplexity(Xt, last_step_targets, opts.seq_len)
                den += opts.batch_size
                tb += 1
            # exp(mean negative log-likelihood) over all scored predictions.
            print(str(counter) + ':' + str(np.exp(num / den)))

            if counter >= max_counter:
                break

    if counter >= max_counter:
        break

Results QRNN model
---
3 stacked QRNN layers with filter width 3, dim 128 + dense 256 (sigmoid)

<b>$f$-pooling</b><br />
3201:7.02321607773<br />
6401:5.58554180259<br />
12801:4.91633971638<br />

<b>$fo$-pooling</b><br />
3201:6.48371135081<br />
6401:5.2369685117<br />
12801:4.42960824422<br />

<b>$ifo$-pooling</b><br />
3201:6.68349345005<br />
6401:5.21300633551<br />
12801:4.46497484423<br />

With a sigmoid nonlinearity at the Z gate, we cannot get any lower than 5.95 (without regularization).<br />
A linear nonlinearity at the Z gate results in 5.22.<br />
Using tanh for all nonlinearities doesn't go lower than 32.0!!

In [12]:
def sample_text(length=200):
    """Generate ``length`` symbols by repeatedly sampling from the network.

    A random window of ``opts.seq_len`` symbols from ``test_data`` is used as
    bootstrap input. After each sampled symbol the window is shifted one step
    to the left and the new symbol is appended, so the network always sees the
    most recent ``opts.seq_len`` symbols.

    Relies on the notebook globals ``test_data``, ``opts``, ``vocab_size``,
    ``char_to_ix``, ``ix_to_char`` and the compiled ``sample`` function.

    Parameters
    ----------
    length : int
        Number of symbols to generate.

    Returns
    -------
    str
        The generated symbols joined together (the bootstrap text itself is
        not included).
    """
    # First take a random piece of bootstrap text
    start = np.random.randint(0, len(test_data)-opts.seq_len)
    seed = test_data[start:start+opts.seq_len]

    # Convert to proper input data shape (here, batch size = 1)
    window = np.zeros((1, opts.seq_len, vocab_size), dtype=theano.config.floatX)
    for t, symbol in enumerate(seed):
        window[0, t, char_to_ix[symbol]] = 1.0

    # Start sampling loop; pieces are collected in a list and joined once.
    generated = []
    for _ in range(length):
        # Predict next character: draw an index from the predicted distribution.
        probs = sample(window)
        next_ix = np.random.choice(vocab_size, p=probs.ravel())
        generated.append(ix_to_char[next_ix])

        # Slide the window: drop the oldest step, append the sampled symbol.
        window[0, 0:-1, :] = window[0, 1:, :]
        window[0, -1, :] = 0.0
        window[0, -1, next_ix] = 1.0

    return ''.join(generated)

In [17]:
print(sample_text(length=200))

wear Arients!

GLOUCESTER:
But may for you have my plant.

KING HENRY BOYDA:
Nenton in thee, for I am is it,
And save, I male shilt you with have charces, ruit
To hear bets bring being againse?

FROND
