In [1]:
import string

import numpy as np
import tensorflow as tf

from helpers.dataset import TensorFlowDataset, import_zip_file
from helpers.evaluate import TensorFlowModelEvaluator
from models.seq_to_seq import SequenceToSequenceLSTM
from vanilla_neural_nets.recurrent_neural_network.training_data import WordLevelRNNTrainingDataBuilder

The following network compiles, runs, and produces predictions. However, those predictions are garbage. This is likely because:

- I'm embedding a 30-dimensional token space into a higher dimensional space at the head of the network - simply so as to avoid modifying the other RNN/LSTM objects included in this repository.
- I likely need to train on a lot more data for many more epochs.

For a real application, I'll use the RNN/LSTM objects that ship with TensorFlow (TF) rather than the ones included in this repository.

# Create some data

In [2]:
# Zipped corpus file, relative to this notebook.
PATH = '../data/text8.zip'

# Sequence lengths. The decoder sees two extra steps because every label row is
# wrapped in a START_OUTPUT / END_OUTPUT pair.
ENCODER_TIME_STEPS = 10
DECODER_TIME_STEPS = 2 + ENCODER_TIME_STEPS

# Mini-batch size, and the total number of examples (50 batches' worth).
BATCH_SIZE = 128
TRAINING_EXAMPLES = 50 * BATCH_SIZE

### Create token lookup

In [3]:
# Vocabulary: the 26 lowercase letters and the space character, followed by the
# three control tokens used by the decoder.
tokens = list(string.ascii_lowercase + ' ') + ['START_OUTPUT', 'END_OUTPUT', 'PAD']

# Forward (token -> index) and reverse (index -> token) lookups; a token's index is
# its position in `tokens`.
token_to_index_lookup = dict(zip(tokens, range(len(tokens))))
index_to_token_lookup = dict(enumerate(tokens))

### Tokenize corpus

In [4]:
# Load the corpus (capped via `n_characters`) and translate each character into its
# vocabulary index. A character missing from `token_to_index_lookup` raises KeyError.
corpus = import_zip_file(path=PATH, n_characters=10 ** 6)
corpus_as_indices = [token_to_index_lookup[character] for character in corpus]

### Build training data and labels, where:
    - Training labels are the "flip" (i.e. the left-to-right reversal) of the training data
    - Training labels are prefixed with `START_OUTPUT`, and suffixed with `END_OUTPUT`, for use by our decoder network

In [5]:
# Slice off exactly enough token indices to fill TRAINING_EXAMPLES rows of
# ENCODER_TIME_STEPS tokens each, then lay them out one encoder sequence per row.
training_data = np.array(
    corpus_as_indices[:TRAINING_EXAMPLES * ENCODER_TIME_STEPS]
).reshape(TRAINING_EXAMPLES, ENCODER_TIME_STEPS)

# One START_OUTPUT / END_OUTPUT index per example, used as the first and last
# label columns.
start_output_tokens = len(training_data) * [token_to_index_lookup['START_OUTPUT']]
end_output_tokens = len(training_data) * [token_to_index_lookup['END_OUTPUT']]

# Each label row is its data row reversed left-to-right, framed by the two
# control-token columns.
training_labels = np.column_stack(
    (start_output_tokens, np.fliplr(training_data), end_output_tokens)
)

### Construct training, validation and test sets

In [6]:
# Row boundaries of the split: the first 48 batches are used for training, the
# 49th for validation, and everything after that for testing.
train_end = BATCH_SIZE * 48
validation_end = BATCH_SIZE * 49

training_dataset = TensorFlowDataset(
    data=training_data[:train_end],
    labels=training_labels[:train_end],
)

validation_dataset = TensorFlowDataset(
    data=training_data[train_end:validation_end],
    labels=training_labels[train_end:validation_end],
)

test_dataset = TensorFlowDataset(
    data=training_data[validation_end:],
    labels=training_labels[validation_end:],
)

In [7]:
# Network sizes and optimiser settings.
EMBEDDING_LAYER_SIZE = 128
HIDDEN_STATE_SIZE = 100
LEARNING_RATE = 0.2

# Upper bound of the training-loop counter, not the number of optimisation steps:
# the training cell only acts on counter values that are multiples of int(1e7).
N_EPOCHS = 10 ** 9

# One output class per vocabulary token.
VOCABULARY_SIZE = len(tokens)
N_CLASSES = VOCABULARY_SIZE

# Indices of the three control tokens.
START_OUTPUT_INDEX, END_OUTPUT_INDEX, PAD_INDEX = (
    token_to_index_lookup[control_token]
    for control_token in ('START_OUTPUT', 'END_OUTPUT', 'PAD')
)

# Define graph

In [8]:
graph = tf.Graph()

with graph.as_default():

    # Integer placeholders with a free batch dimension: one for the encoder
    # sequences, one for the (START/END-wrapped) decoder sequences.
    encoder_sequences = tf.placeholder(shape=[None, ENCODER_TIME_STEPS], dtype=tf.int32)
    decoder_sequences = tf.placeholder(shape=[None, DECODER_TIME_STEPS], dtype=tf.int32)

    # The model reads its inputs through this dataset wrapper; the prediction cell
    # later feeds the placeholders via `model.dataset.data` / `model.dataset.labels`.
    dataset = TensorFlowDataset(data=encoder_sequences, labels=decoder_sequences)

    model = SequenceToSequenceLSTM(
        dataset=dataset,
        n_classes=N_CLASSES,
        embedding_layer_size=EMBEDDING_LAYER_SIZE,
        hidden_state_size=HIDDEN_STATE_SIZE,
        learning_rate=LEARNING_RATE,
    )

# Define a padding function in order to generate sequences
- When generating sequences, we decode one test sentence at a time, so the array fed to the decoder placeholder will always have shape `(1, DECODER_TIME_STEPS)`.
- To generate new data, we first feed our decoder network the `START_OUTPUT` token to predict the second token, then feed the first and second tokens to predict the third token, then feed the first, second and third tokens to predict the fourth token, etc. As such, our unpadded decoder inputs will be of size `(1, 1)`, `(1, 2)`, `(1, 3)`, etc., respectively.
- Because our decoder data placeholder requires input with shape `(None, DECODER_TIME_STEPS)`, we must right-*pad* our decoder input with the `PAD_INDEX` so as to ensure that our input always has shape `(1, DECODER_TIME_STEPS)`.
- In each step, we will feed the full, padded, sequence of shape `(1, DECODER_TIME_STEPS)` through the decoder, then pluck the predictions corresponding to the right-most non-`PAD_INDEX` token. We then feed this into a multinomial generator to create a hard prediction. 

In [9]:
def pad_predicted_sentence(predicted_sentence, pad_token=PAD_INDEX, decoder_time_steps=DECODER_TIME_STEPS):
    """Right-pad a partially generated sentence to the decoder's fixed length.

    Args:
        predicted_sentence: list of token indices generated so far.
        pad_token: index appended as filler. Defaults to `PAD_INDEX`.
        decoder_time_steps: length to pad to. Defaults to `DECODER_TIME_STEPS`.

    Returns:
        A new list made of `predicted_sentence` followed by as many `pad_token`
        entries as are needed to reach `decoder_time_steps`. A sentence that is
        already that long (or longer) comes back as an unpadded copy.
    """
    # Use the arguments rather than the module-level constants so that callers
    # overriding `pad_token` / `decoder_time_steps` actually get what they asked for.
    n_missing = decoder_time_steps - len(predicted_sentence)
    # A non-positive multiplier yields an empty list, so over-long input is never truncated.
    return predicted_sentence + n_missing * [pad_token]

In [10]:
with tf.Session(graph=graph) as session:

    session.run(tf.initialize_all_variables())

    # Evaluate model
    evaluator = TensorFlowModelEvaluator(
        model=model,
        session=session,
        validation_dataset=validation_dataset,
        test_dataset=test_dataset
    )

    # Only counter values that are multiples of `evaluation_interval` ever did any
    # work, so step straight from one to the next instead of counting through all
    # N_EPOCHS values. The optimisation steps and the printed report are unchanged.
    evaluation_interval = int(1e7)

    for epoch in range(0, N_EPOCHS, evaluation_interval):

        # One optimisation step on a freshly sampled mini-batch.
        mini_batch_data, mini_batch_labels = training_dataset.sample(BATCH_SIZE)
        mini_batch_dataset = TensorFlowDataset(data=mini_batch_data, labels=mini_batch_labels)
        evaluator.optimize(mini_batch_dataset)

        print('Epoch: {}'.format(epoch // evaluation_interval))
        print('Train Loss: {:.3f}'.format(evaluator.training_loss))
        print('Validation Loss: {:.3f}\n'.format(evaluator.validation_loss))

    print('Test Loss: {0:.3f}\n'.format(evaluator.test_loss))

    # Fetch target for generation, obtained once and reused for every decoding step.
    # NOTE(review): assumes `predict_next_token()` only builds/returns graph ops and
    # has no per-call side effects -- confirm against SequenceToSequenceLSTM.
    next_token_op = model.predict_next_token()

    # Predict on test data
    for test_sentence in test_dataset.data:
        predicted_sentence = [START_OUTPUT_INDEX]

        # Generate one token at a time until END_OUTPUT is produced or the decoder
        # length is exhausted.
        while predicted_sentence[-1] != END_OUTPUT_INDEX and len(predicted_sentence) < DECODER_TIME_STEPS:

            # Position of the right-most non-PAD token; its prediction is the
            # distribution over the token that comes next.
            next_token_index = len(predicted_sentence) - 1

            padded_predicted_sentence = pad_predicted_sentence(predicted_sentence)
            feed_dict = {
                model.dataset.data: test_sentence.reshape(1, -1),
                model.dataset.labels: np.array(padded_predicted_sentence).reshape(1, -1)
            }
            next_token_index_predictions_for_all_timesteps = session.run(next_token_op, feed_dict=feed_dict)
            next_token_predictions = next_token_index_predictions_for_all_timesteps[next_token_index].ravel()

            # Sample a hard prediction. START_OUTPUT and PAD are never valid
            # continuations, so such draws are discarded and the step is retried.
            next_token_prediction = np.argmax(np.random.multinomial(1, next_token_predictions))
            if next_token_prediction not in [START_OUTPUT_INDEX, PAD_INDEX]:
                predicted_sentence.append(next_token_prediction)

        # Drop the leading START_OUTPUT. Drop the trailing token only when it really
        # is END_OUTPUT: a sentence cut off by the length limit ends in a genuine
        # character that must still be shown.
        predicted_tokens = predicted_sentence[1:]
        if predicted_tokens and predicted_tokens[-1] == END_OUTPUT_INDEX:
            predicted_tokens = predicted_tokens[:-1]

        print('Test Sentence: {}\nPredicted Reversal: {}\n'.format(
                ''.join([index_to_token_lookup[index] for index in test_sentence]),
                ''.join(index_to_token_lookup[index] for index in predicted_tokens),
        ))

Epoch: 0
Train Loss: 3.891
Validation Loss: 3.929

Epoch: 1
Train Loss: 3.810
Validation Loss: 3.836

Epoch: 2
Train Loss: 3.704
Validation Loss: 3.757

Epoch: 3
Train Loss: 3.685
Validation Loss: 3.688

Epoch: 4
Train Loss: 3.601
Validation Loss: 3.636

Epoch: 5
Train Loss: 3.582
Validation Loss: 3.588

Epoch: 6
Train Loss: 3.527
Validation Loss: 3.557

Epoch: 7
Train Loss: 3.464
Validation Loss: 3.530

Epoch: 8
Train Loss: 3.439
Validation Loss: 3.497

Epoch: 9
Train Loss: 3.450
Validation Loss: 3.486

Epoch: 10
Train Loss: 3.412
Validation Loss: 3.468

Epoch: 11
Train Loss: 3.399
Validation Loss: 3.425

Epoch: 12
Train Loss: 3.379
Validation Loss: 3.406

Epoch: 13
Train Loss: 3.424
Validation Loss: 3.411

Epoch: 14
Train Loss: 3.343
Validation Loss: 3.390

Epoch: 15
Train Loss: 3.329
Validation Loss: 3.375

Epoch: 16
Train Loss: 3.301
Validation Loss: 3.340

Epoch: 17
Train Loss: 3.249
Validation Loss: 3.310

Epoch: 18
Train Loss: 3.251
Validation Loss: 3.290

Epoch: 19
Train Loss: 