# Pickle to TFRecord

This notebook converts the original pickled data into TensorFlow records.

In [None]:
import cPickle
import datetime
import os

This next step loads the entire data set into system memory.  This may fail on systems with limited memory; a 32GB workstation fails.

In [None]:
# Load data from the pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load a trusted `data.pkl`.
# The file is opened in binary mode because a pickle is a byte stream; text mode
# can corrupt it on platforms that translate line endings.  The `with` block
# guarantees the handle is closed even if one of the loads raises.
with open('data.pkl', 'rb') as f:
    # Each call reads the next pickled object from the stream, so the order of
    # these loads presumably mirrors the order in which the objects were dumped.
    classes = cPickle.load(f)
    chars = cPickle.load(f)
    char_indices = cPickle.load(f)
    indices_char = cPickle.load(f)

    maxlen = cPickle.load(f)
    step = cPickle.load(f)

    X = cPickle.load(f)
    y = cPickle.load(f)

## Data

All sequences in `X` appear to be `99 + 12 = 111` characters in length.  The ground truth `y` is length `99`.

In [None]:
# Display the shape of the inputs `X` loaded from the pickle.
X.shape

In [None]:
# Display the shape of the ground truth `y` loaded from the pickle.
y.shape

In [None]:
# Number of entries in `classes`.
len(classes)

In [None]:
# Number of entries in `chars`.
len(chars)

In [None]:
# Show the `maxlen` and `step` values loaded from the pickle side by side.
(maxlen, step)

In [None]:
# Display the full `classes` object loaded from the pickle.
classes

## Conversion to Serialized TFRecord

This section generates `data_v1/records.data` from the original `data.pkl` that was loaded into memory.  The record-based approach is useful for reducing the size of the stored dataset and for running on systems where the data cannot be loaded completely into memory.

In [None]:
import itertools
import tensorflow as tf

In [None]:
def make_example(seq, label):
    """
    Packs one sequence and its label into a `tf.train.Example`.

    `seq` is cast to int and flattened; `label` is cast to int.  Both are
    stored as int64-list features under the keys 'input' and 'label'.
    """
    def int64_feature(values):
        # Wrap a plain list of ints as an int64-list feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    flat_seq = seq.astype(int).reshape(-1).tolist()
    label_values = label.astype(int).tolist()

    feature_map = {
        'input': int64_feature(flat_seq),
        'label': int64_feature(label_values),
    }
    return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [None]:
!mkdir -p data_00

In [None]:
# Serialize every (sequence, label) pair from `X` and `y` into a single TFRecord file.
records_path = "data_v1/records.data"

# Make sure the target directory exists so the writer does not fail on a fresh checkout.
records_dir = os.path.dirname(records_path)
if not os.path.isdir(records_dir):
    os.makedirs(records_dir)

# The writer opens the path itself, so no separate Python file handle is needed.
# Using it as a context manager closes the file even if serialization raises.
with tf.python_io.TFRecordWriter(records_path) as writer:
    for seq, label in itertools.izip(X, y):
        writer.write(make_example(seq, label).SerializeToString())

## Deserialize TFRecord / Example

This section just shows the use of TFRecords in the compute graph.

In [None]:
def extract_features(serialized_example):
    """
    Parses a serialized `tf.train.Example` into a `dict` of named features.

    Both features are fixed-length int64: 'label' has shape [99] and
    'input' has shape [20, 111].
    """
    feature_spec = {
        'label': tf.FixedLenFeature([99], dtype=tf.int64),
        'input': tf.FixedLenFeature([20, 111], dtype=tf.int64),
    }
    return tf.parse_single_example(
        serialized=serialized_example,
        features=feature_spec,
    )

def deserialize_example(serialized_example):
    """
    Parses a serialized `tf.train.Example` and returns `(input, label)`
    as float32 tensors.
    """
    parsed = extract_features(serialized_example)
    # Cast in a fixed order so the result is always (input, label).
    return tuple(
        tf.cast(parsed[key], tf.float32) for key in ('input', 'label')
    )

### Compute Graph

In [None]:
# Placeholder that is fed one serialized `tf.train.Example` at run time.
input = tf.placeholder(dtype=tf.string)
# `features` is the decoded 'input' tensor and `label` the decoded 'label' tensor.
features, label = deserialize_example(serialized_example=input)

### Evaluate the Compute Graph

Compute graphs must be evaluated in a `tf.Session`.

In [None]:
import numpy as np
from random import randint

# Pick one record at random and serialize it the same way the writer cell does.
random_index = randint(0, len(X) - 1)
expected_x, expected_y = X[random_index], y[random_index]
example = make_example(expected_x, expected_y).SerializeToString()

# Feed the serialized bytes through the graph to decode them again.
with tf.Session() as sess:
    _x, _y = sess.run([features, label], feed_dict={input: example})

# The decoded tensors must match the original arrays (round-trip check).
assert np.array_equal(_x, expected_x)
assert np.array_equal(_y, expected_y)