# Pickle to TFRecord

This notebook converts the original pickled data into TensorFlow records.  Version 2 stored a compact representation of the data.

In [1]:
import cPickle
import datetime
import os

In [2]:
# load data from pickle
f = open( 'data.pkl', 'r' )

classes = cPickle.load( f )
chars = cPickle.load( f )
char_indices = cPickle.load( f )
indices_char = cPickle.load( f )

maxlen = cPickle.load( f )
step = cPickle.load( f )

X = cPickle.load( f )
y = cPickle.load( f )

f.close()

This next step loads the entire data set into system memory.  This may fail on small memory systems.  A 32GB workstations fail.

## Conversion to Serialized TFRecord

This section generates `tf-records.data` from the original `data.pkl` that was loaded into memory.  The record based approach is useful for reducing the size of the stored dataset and running on systems where the data can not be loaded completely into memory.

In [7]:
import itertools
import tensorflow as tf
import numpy as np

In [8]:
def make_example(seq, label):
    x = np.transpose(np.nonzero(seq))
    y = np.transpose(np.nonzero(label)).reshape(-1)
    return tf.train.Example(features=tf.train.Features(feature={
        'input_dense_dimensions': tf.train.Feature(int64_list=tf.train.Int64List(value=seq.shape)),
        'sparse_index_dimensions': tf.train.Feature(int64_list=tf.train.Int64List(value=x.shape)),
        'input': tf.train.Feature(int64_list=tf.train.Int64List(value=x.astype(int).reshape(-1))),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=y)),
        'label_length': tf.train.Feature(int64_list=tf.train.Int64List(value=[len(label)])),
    }))

In [9]:
!mkdir -p data_v2

ValueError: filedescriptor out of range in select()

In [10]:
from random import randint

nfiles = 1024

files = [open("data_v2/records_{0:04d}".format(idx), "w") for idx in range(0,nfiles)]
writers = [tf.python_io.TFRecordWriter(file.name) for file in files]

for input, label in itertools.izip(X, y):
    random_file = randint(0, nfiles-1)
    writers[random_file].write(make_example(input, label).SerializeToString())

for file, writer in itertools.izip(files, writers):
    writer.close()
    file.close()

## Deserialize TFRecord / Example

This sections just shows the use of TFRecords in the compute graph.

In [13]:
def extract_features(serialized_example):
    """
    Extracts a `dict` of named features from the serialized `tf.train.Example`
    """
    return tf.parse_single_example(
        serialized=serialized_example,
        features={
            'input_dense_dimensions': tf.FixedLenFeature([2], dtype=tf.int64),
            'sparse_index_dimensions': tf.FixedLenFeature([2], dtype=tf.int64),
            'input': tf.FixedLenFeature([80], dtype=tf.int64),
            'label': tf.FixedLenFeature([1], dtype=tf.int64),
            'label_length': tf.FixedLenFeature([1], dtype=tf.int64),
        }
    )

def deserialize_example(serialized_example):
    """
    Converts a serialized `tf.train.Example` to FP32 Tensors
    """
    features = extract_features(serialized_example)
    shape_sparse = tf.cast(features['sparse_index_dimensions'], tf.int32)
    indices = tf.reshape(tf.cast(features['input'], tf.int32), shape_sparse)
    values = tf.ones([shape_sparse[0]])
    shape_dense = tf.cast(features['input_dense_dimensions'], tf.int32)
    input = tf.sparse_to_dense(indices, shape_dense, values)
    label = tf.one_hot(features['label'][0], 99, on_value=1., off_value=0., dtype=tf.float32)
    return input, label

### Compute Graph

In [14]:
input = tf.placeholder(tf.string)
features, label = deserialize_example(input)

### Evaluate the Compute Graph

Compute graphs must be evaluated in a `tf.Session`

In [18]:
import numpy as np

random_index = randint(0, len(X)-1)
example = make_example(X[random_index], y[random_index]).SerializeToString()

with tf.Session() as sess:
    _x, _y = sess.run([features, label], feed_dict={
        input: example,
    })
    assert np.array_equal(_x, X[random_index])
    assert np.array_equal(_y, y[random_index])