# TensorFlow Records example
This notebook contains scratch code for serializing an existing training example to a TFRecord file, reading it back, and then writing and reading a validation dataset stored as TFRecords.

In [1]:
# imports
import numpy as np
import tensorflow as tf

from komorebi.libs.utilities.io_utils import load_pickle_object

## Load single training example
We load the pickled sharded dataset and pull a single training example from it.

In [9]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists. The file is unpickled, so it must come from a trusted source.
sharded_dataset_path = "/Users/andy/Projects/biology/research/komorebi/data/attention_validation_dataset/sharded_attention_dataset.pkl"
dataset = load_pickle_object(sharded_dataset_path)

# Fetch the examples at indices 0 and 1 and keep the first one as the sample.
training_examples = dataset.get_training_examples(range(2))
te = training_examples[0]

# Record the shape of every array carried by the sample.
sequence_shape, label_shape, annotation_shape = (
    te.sequence.shape,
    te.label.shape,
    te.annotation.shape,
)

print("sequence shape: {}".format(sequence_shape))
print("label shape: {}".format(label_shape))
print("annotation shape: {}".format(annotation_shape))

sequence shape: (1000, 4)
label shape: (919,)
annotation shape: (75, 320)


## Convert training example to tf record.
Prototype code to convert training example to tf record.

### Functions for writing tf example

In [5]:
# Where the single serialized example is written.
TF_SINGLE_RECORD_PATH = "/tmp/training_example_0.tfrecords"

# Shapes used to reshape the flat decoded buffers when reading an example back.
# They match the shapes printed for the sample training example.
ANNOTATION_SHAPE = (75, 320)
SEQUENCE_SHAPE = (1000, 4)

# define feature translation functions
def _int64_feature(value):
    """Wrap a single integer in a tf.train.Feature holding a one-element Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature holding a one-element BytesList."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

def convert_to_tf_example(training_example):
    """Convert a training example into a tf.train.Example.

    The `sequence`, `label` and `annotation` arrays are each dumped to their
    raw bytes and stored as a single bytes feature. Neither shape nor dtype is
    stored in the record, so the reader has to know both to decode the buffers.

    Args:
        training_example: object exposing `sequence`, `label` and `annotation`
            numpy arrays.

    Returns:
        tf.train.Example with the features 'sequence_raw', 'label_raw' and
        'annotation_raw'.
    """
    # `ndarray.tobytes()` replaces the deprecated `tostring()` alias and
    # produces the same bytes.
    feature = {
        'sequence_raw': _bytes_feature(training_example.sequence.tobytes()),
        'label_raw': _bytes_feature(training_example.label.tobytes()),
        'annotation_raw': _bytes_feature(training_example.annotation.tobytes()),
    }
    # Protobuf message constructors take their fields as keyword arguments,
    # which is why `features=` and `feature=` are spelled out.
    return tf.train.Example(features=tf.train.Features(feature=feature))

### Write TF example to disk

In [4]:
# Convert the first training example to a tf.train.Example.
tf_example = convert_to_tf_example(training_examples[0])

# Write it to disk. The `with` block closes the writer even if the write
# raises, so the file handle is not leaked.
with tf.python_io.TFRecordWriter(TF_SINGLE_RECORD_PATH) as writer:
    writer.write(tf_example.SerializeToString())

### Functions for reading tf example

In [2]:
def parse_example(tf_example):
    """Parse one serialized tf.train.Example into decoded tensors.

    Args:
        tf_example: a serialized tf.train.Example, either a bytes string or a
            scalar string tensor (as passed by `Dataset.map`).

    Returns:
        dict with the tensors:
            'sequence':   raw bytes decoded as uint8, reshaped to SEQUENCE_SHAPE.
            'label':      raw bytes decoded as uint8, left flat.
            'annotation': raw bytes decoded as float32, reshaped to ANNOTATION_SHAPE.
    """
    features_map = {
        'sequence_raw': tf.FixedLenFeature([], tf.string),
        'label_raw': tf.FixedLenFeature([], tf.string),
        'annotation_raw': tf.FixedLenFeature([], tf.string)}

    # Parse the argument itself. The previous version read the notebook-level
    # `serialized_example` instead, so every call (including each element of a
    # `Dataset.map`) decoded that one captured record.
    parsed_example = tf.parse_single_example(tf_example, features_map)

    # The record stores no dtype information. The dtypes below assume the
    # arrays were written as uint8 / uint8 / float32 -- TODO confirm against
    # the dataset that produced the records.
    sequence_raw = tf.decode_raw(parsed_example['sequence_raw'], tf.uint8)
    label = tf.decode_raw(parsed_example['label_raw'], tf.uint8)
    annotation_raw = tf.decode_raw(parsed_example['annotation_raw'], tf.float32)

    # decode_raw yields flat vectors; restore the known 2-D layouts.
    sequence = tf.reshape(sequence_raw, SEQUENCE_SHAPE)
    annotation = tf.reshape(annotation_raw, ANNOTATION_SHAPE)

    return {'sequence': sequence, 'label': label, 'annotation': annotation}

### Read tf example

In [6]:
# NOTE(review): this is not the file written to TF_SINGLE_RECORD_PATH above, and
# the directory name is spelled "valiation" -- confirm this is the intended path.
TF_RECORD = "/tmp/attention_valiation_tf_records/example_1001.tfrecord"

# Take the first serialized record from the file.
record_iterator = tf.python_io.tf_record_iterator(TF_RECORD)
serialized_example = next(record_iterator)

# Despite its name, `feed_dict` is the dict of decoded tensors returned by
# parse_example.
feed_dict = parse_example(serialized_example)

### Validate deserialization

In [7]:
# Evaluate all three decoded tensors in one graph run. The `with` block closes
# the session even if evaluation raises.
with tf.Session() as sess:
    decoded_sequence, decoded_annotation, decoded_label = sess.run(
        [feed_dict['sequence'], feed_dict['annotation'], feed_dict['label']])

In [11]:
# Check that the decoded arrays have the expected shapes.
print(decoded_sequence.shape)
print(decoded_annotation.shape)
print(decoded_label.shape)

# NOTE(review): the element-wise comparisons against `te` are disabled.
# The record read above comes from TF_RECORD, not from the file written from
# `te`, so they may not be expected to match -- confirm before re-enabling.
#print np.array_equal(decoded_sequence, te.sequence)
#print np.array_equal(decoded_label, te.label)
#print np.array_equal(decoded_annotation, te.annotation)

(1000, 4)
(75, 320)
(919,)


## Create validation dataset in tf records

In [9]:
TF_VALIDATION_DATASET = "/tmp/validation_dataset.tfrecord"
training_examples = dataset.get_training_examples(range(8000))

# Convert and write one example at a time, so the full list of protos is never
# held in memory alongside the training examples. The loop variable is not
# named `te`, which keeps the sample example loaded earlier intact (a Python 2
# list comprehension would rebind it). The `with` block closes the writer even
# if a conversion or write fails.
with tf.python_io.TFRecordWriter(TF_VALIDATION_DATASET) as writer:
    for training_example in training_examples:
        tf_example = convert_to_tf_example(training_example)
        writer.write(tf_example.SerializeToString())

In [12]:
# Build a dataset that yields the serialized records of the validation file.
filenames = [TF_VALIDATION_DATASET]
tf_dataset = tf.data.TFRecordDataset(filenames=filenames)

In [14]:
# `tf.data.Dataset.map` takes `num_parallel_calls`; the older `num_threads` /
# `output_buffer_size` keywords belonged to the contrib Dataset API. Output
# buffering is now expressed with an explicit `.prefetch()`.
# This cell rebinds `tf_dataset`, so re-run the TFRecordDataset cell before
# re-running it; otherwise the records would be parsed twice.
tf_dataset = tf_dataset.map(parse_example, num_parallel_calls=6).prefetch(250)

In [15]:
# One-shot iterator over the parsed dataset. `next_element` is the dict of
# tensors for the next example, and each session run of it advances the iterator.
iterator = tf_dataset.make_one_shot_iterator()
next_element = iterator.get_next()

In [16]:
# The interactive session is left open on purpose: later cells evaluate
# tensors against it.
sess = tf.InteractiveSession()

# Fetch both tensors in one run. Every run of `next_element` advances the
# iterator, so two separate `.eval()` calls would return the sequence of one
# example and the label of the next.
sequence, label = sess.run([next_element['sequence'], next_element['label']])

print(sequence.shape)
print(label.shape)

(1000, 4)
(919,)


In [18]:
# Group the parsed examples into batches of 100 and expose the next batch.
BATCH_SIZE = 100
batched_dataset = tf_dataset.batch(BATCH_SIZE)
batched_iter = batched_dataset.make_one_shot_iterator()
batched_next = batched_iter.get_next()

In [21]:
b_element = batched_next
b_sequence = b_element['sequence']
b_annotation = b_element['annotation']
b_label = b_element['label']

# Evaluate the three tensors in one run so they all come from the same batch.
# Separate `.eval()` calls would each pull a new batch from the iterator.
# `sess` is the interactive session opened in the previous evaluation cell.
b_sequence_value, b_annotation_value, b_label_value = sess.run(
    [b_sequence, b_annotation, b_label])

print(b_sequence_value.shape)
print(b_annotation_value.shape)
print(b_label_value.shape)

(100, 1000, 4)
(100, 75, 320)
(100, 919)
