In [None]:
import tensorflow as tf
import numpy as np

# TF Record

## Designing TF Records

- TF records should be split into multiple files to take advantage of parallel
  I/O
- There should be at least 10 times as many files as there are hosts reading
  the data

### Size

- Each TF record file should be at least 10 MB, and ideally 100 MB or more
- TF record files that are too big can be slow to read

## Writing TF Record

In [None]:
def float_list_feature(values):
    """Build a ``tf.train.Feature`` whose ``float_list`` holds ``values``."""
    float_list = tf.train.FloatList(value=values)
    return tf.train.Feature(float_list=float_list)

def string_list_feature(values):
    """Build a ``tf.train.Feature`` whose ``bytes_list`` holds encoded strings.

    Args:
        values: Iterable of ``str`` and/or ``bytes`` items. ``str`` items are
            encoded as UTF-8; ``bytes`` items are stored unchanged.

    Returns:
        A ``tf.train.Feature`` with its ``bytes_list`` field populated.

    Raises:
        TypeError: If an item is neither ``str`` nor ``bytes``.
    """
    # bytes(x, "UTF-8") raises TypeError for anything that is not a str, so
    # items that are already bytes are passed through instead of re-encoded.
    encoded = [
        item if isinstance(item, bytes) else bytes(item, "UTF-8")
        for item in values
    ]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded))

def int64_list_feature(values):
    """Build a ``tf.train.Feature`` whose ``int64_list`` holds ``values``."""
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)

In [None]:
def serialize_label(label):
    """Serialize a label mapping into a ``tf.train.Example`` byte string.

    Args:
        label: Mapping with the keys
            - ``"location"``: values stored as an int64 list feature.
            - ``"name"``: a single string, stored as a one-element bytes list.
            - ``"data"`` (optional): flat sequence of numbers stored as a
              float list feature. When the key is absent the placeholder
              ``[1, 2, 3]`` is written.

    Returns:
        The serialized ``tf.train.Example`` as ``bytes``.

    Note:
        The reading cell in this notebook parses ``"data"`` with a fixed shape
        of ``(3,)``, so custom data of another length needs a matching change
        to the feature spec used for decoding.
    """
    if "data" in label:
        data = np.asarray(label["data"], dtype=np.float32)
    else:
        # Placeholder payload used when the caller does not supply any data.
        data = np.array([1, 2, 3], dtype=np.float32)

    feature = {
        "location": int64_list_feature(label["location"]),
        "name": string_list_feature([label["name"]]),
        "data": float_list_feature(data),
    }

    example = tf.train.Example(features=tf.train.Features(feature=feature))

    return example.SerializeToString()

In [None]:
# Serialize a single sample label; the cell displays the resulting bytes.
serialize_label(dict(name="peter griffin", location=[0, 1, 0]))

b'\nJ\n\x13\n\x08location\x12\x07\x1a\x05\n\x03\x00\x01\x00\n\x19\n\x04name\x12\x11\n\x0f\n\rpeter griffin\n\x18\n\x04data\x12\x10\x12\x0e\n\x0c\x00\x00\x80?\x00\x00\x00@\x00\x00@@'

In [None]:
# Labels to store, written to the file in this order (one record each).
people = (
    {"name": "peter griffin", "location": [0, 1, 0]},
    {"name": "stewie griffin", "location": [1, 0, 0]},
)

with tf.io.TFRecordWriter("people.tfrecord") as writer:
    for person_label in people:
        writer.write(serialize_label(person_label))

## Reading TF Record

In [None]:
# Dataset of raw (still serialized) records read from the TFRecord file.
raw_ds = tf.data.TFRecordDataset(filenames="people.tfrecord")

In [None]:
def decode(raw_person):
    """Parse one serialized ``tf.train.Example`` into a dict of tensors.

    Args:
        raw_person: A serialized example, as yielded by the raw TFRecord
            dataset.

    Returns:
        A dict with the keys ``"name"`` (scalar string), ``"location"``
        (int64, shape ``(3,)``) and ``"data"`` (float32, shape ``(3,)``).
    """
    feature_spec = {
        "name": tf.io.FixedLenFeature([], dtype=tf.string),
        "location": tf.io.FixedLenFeature((3,), dtype=tf.int64),
        "data": tf.io.FixedLenFeature((3,), dtype=tf.float32),
    }
    return tf.io.parse_single_example(
        serialized=raw_person, features=feature_spec
    )

# Apply the parser to every raw record; the bare name displays the dataset.
ds = raw_ds.map(map_func=decode)
ds


<MapDataset shapes: {data: (3,), location: (3,), name: ()}, types: {data: tf.float32, location: tf.int64, name: tf.string}>

In [None]:
# Print the decoded fields of every record in the dataset.
for person in ds:
    name, location, data = (
        person[key] for key in ("name", "location", "data")
    )

    print(f"name = {name}, location = {location}, data = {data}")


name = b'peter griffin', location = [0 1 0], data = [1. 2. 3.]
name = b'stewie griffin', location = [1 0 0], data = [1. 2. 3.]


2022-03-21 21:20:09.929879: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
