
Create and Load TFRecords

A simple TensorFlow example to parse a dataset into TFRecord format, and then read that dataset.

In this example, the Titanic dataset (in CSV format) is used as a toy dataset: all of its features are converted into TFRecord format, and then an input pipeline is built on top of the TFRecord file that can be used for training models.


Titanic Dataset

The Titanic dataset is a popular dataset for ML that provides a list of all passengers onboard the Titanic, along with various features such as their age, sex, and class (1st, 2nd, 3rd), as well as whether or not the passenger survived the disaster.

It can be used to see that even though some luck was involved in surviving the sinking, some groups of people were more likely to survive than others, such as women, children, and the upper-class...


Variable Descriptions:

survived        Survived
                (0 = No; 1 = Yes)
                
pclass          Passenger Class
                (1 = 1st; 2 = 2nd; 3 = 3rd)
                
name            Name

sex             Sex

age             Age

sibsp           Number of Siblings/Spouses Aboard

parch           Number of Parents/Children Aboard

ticket          Ticket Number

fare            Passenger Fare

In [0]:
from __future__ import absolute_import, print_function, division

import csv
import requests
import tensorflow as tf

In [0]:
#download titanic dataset (in csv format)
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page (404/500...) from being silently
# written to disk as if it were the CSV file.
response=requests.get(
    "https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv",
    timeout=30)
response.raise_for_status()
# Write the raw bytes exactly as received.
with open("titanic_dataset.csv","wb") as f:
  f.write(response.content)

Create TFRecords

In [0]:
#wrap one integer value as a TF feature
def build_int64_feature(data):
  """Return a tf.train.Feature whose int64_list holds the single value `data`."""
  int_list=tf.train.Int64List(value=[data])
  return tf.train.Feature(int64_list=int_list)

#wrap one float value as a TF feature
def build_float_feature(data):
  """Return a tf.train.Feature whose float_list holds the single value `data`."""
  float_list=tf.train.FloatList(value=[data])
  return tf.train.Feature(float_list=float_list)

#generate string features
def build_string_feature(data):
  """Return a tf.train.Feature whose bytes_list holds the single value `data`.

  Args:
    data: a text string (encoded here as UTF-8) or an already-encoded
      bytes value (stored unchanged).

  Returns:
    A tf.train.Feature with exactly one element in its bytes_list.
  """
  # BytesList only accepts bytes, so text values (e.g. a Python 3 `str`)
  # must be encoded first; bytes (including a Python 2 `str`) pass through.
  if not isinstance(data, bytes):
    data=data.encode("utf-8")
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[data]))

#assemble one TF example from the fields of a single dataset row
def convert_to_tfexample(survived, pclass, name, sex, age, sibsp, parch, ticket, fare):
  """Build a tf.train.Example with one feature per argument.

  Integer features: survived, pclass, sibsp, parch.
  Float features: age, fare.
  String features: name, sex, ticket.

  Returns:
    A tf.train.Example whose feature map is keyed by the argument names.
  """
  feature_map={
      'survived':build_int64_feature(survived),
      'pclass':build_int64_feature(pclass),
      'name':build_string_feature(name),
      'sex':build_string_feature(sex),
      'age':build_float_feature(age),
      'sibsp':build_int64_feature(sibsp),
      'parch':build_int64_feature(parch),
      'ticket':build_string_feature(ticket),
      'fare':build_float_feature(fare),
  }
  return tf.train.Example(features=tf.train.Features(feature=feature_map))

In [0]:
#read the csv dataset and write one serialized tf example per row to the TFRecord file
with open("titanic_dataset.csv") as f, \
     tf.io.TFRecordWriter("titanic_dataset.tfrecord") as w:
  #csv reader will read and parse all rows
  reader=csv.reader(f, skipinitialspace=True)
  #the first row is the header (column names), not data: consume and discard it
  next(reader, None)
  for record in reader:
    survived, pclass, name, sex, age, sibsp, parch, ticket, fare=record
    #cast the numeric columns, then build the tf example for this row
    example=convert_to_tfexample(
        int(survived), int(pclass), name, sex, float(age),
        int(sibsp), int(parch), ticket, float(fare))
    #serialize the tf example to a string, and append it to the TFRecord file
    w.write(example.SerializeToString())

Load TFRecords

In [0]:
#features template used for parsing: every feature is a scalar (shape [])
#of the listed type, matching the types written by convert_to_tfexample
_SCALAR_FEATURE_DTYPES=(
    ('survived',tf.int64),
    ('pclass',tf.int64),
    ('name',tf.string),
    ('sex',tf.string),
    ('age',tf.float32),
    ('sibsp',tf.int64),
    ('parch',tf.int64),
    ('ticket',tf.string),
    ('fare',tf.float32),
)
features={
    key:tf.io.FixedLenFeature([],dtype)
    for key, dtype in _SCALAR_FEATURE_DTYPES
}

In [0]:
#create the tensorflow session used to run the input pipeline
sess=tf.Session()

#parse one serialized record, using the features template defined above
def parse_record(record):
  """Decode a serialized tf example into a dict of tensors keyed by feature name."""
  return tf.io.parse_single_example(record, features=features)

#TFRecord file(s) to load
filenames=["titanic_dataset.tfrecord"]

#input pipeline, applied in this order:
#  map      -> parse every record from the TFRecord file
#  repeat   -> refill data indefinitely
#  shuffle  -> shuffle records through a 1000-element buffer
#  batch    -> aggregate records together, 4 per batch
#  prefetch -> pre-load one batch for faster consumption
#NOTE: repeat() comes before shuffle(), so the shuffle buffer can mix records
#from consecutive passes over the file.
data=(
    tf.data.TFRecordDataset(filenames)
    .map(parse_record)
    .repeat()
    .shuffle(buffer_size=1000)
    .batch(batch_size=4)
    .prefetch(buffer_size=1)
)

#create an iterator over the dataset, and initialize it
iterator=data.make_initializable_iterator()
sess.run(iterator.initializer)

#tensor(s) yielding the next data batch each time they are evaluated
x=iterator.get_next()

In [46]:
#dequeue three batches from the pipeline and display each one
for _ in range(3):
  batch=sess.run(x)
  print(batch)
  print("")

{'fare': array([10.5   , 27.7208, 73.5   , 26.    ], dtype=float32), 'name': array(['Stokes, Mr. Philip Joseph',
       'Lindstrom, Mrs. Carl Johan (Sigrid Posse)',
       'Deacon, Mr. Percy William',
       'Phillips, Miss. Kate Florence ("Mrs Kate Louise Phillips Marshall")'],
      dtype=object), 'age': array([25., 55., 17., 19.], dtype=float32), 'parch': array([0, 0, 0, 0]), 'pclass': array([2, 1, 2, 2]), 'sex': array(['male', 'female', 'male', 'female'], dtype=object), 'survived': array([0, 1, 0, 1]), 'sibsp': array([0, 0, 0, 0]), 'ticket': array(['F.C.C. 13540', '112377', 'S.O.C. 14879', '250655'], dtype=object)}

{'fare': array([51.8625,  7.8542, 27.    ,  7.925 ], dtype=float32), 'name': array(['McCarthy, Mr. Timothy J', 'Braf, Miss. Elin Ester Maria',
       'Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)',
       'Ilmakangas, Miss. Ida Livija'], dtype=object), 'age': array([54., 20., 24., 27.], dtype=float32), 'parch': array([0, 0, 1, 0]), 'pclass': array([1, 3, 2, 3]), 