
Load and parse data with TensorFlow

A TensorFlow example to build input pipelines for loading data efficiently.

    Numpy Arrays
    Images
    CSV file
    Custom data from a Generator

In [0]:
from __future__ import absolute_import, division, print_function

import numpy as np
import random
import requests
import string
import tarfile
import tensorflow as tf

In [0]:
#create toy dataset (even and odd nmbers with respective labels of 0 and 1)
evens=np.arange(0,100, step=2, dtype=np.int32)
evens_label=np.zeros(50,dtype=np.int32)
odds=np.arange(1,100,step=2,dtype=np.int32)
odds_label=np.ones(50,dtype=np.int32)
#concatenate arrays
features=np.concatenate([evens,odds])
labels=np.concatenate([evens_label, odds_label])

In [5]:
with tf.Graph().as_default():
  sess=tf.Session()
  
  #slice numpy arrays (each row becoming a record)
  data =tf.data.Dataset.from_tensor_slices((features,labels))
  #refill data indefinitely
  data=data.repeat()
  #shuffle data
  data=data.shuffle(buffer_size=100)
  #batch data (aggregate records together)
  data=data.batch(batch_size=4)
  #prefetch batch (pre-load batch for faster computation)
  data=data.prefetch(buffer_size=1)
  
  #create an iterator over dataset
  iterator=data.make_initializable_iterator()
  #initialize iterator
  sess.run(iterator.initializer)
  
  #get next data batch
  d=iterator.get_next()

W0826 12:53:57.257271 140463342319488 deprecation.py:323] From <ipython-input-5-d52c7b9345c6>:16: make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.


In [7]:
#display data
for i in range(5):
  x,y=sess.run(d)
  print(x,y)

[59 10 78 31] [1 0 0 1]
[62 18 34 47] [0 0 0 1]
[74 60 48 99] [0 0 0 1]
[28 55 69 23] [0 1 1 1]
[38  1 70 87] [0 1 0 1]


Load CSV files

Build a pipeline from features stroed in CSV file. For this example, Titanic dataset will be used as a toy dataset stored in CSV format.

In [0]:
#download titanic dataset (in csv format)
d=requests.get("https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv")
with open("titanic_dataset.csv","wb") as f:
  f.write(d.content)

In [0]:
#load titanic dataset
#original features: survived, pclass, name, sex, age, sibsp, parch , ticket, fare
#select specific colunmns : survived, plclass, name, sex, age, fare
columns_to_use=[0,1,2,3,4,8]
record_defaults=[tf.int32, tf.int32, tf.string, tf.string,tf.float32,tf.float32]

In [0]:
with tf.Graph().as_default():
  #create tf session
  sess=tf.Session()
  
  #load whole dataset file, slice each line
  data=tf.data.experimental.CsvDataset("titanic_dataset.csv",
                                      record_defaults, header=True, select_cols=columns_to_use)
  #refill data indefinitely
  data=data.repeat()
  #shuffle data
  data=data.shuffle(buffer_size=1000)
  #batch data (aggregate records toogetheer)
  data=data.batch(batch_size=2)
  #prefetch batch (pre-load batch for faster consumption)
  data=data.prefetch(buffer_size=1)
  
  #create iterator over dataset
  iterator=data.make_initializable_iterator()
  #initialize iterator
  sess.run(iterator.initializer)
  
  #get next data batch
  d=iterator.get_next()

In [15]:
#displlay data
for i in range(3):
  survived, pclass, name, sex, age, fare=sess.run(d)
  print(survived)
  print(pclass)
  print(name)
  print(sex)
  print(age)
  print(fare)
  print("")

[1 0]
[1 1]
['Maioni, Miss. Roberta' 'Ross, Mr. John Hugo']
['female' 'male']
[16. 36.]
[86.5   40.125]

[1 1]
[2 2]
['Quick, Miss. Winifred Vera' 'Caldwell, Mr. Albert Francis']
['female' 'male']
[ 8. 26.]
[26. 29.]

[0 1]
[1 3]
['Futrelle, Mr. Jacques Heath' 'Albimona, Mr. Nassef Cassem']
['male' 'male']
[37. 26.]
[53.1    18.7875]



Load Images

Build a data pipeline by loading images from disk. For this example, Oxford Flowers dataset will be used

In [0]:
#download Oxford 17 flowers dataset
d=requests.get("http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz")
with open("17flowers.tgz","wb") as f:
  f.write(d.content)
#extract archive
with tarfile.open("17flowers.tgz") as t:
  t.extractall()

In [0]:
#create a file to list all images path and their corresponding label
with open('jpg/dataset.csv','w') as f:
  c=0
  for i in range(1360):
    f.write("jpg/image_%04i.jpg,%i\n"%(i+1,c))
    if (i+1)%80==0:
      c+=1

In [0]:
with tf.Graph().as_default():
  
  #load images
  with open("jpg/dataset.csv") as f:
    dataset_file=f.read().splitlines()
    
  #create tf session
  sess=tf.Session()
  
  #load whole dataset file, and slice each line
  data=tf.data.Dataset.from_tensor_slices(dataset_file)
  #refill data indefinitely
  data=data.repeat()
  #shuffle data
  data=data.shuffle(buffer_size=1000)
  
  #load and pre process images
  def load_image(path):
    #read image from path
    image=tf.io.read_file(path)
    #decode jpeg image to array [0,255]
    image=tf.image.decode_jpeg(image)
    #resize image to a common size of 256x256
    image=tf.image.resize(image,[256,256])
    #rescale values to [-1,1]
    image=1.0 - image/127.5
    return image
  
  #decode each line from dataset file
  def parse_records(line):
    #file is in csv format : "image_path, label_id"
    #tensorflow requires a default value, but it will never be used
    image_path, image_label =tf.io.decode_csv(line, ["",0])
    #apply function to load images
    image=load_image(image_path)
    return image, image_label
  
  #use 'map' to apply above functions in parallel
  data=data.map(parse_records, num_parallel_calls=4)
  
  #batch data (aggreaget images-array together)
  data=data.batch(batch_size=2)
  #prefetch batch (pre-load batch for faster consumption)
  data=data.prefetch(buffer_size=1)
  
  #create an iterator over dataset
  iterator=data.make_initializable_iterator()
  #initialize iterator
  sess.run(iterator.initializer)
  
  #get next data batch
  d=iterator.get_next()

In [26]:
#display data
for i in range(1):
  batch_x, batch_y=sess.run(d)
  print(batch_x, batch_y)

[[[[ 0.84313726  0.9607843   0.9372549 ]
   [ 0.9185049   1.          1.        ]
   [ 0.8963235   0.9904412   0.9747549 ]
   ...
   [ 0.9973039   0.8915441   0.94509804]
   [ 1.          0.95735294  1.        ]
   [ 0.90931374  0.907598    0.9468137 ]]

  [[ 0.88235295  1.          0.9992647 ]
   [ 0.80053234  0.95592445  0.944874  ]
   [ 0.8418237   0.98324525  0.97973347]
   ...
   [ 0.96867913  0.8577091   0.911263  ]
   [ 0.9643038   0.9167509   0.9671607 ]
   [ 0.91074216  0.9090265   0.9482422 ]]

  [[ 0.6965686   0.92745095  0.94166666]
   [ 0.7243489   0.9456725   0.9642195 ]
   [ 0.660769    0.8752298   0.8972886 ]
   ...
   [ 0.813668    0.6873238   0.73633957]
   [ 0.9454274   0.8942019   0.9290135 ]
   [ 0.89244026  0.88084024  0.9200559 ]]

  ...

  [[ 0.27941173  0.42843133  0.4732843 ]
   [-0.15079093 -0.00177133  0.04308164]
   [-0.3964423  -0.24779415 -0.2197305 ]
   ...
   [ 0.45693928  0.48899162  0.65089035]
   [ 0.42541742  0.4257812   0.6143842 ]
   [ 0.75019336 

Load data from a Generator

Build a data pipeline from a custom generator.
For this example, a toy generator yielding random string, vector and int is used

In [0]:
#create dummy generator
def generate_features():
  #function to generate random string
  def random_string(length):
    return ''.join(random.choice(string.ascii_letters) for m in xrange(length))
  #return random string, random vector and random int
  yield random_string(4), np.random.uniform(size=4), random.randint(0,10)

In [0]:
with tf.Graph().as_default():
  sess=tf.Session()
  
  #create TF dataset from generator
  data=tf.data.Dataset.from_generator(generate_features, output_types=(tf.string,
                                                                      tf.float32,
                                                                      tf.int32))
  #refill data indefinitely
  data=data.repeat()
  #shuffle data
  data=data.shuffle(buffer_size=100)
  #batch data (aggregate records together)
  data=data.batch(batch_size=4)
  #prefetch batch (pre-load batch for faster consumption)
  data=data.prefetch(buffer_size=1)
  
  #create an iterator over dataset
  iterator=data.make_initializable_iterator()
  #initialize the iterator
  sess.run(iterator.initializer)
  
  #get next data batch
  d=iterator.get_next()

In [32]:
#display data
for i in range(5):
  batch_str, batch_vector, batch_int=sess.run(d)
  print(batch_str, batch_vector, batch_int)

['arbW' 'pScG' 'KZIr' 'zNRe'] [[0.27371687 0.04913669 0.25907984 0.84514105]
 [0.46897408 0.19587737 0.6731533  0.7314421 ]
 [0.11443488 0.90311354 0.8957534  0.1712661 ]
 [0.8081347  0.09573816 0.14854978 0.623093  ]] [3 6 5 4]
['LbuL' 'ghlr' 'UsZn' 'RmWl'] [[0.92380947 0.8743449  0.5858188  0.45991743]
 [0.19383323 0.47632334 0.5417587  0.34051272]
 [0.0375283  0.3907353  0.09617063 0.24519591]
 [0.37095007 0.10410413 0.32756004 0.5564047 ]] [3 9 8 6]
['KoMw' 'qFrf' 'OvIr' 'vCvd'] [[0.03106382 0.2384225  0.9419692  0.09193267]
 [0.3254956  0.89814734 0.39259553 0.06813686]
 [0.6031087  0.36409938 0.19767705 0.01070746]
 [0.5872436  0.22545953 0.96978426 0.5304217 ]] [ 1 10  0  1]
['lZeL' 'mjsy' 'xJRC' 'rQHn'] [[0.20834456 0.42881402 0.26402912 0.6814654 ]
 [0.6332764  0.5871682  0.3497198  0.28776738]
 [0.22815652 0.7827053  0.86866796 0.00186023]
 [0.72032577 0.2193191  0.5344611  0.7099987 ]] [10  0  7  2]
['aZbi' 'yulU' 'BbvR' 'LcVX'] [[0.42651826 0.6934289  0.17446402 0.59790736]