In [2]:
# Requires TensorFlow 2.x.
import tensorflow as tf
import numpy as np

In [4]:
# https://github.com/kmonachopoulos/ImageNet-to-TFrecord

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os
import random
import sys
import threading
from six.moves import xrange  # pylint: disable=redefined-builtin


def _int64_feature(value):
    """Wrapper for inserting int64 features into Example proto."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))


def _float_feature(value):
    """Wrapper for inserting float features into Example proto."""
    if not isinstance(value, list):
        value = [value]
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


def _bytes_feature(value):
    """Wrapper for inserting bytes features into Example proto."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


In [5]:
def _convert_to_example(filename, image_buffer, label, synset, human,
                        height, width):
    """Build an Example proto for an example.

  Args:
    filename: string, path to an image file, e.g., '/path/to/example.JPG'
    image_buffer: string, JPEG encoding of RGB image
    label: integer, identifier for the ground truth for the network
    synset: string, unique WordNet ID specifying the label, e.g., 'n02323233'
    human: string, human-readable label, e.g., 'red fox, Vulpes vulpes'
    height: integer, image height in pixels
    width: integer, image width in pixels
  Returns:
    Example proto
    """

    colorspace = b'RGB'
    channels = 3
    image_format = b'JPEG'

    example = tf.train.Example(features=tf.train.Features(feature={
      'image/height': _int64_feature(height),
      'image/width': _int64_feature(width),
      'image/colorspace': _bytes_feature(colorspace),
      'image/channels': _int64_feature(channels),
      'image/class/label': _int64_feature(label),
      'image/class/synset': _bytes_feature(bytes(synset,'utf-8')),
      'image/class/text': _bytes_feature(bytes(human,'utf-8')),
      'image/format': _bytes_feature(image_format),
      'image/filename': _bytes_feature(bytes(os.path.basename(filename),'utf-8')),
      'image/encoded': _bytes_feature(image_buffer)}))
  
    return example


In [26]:
class ImageCoder(object):
    """Helper class that provides TF image coding utils."""

    def __init__(self):
        """Nothing to set up; the coder keeps no state."""

    def decode_jpeg(self, image_data):
        """Decode JPEG data via ``tf.image.decode_jpeg`` with ``channels=3``.

        Asserts that the decoded image has three dimensions and that the last
        one has size 3 before returning it.
        """
        decoded = tf.image.decode_jpeg(image_data, channels=3)
        shape = decoded.shape
        assert len(shape) == 3
        assert shape[2] == 3
        return decoded
    

In [27]:
def _process_image(filename, coder):
    image_data = tf.io.gfile.GFile(filename, 'rb').read()
    # in normal mode:  image_data = open(filename, 'rb').read()
    image = coder.decode_jpeg(image_data)
    # Check that image converted to RGB
    assert len(image.shape) == 3
    height = image.shape[0]
    width = image.shape[1]
    assert image.shape[2] == 3

    return image_data, height, width    


In [18]:
output_directory = "/data/ImageNet_tiny/"

def _process_image_files_batch(coder, thread_index, ranges, name, filenames,
                               synsets, labels, humans, num_shards):
    """Processes and saves list of images as TFRecord in 1 thread.

  Args:
    coder: instance of ImageCoder to provide TensorFlow image coding utils.
    thread_index: integer, unique batch to run index is within [0, len(ranges)).
    ranges: list of pairs of integers specifying ranges of each batches to
      analyze in parallel.
    name: string, unique identifier specifying the data set
    filenames: list of strings; each string is a path to an image file
    synsets: list of strings; each string is a unique WordNet ID
    labels: list of integer; each integer identifies the ground truth
    humans: list of strings; each string is a human-readable label
    num_shards: integer number of shards for this data set.
    """
  # Each thread produces N shards where N = int(num_shards / num_threads).
  # For instance, if num_shards = 128, and the num_threads = 2, then the first
  # thread would produce shards [0, 64).
    
    global output_directory
    
    num_threads = len(ranges)
    assert not num_shards % num_threads
    num_shards_per_batch = int(num_shards / num_threads)
    print("**************num_shards_per_batch********** ", num_shards_per_batch)  #1031
    shard_ranges = np.linspace(ranges[thread_index][0],
                             ranges[thread_index][1],
                             num_shards_per_batch + 1).astype(int)
    num_files_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
    counter = 0
    for s in  xrange(num_shards_per_batch):
     # Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
        shard = thread_index * num_shards_per_batch + s
        output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
        output_file = os.path.join(output_directory, output_filename)
        writer = tf.io.TFRecordWriter(output_file)   
        
        shard_counter = 0
        files_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) # HERE
        for i in files_in_shard:
            filename = filenames[i]
            label = labels[i]
            synset = synsets[i]
            human = humans[i]

            image_buffer, height, width = _process_image(filename, coder)
            example = _convert_to_example(filename, image_buffer, label, synset, human, height, width)
            writer.write(example.SerializeToString())
            shard_counter += 1
            counter += 1
            
            if not counter % 1000:
                print('%s [thread %d]: Processed %d of %d images in thread batch.' %
                  (datetime.now(), thread_index, counter, num_files_in_thread))
                sys.stdout.flush()
        
        writer.close()
        print('%s [thread %d]: Wrote %d images to %s' %
               (datetime.now(), thread_index, shard_counter, output_file))
        sys.stdout.flush()
        shard_counter = 0
    print('%s [thread %d]: Wrote %d images to %d shards.' %
           (datetime.now(), thread_index, counter, num_files_in_thread))
    sys.stdout.flush()       
        

In [19]:
num_threads = 1

def _process_image_files(name, filenames, synsets, labels, humans, num_shards):
    """Process and save list of images as TFRecord of Example protos.

  Args:
    name: string, unique identifier specifying the data set
    filenames: list of strings; each string is a path to an image file
    synsets: list of strings; each string is a unique WordNet ID
    labels: list of integer; each integer identifies the ground truth
    humans: list of strings; each string is a human-readable label
    num_shards: integer number of shards for this data set.
    """
    assert len(filenames) == len(synsets)
    assert len(filenames) == len(labels)
    assert len(filenames) == len(humans)
    
    global num_threads 

    # Break all images into batches with a [ranges[i][0], ranges[i][1]].
    spacing = np.linspace(0, len(filenames), num_threads + 1).astype(np.int)
    ranges = []
    threads = []
    for i in xrange(len(spacing) - 1):
        ranges.append([spacing[i], spacing[i+1]])

    # Launch a thread for each batch.
    print('Launching %d threads for spacings: %s' % (num_threads, ranges))
    sys.stdout.flush()

    coord = tf.train.Coordinator()
    coder = ImageCoder()
    threads = []
    for thread_idx in xrange(len(ranges)):
        args = (coder, thread_idx, ranges, name, filenames, synsets, labels, humans, num_shards)
        t = threading.Thread(target=_process_image_files_batch, args=args)
        t.start()
        threads.append(t)
    
    coord.join(threads)
    print('%s: Finished writing all %d images in data set.' %
           (datetime.now(), len(filenames)))
    sys.stdout.flush()   


In [20]:
def _find_image_files(data_dir, labels_file):
    """Build a list of all images files and labels in the data set.

  Args:
    data_dir: string, path to the root directory of images.

      Assumes that the ImageNet data set resides in JPEG files located in
      the following directory structure.

        data_dir/n01440764/ILSVRC2012_val_00000293.JPEG
        data_dir/n01440764/ILSVRC2012_val_00000543.JPEG

      where 'n01440764' is the unique synset label associated with these images.

    labels_file: string, path to the labels file.

      The list of valid labels are held in this file. Assumes that the file
      contains entries as such:
        n01440764
        n01443537
        n01484850
      where each line corresponds to a label expressed as a synset. We map
      each synset contained in the file to an integer (based on the alphabetical
      ordering) starting with the integer 1 corresponding to the synset
      contained in the first line.

      The reason we start the integer labels at 1 is to reserve label 0 as an
      unused background class.

  Returns:
    filenames: list of strings; each string is a path to an image file.
    synsets: list of strings; each string is a unique WordNet ID.
    labels: list of integer; each integer identifies the ground truth.
    """
    print('Determining list of input files and labels from %s.' % data_dir)
    challenge_synsets = [l.strip() for l in
                   tf.io.gfile.GFile(labels_file, 'r').readlines()]

    labels = []
    filenames = []
    synsets = []

    # Leave label index 0 empty as a background class.
    label_index = 1

    # Construct the list of JPEG files and labels.
    for synset in challenge_synsets:
        jpeg_file_path = '%s/%s/*.JPEG' % (data_dir, synset)
        matching_files = tf.io.gfile.glob(jpeg_file_path)

        labels.extend([label_index] * len(matching_files))
        synsets.extend([synset] * len(matching_files))
        filenames.extend(matching_files)

        if not label_index % 100:
            print('Finished finding files in %d of %d classes.' % (
                      label_index, len(challenge_synsets)))
            label_index += 1

    # Shuffle the ordering of all image files in order to guarantee
    # random ordering of the images with respect to label in the
    # saved TFRecord files. Make the randomization repeatable.
    shuffled_index = range(len(filenames))
    random.seed(12345)

    random.shuffle(list(range(len(shuffled_index))))

    filenames = [filenames[i] for i in shuffled_index]
    synsets = [synsets[i] for i in shuffled_index]
    labels = [labels[i] for i in shuffled_index]

    print('Found %d JPEG files across %d labels inside %s.' %
            (len(filenames), len(challenge_synsets), data_dir))
    return filenames, synsets, labels


In [21]:
def _find_human_readable_labels(synsets, synset_to_human):
    humans = []
    for s in synsets:
        assert s in synset_to_human, ('Failed to find: %s' % s)
        humans.append(synset_to_human[s])
    return humans



In [22]:
def _process_dataset(name, directory, num_shards, synset_to_human):
    """ process a complete dataset and save it to TFRecord"""
    filenames, syssets, labels = _find_image_files(directory, "/data/ImageNet_tiny/imagenet_lsvrc_2015_synsets_tiny.txt")
    humans = _find_human_readable_labels(syssets, synset_to_human)
    _process_image_files(name, filenames, syssets, labels, humans, num_shards)



In [23]:
def _build_sysset_lookup():
    """ build lookup for synset to human-readable label"""
    imagenet_metadata_file = "/data/ImageNet_tiny/imagenet_metadata_tiny.txt"
    lines = tf.io.gfile.GFile(imagenet_metadata_file, 'r').readlines()
    sysnet_to_human = {}
    for l in lines:
        if l :
            parts = l.strip().split('\t')
            assert len(parts) == 2 
            sysnet = parts[0]
            human = parts[1]
            sysnet_to_human[sysnet] = human
    return sysnet_to_human


In [28]:
# Test run: build the synset lookup, then convert the images under train_dir
# into TFRecord shards named "train-*".
# NOTE(review): train_dir differs from the module-level output_directory
# ("/data/ImageNet_tiny/") that the shard writer uses — confirm both point at
# the intended location.
train_dir="/home/lidavid/data/ImageNet_tiny/"
train_shards = 1 # number of shards in training TFRecord files
sysnet_to_human = _build_sysset_lookup()
_process_dataset("train", train_dir, train_shards, sysnet_to_human)

Determining list of input files and labels from /home/lidavid/data/ImageNet_tiny/.
Found 1300 JPEG files across 1 labels inside /home/lidavid/data/ImageNet_tiny/.
Launching 1 threads for spacings: [[0, 1300]]
**************num_shards_per_batch**********  1
2021-06-09 14:43:40.195295 [thread 0]: Processed 1000 of 1300 images in thread batch.
2021-06-09 14:43:40.676097 [thread 0]: Wrote 1300 images to /home/lidavid/data/ImageNet_tiny/train-00000-of-00001
2021-06-09 14:43:40.676566 [thread 0]: Wrote 1300 images to 1300 shards.
2021-06-09 14:43:40.729073: Finished writing all 1300 images in data set.


In [30]:
# Check the generated ImageNet TFRecord file by counting the records in it.
import os
tiny_path = "/data/ImageNet_tiny/"
record_name = "train-00000-of-00001"
# Pass directory and file name as separate components; concatenating them
# first only produced a valid path because tiny_path ends with a slash.
raw_dataset = tf.data.TFRecordDataset(os.path.join(tiny_path, record_name))

# Iterate once over the dataset; c ends up as the number of serialized records.
c = 0
for sample in raw_dataset:
    c += 1

c

1300