In [1]:
!pwd

/Users/aaron/Documents/github/tensorflow/tensorflow/examples/how_tos/reading_data


In [3]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

In [6]:
# Path to the gzipped MNIST training images; reused by the gzip cell below.
infile = '/tmp/data/mnist/train/images.gz'

# First attempt: read the gzip file directly through a Beam pipeline.
# NOTE(review): this cell fails with the UnicodeDecodeError recorded in its
# output -- presumably the binary payload is being decoded as UTF-8 text;
# confirm. The following cells read the file with gzip + numpy instead.
with beam.Pipeline(options=PipelineOptions()) as p:
    lines = p | beam.io.Read(
        infile, 
        compression_type=beam.io.filesystem.CompressionTypes.GZIP)

UnicodeDecodeError: 'utf8' codec can't decode byte 0xea in position 6: invalid continuation byte

In [7]:
import gzip

In [9]:
from tensorflow.python.platform import gfile

# Peek at the first four decompressed bytes of the file.
with gfile.Open(infile, 'rb') as f, gzip.GzipFile(fileobj=f) as bytestream:
    data = bytestream.read(4)

In [10]:
type(data)

str

In [11]:
len(data)

4

In [12]:
import numpy

In [13]:
# Interpret the four header bytes as one big-endian unsigned 32-bit integer.
dt = numpy.dtype(numpy.uint32).newbyteorder('>')
# NOTE(review): despite its name, `images` holds a single header integer here
# (displayed as 2051 in the next cell), not image data.
images = numpy.frombuffer(data, dtype=dt)[0]
type(images)

numpy.uint32

In [14]:
images

2051

# in-memory data

In [31]:
"""
This module should implement `base_example_gen_executor.BaseExampleGenExecutor`

It will take MNIST data from a given directory and convert it to tf.Example
records using Apache Beam for use by its parent's class

The helper functions are borrowed and adapted from the file linked below in order to read
gzipped MNIST files from an arbitrary location and return them as numpy.ndarray:

https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/learn/python/learn/datasets/mnist.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip

import numpy
from six.moves import xrange  # pylint: disable=redefined-builtin

from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import random_seed
from tensorflow.python.platform import gfile
from tensorflow.python.util.deprecation import deprecated

def _read32(bytestream):
  dt = numpy.dtype(numpy.uint32).newbyteorder('>')
  return numpy.frombuffer(bytestream.read(4), dtype=dt)[0]

def extract_images(f):
  """Extract the images into a 4D uint8 numpy array [index, y, x, depth].

  Args:
    f: A file object that can be passed into a gzip reader. Its `name`
      attribute is used in the progress and error messages.

  Returns:
    data: A 4D uint8 numpy array [index, y, x, depth], with depth equal to 1.

  Raises:
    ValueError: If the bytestream does not start with 2051, or if it holds
      fewer pixel bytes than its header announces.

  """
  print('Extracting', f.name)
  with gzip.GzipFile(fileobj=f) as bytestream:
    magic = _read32(bytestream)
    if magic != 2051:
      raise ValueError('Invalid magic number %d in MNIST image file: %s' %
                       (magic, f.name))
    # Convert the header fields to Python ints so that the byte-count product
    # below cannot wrap around the way fixed-width 32-bit scalars would.
    num_images = int(_read32(bytestream))
    rows = int(_read32(bytestream))
    cols = int(_read32(bytestream))
    expected_bytes = rows * cols * num_images
    buf = bytestream.read(expected_bytes)
    # Fail with an explicit message instead of an opaque reshape error.
    if len(buf) != expected_bytes:
      raise ValueError(
          'Truncated MNIST image file %s: expected %d pixel bytes, got %d' %
          (f.name, expected_bytes, len(buf)))
    data = numpy.frombuffer(buf, dtype=numpy.uint8)
    data = data.reshape(num_images, rows, cols, 1)
    return data


def extract_labels(f, one_hot=False, num_classes=10):
  """Extract the labels into a 1D uint8 numpy array [index].

  Args:
    f: A file object that can be passed into a gzip reader. Its `name`
      attribute is used in the progress and error messages.
    one_hot: Does one hot encoding for the result.
    num_classes: Number of classes for the one hot encoding.

  Returns:
    labels: a 1D uint8 numpy array, or, when `one_hot` is true, a 2D float64
      numpy array [index, num_classes] holding a single 1 per row.

  Raises:
    ValueError: If the bytestream doesn't start with 2049.
    IndexError: If `one_hot` is true and a label is >= `num_classes`.
  """
  print('Extracting', f.name)
  with gzip.GzipFile(fileobj=f) as bytestream:
    magic = _read32(bytestream)
    if magic != 2049:
      raise ValueError('Invalid magic number %d in MNIST label file: %s' %
                       (magic, f.name))
    num_items = int(_read32(bytestream))
    buf = bytestream.read(num_items)
    labels = numpy.frombuffer(buf, dtype=numpy.uint8)
    if one_hot:
      # Encode with numpy so the result stays a numpy array and this code
      # does not rely on a `tf` name being defined elsewhere.
      encoded = numpy.zeros((labels.shape[0], num_classes))
      encoded[numpy.arange(labels.shape[0]), labels] = 1
      return encoded
    return labels


def get_images_and_labels(images_path, labels_path):
  """Load gzipped MNIST images and labels from the two given paths.

  Args:
    images_path: Path of the gzipped image file, opened through `gfile`.
    labels_path: Path of the gzipped label file, opened through `gfile`.

  Returns:
    An `(images, labels)` tuple holding the results of `extract_images` and
    `extract_labels` respectively.
  """
  with gfile.Open(images_path, 'rb') as images_file:
    images = extract_images(images_file)
  with gfile.Open(labels_path, 'rb') as labels_file:
    labels = extract_labels(labels_file)
  return images, labels

In [32]:
# Load the training split into memory.
# NOTE: `images_path` and `labels_path` are reassigned by the validation cell
# below, so they only describe the most recently executed of the two cells.
images_path = '/tmp/data/mnist/train/images.gz'
labels_path = '/tmp/data/mnist/train/labels.gz'
    
train_images, train_labels = get_images_and_labels(images_path, labels_path)

# Display both shapes as a quick sanity check.
train_images.shape, train_labels.shape

Extracting /tmp/data/mnist/train/images.gz
Extracting /tmp/data/mnist/train/labels.gz


((60000, 28, 28, 1), (60000,))

In [33]:
# Load the validation split into memory.
# NOTE: this overwrites the `images_path` / `labels_path` set for the training
# split in the previous cell.
images_path = '/tmp/data/mnist/val/images.gz'
labels_path = '/tmp/data/mnist/val/labels.gz'
    
val_images, val_labels = get_images_and_labels(images_path, labels_path)

# Display both shapes as a quick sanity check.
val_images.shape, val_labels.shape

Extracting /tmp/data/mnist/val/images.gz
Extracting /tmp/data/mnist/val/labels.gz


((10000, 28, 28, 1), (10000,))

In [40]:
# Pair every validation label with its position: a list of (index, label).
labels_w_index = list(enumerate(val_labels))

labels_w_index[:5]

[(0, 7), (1, 2), (2, 1), (3, 0), (4, 4)]

In [44]:
# Pair every validation image with its position: a list of (index, image).
images_w_index = list(enumerate(val_images))

images_w_index[0][0], images_w_index[0][1].shape

(0, (28, 28, 1))

In [46]:
def _int64_feature(value):
    """Wrap a single integer `value` in a `tf.train.Feature`."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """Wrap a single bytes `value` in a `tf.train.Feature`."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)

In [48]:
import tensorflow as tf

In [49]:
images = val_images
# Unpack the image count and the per-image dimensions in one step.
N, rows, cols, depth = images.shape[:4]

In [50]:
N, rows, cols, depth

(10000, 28, 28, 1)

In [61]:
def group_by_tf_example(key_value):
    """Convert one co-grouped (key, value) pair into a `tf.train.Example`.

    Args:
        key_value: A `(key, value)` pair. The key is ignored; `value` is a
            mapping with 'image' and 'label' entries, of which only the first
            element each is used. The image must be a 3-D numpy array
            [y, x, depth].

    Returns:
        A `tf.train.Example` with 'height', 'width', 'depth', 'label' and
        'image_raw' features.

    Raises:
        IndexError: If the 'image' or 'label' entry is empty.
        ValueError: If the image is not 3-dimensional.
    """
    _, value = key_value
    image = value['image'][0]
    label = value['label'][0]
    # Read the dimensions from the image itself rather than from notebook
    # globals, so the record always describes the bytes stored in it.
    height, width, channels = image.shape
    example = tf.train.Example(
          features=tf.train.Features(
              feature={
                  'height': _int64_feature(height),
                  'width': _int64_feature(width),
                  'depth': _int64_feature(channels),
                  'label': _int64_feature(int(label)),
                  # tobytes() replaces the deprecated ndarray.tostring().
                  'image_raw': _bytes_feature(image.tobytes())
              })
        )
    return example

# Build and run a small Beam pipeline that groups labels and images by their
# index key and writes the result as serialized tf.Example records.
with beam.Pipeline(options=PipelineOptions()) as p:
    # Only the first (index, value) pair of each list is fed in ([:1]).
    label_line = p | "CreateLabel" >> beam.Create(labels_w_index[:1])
    image_line = p | "CreateImage" >> beam.Create(images_w_index[:1])
    
    # Co-group both collections under the 'label' and 'image' tags that
    # group_by_tf_example reads back.
    group_by = ({'label': label_line, 'image': image_line}) | beam.CoGroupByKey()
    
    tf_example = group_by | "GroupByToTfExample" >> beam.Map(group_by_tf_example)
    
    # Serialize each Example, passing deterministic=True to SerializeToString.
    serialize = (tf_example | 'SerializeDeterministically' >>
          beam.Map(lambda x: x.SerializeToString(deterministic=True)))
        
    # Output files use the 'mnist-out-4' prefix and a '.gz' suffix.
    # NOTE(review): the '.gz' suffix presumably makes Beam gzip-compress the
    # records -- confirm against the WriteToTFRecord compression_type default.
    output = serialize | beam.io.WriteToTFRecord('mnist-out-4', file_name_suffix='.gz')



In [62]:
!cat mnist-out-4-00000-of-00001.gz

�      �ff ��x�
�l\|\�)�%B�R�\��\�\l���%�%l\������E��B�ظ&�qM`c6 d���6*������Ǡ`�	��s�G����U�z��_�fA'>gggk�7��p����)����qik��N����v�����͊C���+\���m�%����%)����b>��29��|qH�?�ẄC���?R��p�U��w[��y\q���QNbRj$۰���)�|%s%7f. �J�{  

In [None]:
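# Download MNIST data (the In [27]/In [28] counters show this was executed before the loading cells above, although it appears last)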

In [27]:
from tensorflow.contrib.learn.python import datasets

In [28]:
# Download and extract the MNIST archives under /tmp/data/.
# NOTE: the captured output shows deprecation notices for this helper, which
# recommend writing your own download logic and using tf.data instead.
datasets.mnist.read_data_sets('/tmp/data/')

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use urllib or similar directly.
Instructions for updating:
Please use tf.data to implement this functionality.


Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting /tmp/data/train-images-idx3-ubyte.gz


Instructions for updating:
Please use tf.data to implement this functionality.


Extracting /tmp/data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


Datasets(train=<tensorflow.contrib.learn.python.learn.datasets.mnist.DataSet object at 0x118d0b890>, validation=<tensorflow.contrib.learn.python.learn.datasets.mnist.DataSet object at 0x118d0b4d0>, test=<tensorflow.contrib.learn.python.learn.datasets.mnist.DataSet object at 0x118d0b250>)