In [13]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import numpy as np
import pandas
import tensorflow as tf
import pdb

def np_to_tfrecords(X, Y, file_path_prefix, verbose=True):
    """
    Converts a Numpy array (or two Numpy arrays) into a tfrecord file.
    For supervised learning, feed training inputs to X and training labels to Y.
    For unsupervised learning, only feed training inputs to X, and feed None to Y.
    The length of the first dimensions of X and Y should be the number of samples.
    
    Parameters
    ----------
    X : numpy.ndarray of rank 2
        Numpy array for training inputs. Its dtype should be float32, float64, or int64.
        If X has a higher rank, it should be rshape before fed to this function.
    Y : numpy.ndarray of rank 2 or None
        Numpy array for training labels. Its dtype should be float32, float64, or int64.
        None if there is no label array.
    file_path_prefix : str
        The path and name of the resulting tfrecord file to be generated, without '.tfrecords'
    verbose : bool
        If true, progress is reported.
    
    Raises
    ------
    ValueError
        If input type is not float (64 or 32) or int.
    
    """
    def _dtype_feature(ndarray):
        """match appropriate tf.train.Feature class with dtype of ndarray. """        
        assert isinstance(ndarray, np.ndarray)
        dtype_ = ndarray.dtype
        if dtype_ == np.float64 or dtype_ == np.float32:
            return lambda array: tf.train.Feature(float_list=tf.train.FloatList(value=array))
        elif dtype_ == np.int64:
            return lambda array: tf.train.Feature(int64_list=tf.train.Int64List(value=array))
        else:  
            raise ValueError("The input should be numpy ndarray. \
                               Instaed got {}".format(ndarray.dtype))
            
    #pdb.set_trace()
    assert isinstance(X, np.ndarray)
    assert len(X.shape) == 2  # If X has a higher rank, 
                               # it should be rshape before fed to this function.
    assert isinstance(Y, np.ndarray) or Y is None
    
    # load appropriate tf.train.Feature class depending on dtype
    dtype_feature_x = _dtype_feature(X)
    if Y is not None:
        assert X.shape[0] == Y.shape[0]
        assert len(Y.shape) == 2
        dtype_feature_y = _dtype_feature(Y)            
    
    # Generate tfrecord writer
    result_tf_file = file_path_prefix + '.tfrecords'
    writer = tf.python_io.TFRecordWriter(result_tf_file)
    if verbose:
        print ("Serializing {:d} examples into {}".format(X.shape[0], result_tf_file))
        
    # iterate over each sample,
    # and serialize it as ProtoBuf.
    for idx in range(X.shape[0]):
        x = X[idx]
        if Y is not None:
            y = Y[idx]
        
        d_feature = {}
        d_feature['X'] = dtype_feature_x(x)
        if Y is not None:
            d_feature['Y'] = dtype_feature_y(y)
            
        features = tf.train.Features(feature=d_feature)
        example = tf.train.Example(features=features)
        serialized = example.SerializeToString()
        writer.write(serialized)
    
    if verbose:
        print ("Writing {} done!".format(result_tf_file))
        

In [2]:
MAX_DOCUMENT_LENGTH=500

In [14]:
# Prepare training and testing data
# Prepare training and testing data
dbpedia = tf.contrib.learn.datasets.load_dataset(
  'dbpedia', size='small', test_with_fake_data=False)

print("Shuffling data set...")
x_train = dbpedia.train.data[:, 1]
y_train = dbpedia.train.target
s = np.arange(len(y_train))
np.random.shuffle(s)
x_train = x_train[s]
y_train = y_train[s]
print("Done!")  

x_train = pandas.Series(x_train)
y_train = pandas.Series(y_train)
x_test = pandas.Series(dbpedia.test.data[:, 1])
y_test = pandas.Series(dbpedia.test.target)

print('Train data size:', x_train.shape)
print('Test data size:', x_test.shape)

# Process vocabulary
char_processor = tf.contrib.learn.preprocessing.ByteProcessor(
  MAX_DOCUMENT_LENGTH)
x_train_fit = np.array(list(char_processor.fit_transform(x_train)), dtype=np.int)
x_test_fit = np.array(list(char_processor.transform(x_test)), dtype=np.int)

y_train = np.expand_dims(np.asarray(y_train), axis=1) 
y_test = np.expand_dims(np.asarray(y_test), axis=1) 

Shuffling data set...
Done!
Train data size: (560,)
Test data size: (70,)


In [21]:
x_train_fit

array([[ 32,  80, 104, ...,   0,   0,   0],
       [ 32,  77, 105, ...,   0,   0,   0],
       [ 32,  82, 101, ...,   0,   0,   0],
       ...,
       [ 32,  83, 111, ...,   0,   0,   0],
       [ 32,  66, 117, ...,   0,   0,   0],
       [ 32,  68,  97, ...,   0,   0,   0]])

In [6]:
type(x_train_fit)

numpy.ndarray

In [31]:
y_train.shape

(560, 1)

In [57]:
np_to_tfrecords(x_train_fit, np.asarray(y_train, np.int), 'train', verbose=True)

Serializing 560 examples into train.tfrecords
Writing train.tfrecords done!


In [58]:
np_to_tfrecords(x_test_fit, np.asarray(y_test, np.int), 'test', verbose=True)

Serializing 70 examples into test.tfrecords
Writing test.tfrecords done!


In [59]:
for serialized_example in tf.python_io.tf_record_iterator('train.tfrecords'):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].int64_list.value)
    y_1 = np.array(example.features.feature['Y'].int64_list.value)
    break

In [60]:
x_1 - x_train_fit[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [29]:
y_1 - y_train[0]

array([0])

In [27]:
x_1

array([ 32,  80, 104, 105, 108, 105, 112, 112, 101,  39, 115,  32, 111,
       114,  32,  80, 104, 105, 108, 108, 105, 112, 112, 101,  32, 116,
       104, 101,  32,  79, 114, 105, 103, 105, 110,  97, 108,  32,  40,
        47, 102, 201, 168, 203, 136, 108, 105, 203, 144, 112, 101, 201,
       170,  47,  32, 102, 105,  45,  76,  69,  69,  45, 112,  97, 121,
        41,  32, 104, 105, 115, 116, 111, 114, 105,  99,  97, 108, 108,
       121,  32,  47, 102, 201, 170, 203, 136, 108, 105, 203, 144, 112,
       115,  47,  32, 102, 105, 108,  45,  76,  69,  69,  80,  83,  32,
       105, 115,  32,  97,  32, 114, 101, 115, 116,  97, 117, 114,  97,
       110, 116,  32, 108, 111,  99,  97, 116, 101, 100,  32, 105, 110,
        32, 100, 111, 119, 110, 116, 111, 119, 110,  32,  76, 111, 115,
        32,  65, 110, 103, 101, 108, 101, 115,  32,  67,  97, 108, 105,
       102, 111, 114, 110, 105,  97,  46,  32,  84, 104, 101,  32, 114,
       101, 115, 116,  97, 117, 114,  97, 110, 116,  32, 105, 11

In [26]:
example

features {
  feature {
    key: "X"
    value {
      int64_list {
        value: 32
        value: 80
        value: 104
        value: 105
        value: 108
        value: 105
        value: 112
        value: 112
        value: 101
        value: 39
        value: 115
        value: 32
        value: 111
        value: 114
        value: 32
        value: 80
        value: 104
        value: 105
        value: 108
        value: 108
        value: 105
        value: 112
        value: 112
        value: 101
        value: 32
        value: 116
        value: 104
        value: 101
        value: 32
        value: 79
        value: 114
        value: 105
        value: 103
        value: 105
        value: 110
        value: 97
        value: 108
        value: 32
        value: 40
        value: 47
        value: 102
        value: 201
        value: 168
        value: 203
        value: 136
        value: 108
        value: 105
        value: 203
        value: 144
        value: 

In [24]:
example.features.feature['X']

int64_list {
  value: 32
  value: 80
  value: 104
  value: 105
  value: 108
  value: 105
  value: 112
  value: 112
  value: 101
  value: 39
  value: 115
  value: 32
  value: 111
  value: 114
  value: 32
  value: 80
  value: 104
  value: 105
  value: 108
  value: 108
  value: 105
  value: 112
  value: 112
  value: 101
  value: 32
  value: 116
  value: 104
  value: 101
  value: 32
  value: 79
  value: 114
  value: 105
  value: 103
  value: 105
  value: 110
  value: 97
  value: 108
  value: 32
  value: 40
  value: 47
  value: 102
  value: 201
  value: 168
  value: 203
  value: 136
  value: 108
  value: 105
  value: 203
  value: 144
  value: 112
  value: 101
  value: 201
  value: 170
  value: 47
  value: 32
  value: 102
  value: 105
  value: 45
  value: 76
  value: 69
  value: 69
  value: 45
  value: 112
  value: 97
  value: 121
  value: 41
  value: 32
  value: 104
  value: 105
  value: 115
  value: 116
  value: 111
  value: 114
  value: 105
  value: 99
  value: 97
  value: 108
  value: 10

In [20]:
serialized_example

b"\n\x9d\x04\n\x8e\x04\n\x01X\x12\x88\x04\x1a\x85\x04\n\x82\x04 Philippe's or Phillippe the Original (/f\xc9\x01\xa8\x01\xcb\x01\x88\x01li\xcb\x01\x90\x01pe\xc9\x01\xaa\x01/ fi-LEE-pay) historically /f\xc9\x01\xaa\x01\xcb\x01\x88\x01li\xcb\x01\x90\x01ps/ fil-LEEPS is a restaurant located in downtown Los Angeles California. The restaurant is well known for continuously operating since 1908 making it one of the oldest restaurants in Los Angeles. It is also renowned for claiming to be the inventor of the French Dip sandwich.The restaurant has been located at 1001 N.\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n\n\n\x01Y\x12\x05\x1a\x03\n\x01\x07"

In [32]:
err = 0
for i, serialized_example in enumerate(tf.python_io.tf_record_iterator('train.tfrecords')):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].int64_list.value)
    y_1 = np.array(example.features.feature['Y'].int64_list.value)
    err += np.linalg.norm(x_train_fit[i]-x_1) + np.linalg.norm(y_train[i]-y_1)

for i, serialized_example in enumerate(tf.python_io.tf_record_iterator('test.tfrecords')):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].int64_list.value)
    y_1 = np.array(example.features.feature['Y'].int64_list.value)
    err += np.linalg.norm(x_test[i]-x_1) + np.linalg.norm(y_test[i]-y_1)    
print('Error: %f', err)

TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U377') dtype('<U377') dtype('<U377')

In [63]:
    total_err = 0
    for i, serialized_example in enumerate(tf.python_io.tf_record_iterator('train.tfrecords')):
        example = tf.train.Example()
        example.ParseFromString(serialized_example)
        x_1 = np.array(example.features.feature['X'].int64_list.value)
        y_1 = np.array(example.features.feature['Y'].int64_list.value)
        err += np.linalg.norm(x_train_fit[i]-x_1) + np.linalg.norm(y_train[i]-y_1)
        total_err += err
        if err>0:
            pass
            #break
    print('Train set Error: %f', err)
    



Train set Error: %f 0.0


In [53]:
x_train_fit[i]-x_1

array([   0,   15,   -1,    6,    4,    0,   80,   41,    4,  -68,   -2,
        -83,   10,    4,    0,   13,   -7,   -3,    0,    7,    2,   11,
         80,   61, -198,  -16,  -51, -130, -127,  -86, -115,  -68,  -63,
       -125,  -69,  -40, -123, -127, -133, -183,  -54,   42,  -61,   39,
        -31, -121,  -68,   37,  112,   47,   -4,  102,   66,  -58,    0,
         31,    8,  -62,  -41,  -46,  -32,  -65,   80,   13,   24,  -69,
        -75,   -1,   73,   47,   19,    6,   11,    8,   -8,  -20,   67,
         76,   16,  -83,   15,    5,  169,   58,   89,   31,  -10,    8,
         87,   43,   80,    9,  -70,  -78,   -3,   -6,   -6,   13,  -23,
        -42,  -39,  -28,  -18,  -71,    4,   83,  -73,  -13,    0,   35,
         -6,   18,   -6,    0,   10,    9,   65,   45,   11,  -67,    4,
          6,   67,   23,   19,  -11,    3,  -78,   59,  110,   32,  100,
        111,  119,  110,  116,  111,  119,  110,   32,   76,  111,  115,
         32,   65,  110,  103,  101,  108,  101,  1

In [64]:
    err = 0
    for i, serialized_example in enumerate(tf.python_io.tf_record_iterator('test.tfrecords')):
        example = tf.train.Example()
        example.ParseFromString(serialized_example)
        x_1 = np.array(example.features.feature['X'].int64_list.value)
        y_1 = np.array(example.features.feature['Y'].int64_list.value)
        err += np.linalg.norm(x_test_fit[i]-x_1) + np.linalg.norm(y_test[i]-y_1)    
    print('Test set Error: %f', err)

Test set Error: %f 0.0


In [62]:
i

0

In [43]:
        err += np.linalg.norm(x_train_fit[i]-x_1) + np.linalg.norm(y_train[i]-y_1)

In [44]:
err

722682.2438464825

In [94]:
import os
class DBPediaInput(object):
  """Wrapper class that acts as the input_fn to TPUEstimator."""

  def __init__(self, is_training, data_dir=None):
    self.is_training = is_training
    self.data_dir = data_dir if data_dir else '.' #FLAGS.data_dir

  def dataset_parser(self, value):
    """Parse an Imagenet record from value."""
    keys_to_features = {
        'X': tf.FixedLenSequenceFeature([500,], tf.int64, -1),
        'Y': tf.FixedLenFeature([], tf.int64, -1)            
    }
    parsed = tf.parse_single_example(value, keys_to_features)
    
    """
    example = tf.train.Example()
    example.ParseFromString(value)
    X = np.array(example.features.feature['X'].int64_list.value)
    Y = np.array(example.features.feature['Y'].int64_list.value)
    """
    
    return tf.squeeze(parsed['X']), parsed['Y']

  def __call__(self, params):
    """Input function which provides a single batch for train or eval."""
    # Retrieves the batch size for the current shard. The # of shards is
    # computed according to the input pipeline deployment. See
    # `tf.contrib.tpu.RunConfig` for details.
    batch_size = params['batch_size']

    # Shuffle the filenames to ensure better randomization
    file_pattern = os.path.join(
        self.data_dir, 'train*' if self.is_training else 'test*')
    dataset = tf.data.Dataset.list_files(file_pattern)
    if self.is_training:
      dataset = dataset.shuffle(buffer_size=1024)  # 1024 files in dataset

    if self.is_training:
      dataset = dataset.repeat()

    def prefetch_dataset(filename):
      buffer_size =  1000 #FLAGS.prefetch_buffer_size
      dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
      return dataset

    dataset = dataset.apply(
        tf.contrib.data.parallel_interleave(
            prefetch_dataset, cycle_length= 8, #FLAGS.num_files_infeed,
            sloppy=True))
    dataset = dataset.shuffle(1000) #FLAGS.shuffle_buffer_size)

    dataset = dataset.map(
        self.dataset_parser,
        num_parallel_calls=8) #FLAGS.num_parallel_calls)
    dataset = dataset.prefetch(batch_size)
    dataset = dataset.apply(
        tf.contrib.data.batch_and_drop_remainder(batch_size))

    dataset = dataset.prefetch(2)  # Prefetch overlaps in-feed with training
    images, labels = dataset.make_one_shot_iterator().get_next()
    return images, labels

In [95]:
my_input=DBPediaInput(is_training=True)
i= my_input(params={'batch_size':10})
i[0]

<tf.Tensor 'IteratorGetNext_33:0' shape=<unknown> dtype=int64>

In [99]:
my_input=DBPediaInput(is_training=True)
i= my_input(params={'batch_size':10})
with tf.Session():
    print(i[0].eval())
    print(i[0].eval().shape)
    print(i[1].eval())
    print(i[1].eval().shape)

[[ 32  72 121 ...   0   0   0]
 [ 32  82 101 ...   0   0   0]
 [ 32  83 119 ...   0   0   0]
 ...
 [ 32  67 104 ...   0   0   0]
 [ 32  65 108 ...   0   0   0]
 [ 32  83 119 ...   0   0   0]]
(10, 500)
[ 1  3  8 13 13 11  3  1  7 10]
(10,)


In [98]:
i= my_input(params={'batch_size':5})
with tf.Session():
    print(i[0].eval())
    print(i[1].eval())

[[ 32  84 104 ...   0   0   0]
 [ 32  77 111 ...   0   0   0]
 [ 32  84 104 ...   0   0   0]
 [ 32  74 111 ...   0   0   0]
 [ 32  79 116 ...   0   0   0]]
[ 8  8  9 13 10]


In [44]:
i[0]

<tensorflow.python.framework.sparse_tensor.SparseTensor at 0x7ff024066470>