In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import sys

import numpy as np
import pandas
import tensorflow as tf
import pdb

def np_to_tfrecords(X, Y, file_path_prefix, verbose=True):
    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value.reshape(-1)))
    # Generate tfrecord writer
    result_tf_file = file_path_prefix + '.tfrecords'
    writer = tf.python_io.TFRecordWriter(result_tf_file)
    if verbose:
        print ("Serializing {:d} examples into {}".format(X.shape[0], result_tf_file))        
    # iterate over each sample,
    # and serialize it as ProtoBuf.
    for idx in range(X.shape[0]):
        #pdb.set_trace()
        example = tf.train.Example(features=tf.train.Features(feature={
        'X': _int64_feature(X[idx]),
        'Y': _int64_feature(Y[idx])}))        
        serialized = example.SerializeToString()
        writer.write(serialized)    
    if verbose:
        print ("Writing {} done!".format(result_tf_file))

In [3]:
MAX_DOCUMENT_LENGTH = 50
n_words = None
MAX_LABEL = 15
WORDS_FEATURE = 'words'  # Name of the input words feature.

In [4]:
# Prepare training and testing data
dbpedia = tf.contrib.learn.datasets.load_dataset(
'dbpedia', size='large', test_with_fake_data=False)

print("Shuffling data set...")
x_train = dbpedia.train.data[:, 1]
y_train = dbpedia.train.target
s = np.arange(len(y_train))
np.random.shuffle(s)
x_train = x_train[s]
y_train = y_train[s]
print("Done!")  

x_train = pandas.Series(x_train)
y_train = pandas.Series(y_train)
x_test = pandas.Series(dbpedia.test.data[:, 1])
y_test = pandas.Series(dbpedia.test.target)

print('Train data size:', x_train.shape)
print('Test data size:', x_test.shape)
# Process vocabulary
vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
MAX_DOCUMENT_LENGTH)

x_transform_train = vocab_processor.fit_transform(x_train)
x_transform_test = vocab_processor.transform(x_test)

x_train_fit = np.array(list(x_transform_train))
x_test_fit = np.array(list(x_transform_test))

n_words = len(vocab_processor.vocabulary_)
print('Total words: %d' % n_words)


y_train = np.expand_dims(np.asarray(y_train), axis=1) 
y_test = np.expand_dims(np.asarray(y_test), axis=1) 

Instructions for updating:
Use the retry module or similar alternatives.
Instructions for updating:
Please use tf.data.
Instructions for updating:
See contrib/learn/README.md
Instructions for updating:
See contrib/learn/README.md
Instructions for updating:
Use tf.data instead.
Shuffling data set...
Done!
Train data size: (560000,)
Test data size: (70000,)
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Instructions for updating:
Please use tensorflow/transform or tf.data.
Total words: 822383


In [8]:
x_test_fit.shape

(70000, 50)

In [9]:
np_to_tfrecords(x_train_fit, np.asarray(y_train, np.int), 'word-train', verbose=True)
np_to_tfrecords(x_test_fit, np.asarray(y_test, np.int), 'word-test', verbose=True)


Serializing 560000 examples into word-train.tfrecords
Writing word-train.tfrecords done!
Serializing 70000 examples into word-test.tfrecords
Writing word-test.tfrecords done!


In [10]:
tf.__version__

'1.7.0'

In [11]:
from tqdm import tqdm
total_err = 0
err = 0
for i, serialized_example in tqdm(enumerate(tf.python_io.tf_record_iterator('word-train.tfrecords'))):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].int64_list.value)
    y_1 = np.array(example.features.feature['Y'].int64_list.value)
    err += np.linalg.norm(x_train_fit[i]-x_1) + np.linalg.norm(y_train[i]-y_1)
    total_err += err
    if err>0:
        pass
        #break
print('Train set Error: %f'% total_err)

err = 0
for i, serialized_example in enumerate(tf.python_io.tf_record_iterator('word-test.tfrecords')):
    example = tf.train.Example()
    example.ParseFromString(serialized_example)
    x_1 = np.array(example.features.feature['X'].int64_list.value)
    y_1 = np.array(example.features.feature['Y'].int64_list.value)
    err += np.linalg.norm(x_test_fit[i]-x_1) + np.linalg.norm(y_test[i]-y_1) 
    
print('Test set Error: %f'% err)

560000it [01:00, 9310.89it/s]


Train set Error: 0.000000
Test set Error: 0.000000


In [39]:
from tqdm import tqdm
total_err = 0
err = 0
keys_to_features = {
    'X': tf.FixedLenFeature(shape=[MAX_DOCUMENT_LENGTH], dtype=tf.int64),
    'Y': tf.FixedLenFeature(shape=[1], dtype=tf.int64)            
}
    
with tf.Session() as sess:
    for i, serialized_example in tqdm(enumerate(tf.python_io.tf_record_iterator('word-train.tfrecords'))):
        #example = tf.train.Example()
        #example.ParseFromString(serialized_example)
        parsed = tf.parse_single_example(serialized_example, keys_to_features)
        parsed = sess.run([parsed])[0]
        x_1 = np.array(parsed['X'])
        y_1 = np.array(parsed['Y'])
        err += np.linalg.norm(x_train_fit[i]-x_1) + np.linalg.norm(y_train[i]-y_1)
        total_err += err
        if i==100:            
            break
            
print('Train set Error: %f'% total_err)




0it [00:00, ?it/s][A
1it [00:01,  1.03s/it][A
2it [00:01,  1.31it/s][A
3it [00:02,  1.47it/s][A
100it [00:54,  1.82it/s]

Train set Error: 0.000000


In [38]:
total_err = 0
err = 0
keys_to_features = {
    'X': tf.FixedLenFeature(shape=[MAX_DOCUMENT_LENGTH], dtype=tf.int64),
    'Y': tf.FixedLenFeature(shape=[1], dtype=tf.int64)            
}
    
with tf.Session() as sess:
    for i, serialized_example in tqdm(enumerate(tf.python_io.tf_record_iterator('word-train.tfrecords'))):
        #example = tf.train.Example()
        #example.ParseFromString(serialized_example)

        parsed = tf.parse_single_example(serialized_example, keys_to_features)
        parsed = sess.run([parsed])[0]

        x_1 = np.array(parsed['X'])
        y_1 = np.array(parsed['Y'])
        err += np.linalg.norm(x_train_fit[i]-x_1) + np.linalg.norm(y_train[i]-y_1)
        total_err += err
        if i==100:            
            break
print('Test set Error: %f'% total_err)


0it [00:00, ?it/s][A
1it [00:01,  1.06s/it][A
2it [00:01,  1.30it/s][A
3it [00:02,  1.49it/s][A
4it [00:02,  1.60it/s][A
5it [00:03,  1.63it/s][A
6it [00:03,  1.65it/s][A
7it [00:04,  1.68it/s][A
8it [00:04,  1.71it/s][A
9it [00:05,  1.68it/s][A
10it [00:05,  1.70it/s][A
11it [00:06,  1.71it/s][A
12it [00:06,  1.73it/s][A
13it [00:07,  1.74it/s][A
14it [00:07,  1.76it/s][A
15it [00:08,  1.77it/s][A
16it [00:09,  1.78it/s][A
100it [00:50,  1.98it/s]

Test set Error: 0.000000
