In [1]:
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def get_lines(fname, encoding="utf-8"):
    """Return the lines of a text file, without their trailing newline.

    Lines are split only at real line endings (as produced by iterating the
    file object in text mode). ``str.splitlines`` is deliberately avoided: it
    also breaks on form feed, vertical tab, U+0085, U+2028, U+2029 and a few
    other characters, which would turn one review containing such a character
    into several "lines" and shift every following record.

    Args:
        fname: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale.

    Returns:
        A list of strings, one per line; empty for an empty file.
    """
    with open(fname, "rt", encoding=encoding) as f:
        # Each item yielded by the file ends with at most one "\n"
        # (universal-newline mode has already normalised "\r\n" and "\r").
        return [line.rstrip("\n") for line in f]

In [3]:
def to_bytearray_feature(value):
    """Wrap a string in a ``tf.train.Feature`` holding a one-element bytes list.

    The string is encoded as UTF-8 before being stored.
    """
    encoded = bytes(value, "utf8")
    payload = tf.train.BytesList(value=[encoded])
    return tf.train.Feature(bytes_list=payload)
def wrap_float_value(value):
    """Wrap a single number in a ``tf.train.Feature`` holding a one-element float list."""
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

In [4]:
# Load the three training inputs in a fixed order: user reviews, item reviews, ratings.
# Presumably line i of each file describes the same sample (they are zipped
# together when the TFRecord file is written) -- verify against the data.
train_user_lines, train_item_lines, train_ratings = (
    get_lines(path)
    for path in (
        "data/train_u_reviews.txt",
        "data/train_i_reviews.txt",
        "data/train_ratings.txt",
    )
)

In [5]:
tfrecords_filename = "data/demo.tfrecords"

# zip() stops at the shortest input, so a length mismatch would silently drop
# records and could pair reviews with the wrong rating. Fail loudly instead.
if not (len(train_user_lines) == len(train_item_lines) == len(train_ratings)):
    raise ValueError(
        "Misaligned inputs: %d user reviews, %d item reviews, %d ratings"
        % (len(train_user_lines), len(train_item_lines), len(train_ratings))
    )

# Use the writer as a context manager so it is closed (and its buffered
# records flushed to disk) even if serialising one of the examples fails.
with tf.python_io.TFRecordWriter(tfrecords_filename) as writer:
    for user, item, rating in zip(train_user_lines, train_item_lines, train_ratings):
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    'user_review': to_bytearray_feature(user),
                    'item_review': to_bytearray_feature(item),
                    # Ratings are read as text, so convert before wrapping.
                    'rating': wrap_float_value(float(rating))
                }
            )
        )

        writer.write(example.SerializeToString())

In [6]:
def parse_fn(record):
    """Parse one serialized ``tf.train.Example`` into its three tensors.

    Both review fields are declared as variable-length string sequences
    (``allow_missing=True``); the rating is declared as a float32 feature of
    shape ``[1]``.

    Returns:
        A ``(user_review, item_review, rating)`` tuple of parsed tensors.
    """
    review_keys = ("user_review", "item_review")
    spec = {
        key: tf.FixedLenSequenceFeature([], tf.string, allow_missing=True)
        for key in review_keys
    }
    spec["rating"] = tf.FixedLenFeature([1], tf.float32)

    parsed = tf.parse_single_example(record, spec)
    return tuple(parsed[key] for key in review_keys + ("rating",))

In [7]:
def split_fn(user, item, rating):
    """Tokenize both review tensors and pass the rating through unchanged.

    Each review is split with ``tf.string_split`` using its default
    delimiter; only the ``values`` of the result (the flat token tensor) are
    kept.
    """
    user_tokens, item_tokens = (
        tf.string_split(text).values for text in (user, item)
    )
    return user_tokens, item_tokens, rating

In [19]:
def truncate_fn(user, item, rating, max_len=400):
    """Keep at most ``max_len`` leading tokens of each review.

    Args:
        user: Sliceable sequence of user-review tokens.
        item: Sliceable sequence of item-review tokens.
        rating: Passed through untouched.
        max_len: Maximum number of tokens kept per review. Defaults to 400,
            the value that was previously hard-coded, so existing three-argument
            callers (e.g. ``dataset.map(truncate_fn)``) behave as before.

    Returns:
        ``(user[:max_len], item[:max_len], rating)``.
    """
    return user[:max_len], item[:max_len], rating

In [30]:
# Build the input pipeline: parse -> tokenize -> truncate -> pad and batch.
dataset = tf.data.TFRecordDataset("data/demo.tfrecords")
dataset = dataset.map(parse_fn)
dataset = dataset.map(split_fn)
dataset = dataset.map(truncate_fn)
# Reviews are padded with the "unk" token up to 400 entries, matching the
# 400-token cap applied by truncate_fn; batches hold 16 samples.
dataset = dataset.padded_batch(16, padded_shapes=([400], [400], [None]), padding_values=("unk", "unk", 0.0))
iterator = dataset.make_one_shot_iterator()
data_point = iterator.get_next()

# Evaluate inside a context manager so the session is closed afterwards
# instead of leaking a throwaway tf.Session().
with tf.Session() as sess:
    first_user_batch = sess.run(data_point[0])

# Last expression of the cell: display the first batch of user-review tokens.
first_user_batch

array([[b'i', b'really', b'like', ..., b'unk', b'unk', b'unk'],
       [b'this', b'is', b'one', ..., b'unk', b'unk', b'unk'],
       [b'i', b'watched', b'this', ..., b'unk', b'unk', b'unk'],
       ...,
       [b'enjoyed', b'some', b'of', ..., b'unk', b'unk', b'unk'],
       [b'loved', b'season', b'2', ..., b'direction', b'less', b'so'],
       [b'his', b'shows', b'are', ..., b'unk', b'unk', b'unk']],
      dtype=object)