In [2]:
import pickle as pkl
import tensorflow as tf
from keras.preprocessing.text import text_to_word_sequence

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Order matters: the files are loaded into a list in this order and the
# TFRecord cells pick them out by position.
fnames = (
    # user / item review texts for the training split -> positions 0, 1
    ["../data/train_u_reviews.txt", "../data/train_i_reviews.txt"]
    # user / item review texts for the test split -> positions 2, 3
    + ["../data/test_u_reviews.txt", "../data/test_i_reviews.txt"]
    # ratings for the training and test splits -> positions 4, 5
    + ["../data/train_ratings.txt", "../data/test_ratings.txt"]
)

In [4]:
def get_lines(fname):
    """Return the lines of the text file ``fname`` without their newlines.

    Lines are split on newline characters only. ``str.splitlines()`` would
    also break on form feeds, ``\\x1c``-``\\x1e``, ``\\x85``, ``\\u2028`` and
    ``\\u2029``; free-form review text can contain those, and an extra split
    would shift every following line out of step with the parallel files
    that are zipped against this one.

    Args:
        fname: Path of the file to read (opened in text mode).

    Returns:
        list of str, one entry per line, in file order.
    """
    with open(fname, "rt") as f:
        # Text mode uses universal newlines, so "\r\n" and "\r" have already
        # been translated to "\n" by the time we iterate.
        return [line.rstrip("\n") for line in f]

In [5]:
# Load every input file; data[k] holds the lines of fnames[k].
data = list(map(get_lines, fnames))

In [6]:
def clean(text):
    """Tokenize ``text`` with Keras' ``text_to_word_sequence``.

    The text is lowercased and split on single spaces; the punctuation, tab
    and newline characters listed below are passed as the ``filters``
    argument.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``text_to_word_sequence`` returns for these settings
        (a list of word tokens).
    """
    filtered_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    tokens = text_to_word_sequence(
        text,
        filters=filtered_chars,
        lower=True,
        split=" ",
    )
    return tokens


In [7]:
def wrap_float_value(value):
    """Wrap one number in a ``tf.train.Feature`` holding a one-element ``FloatList``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)
def wrap_int_list(values):
    """Wrap a sequence of integers in a ``tf.train.Feature`` holding an ``Int64List``."""
    int64_list = tf.train.Int64List(value=values)
    return tf.train.Feature(int64_list=int64_list)

In [8]:
# Load the pickled vocabulary; the next cell enumerates it to assign word ids.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load a dictionary.pkl that comes from a trusted source.
with open("../data/dictionary.pkl", "rb") as dat:
    dictionary = pkl.load(dat)

In [9]:
# Each vocabulary entry gets its position in `dictionary` as its id.
word_2_idx = dict(zip(dictionary, range(len(dictionary))))
# "UNK" takes the id one past the last vocabulary position.
word_2_idx["UNK"] = len(dictionary)
# Reverse lookup: id -> word.
idx_2_word = {idx: word for word, idx in word_2_idx.items()}

In [10]:
# Persist both lookup tables next to the data.
for pkl_path, mapping in (
    ("../data/word_2_idx.pkl", word_2_idx),
    ("../data/idx_2_word.pkl", idx_2_word),
):
    with open(pkl_path, "wb") as out:
        pkl.dump(mapping, out)

In [11]:
# Serialize the training split to a TFRecord file.
# One tf.train.Example is written per aligned (user review, item review,
# rating) triple with the features:
#   'user_review' / 'item_review' : int64 list of word ids
#   'rating'                      : single float
tfrecords_filename = "../data/train.tfrecords"

# Words missing from the vocabulary map to the "UNK" id instead of raising
# KeyError part-way through and leaving a truncated file behind.
unk_idx = word_2_idx["UNK"]

# The context manager closes the writer when the loop ends (or fails), so
# buffered records are flushed to disk and the file handle is released.
with tf.python_io.TFRecordWriter(tfrecords_filename) as writer:
    # data[0], data[1], data[4] are the lines of train_u_reviews.txt,
    # train_i_reviews.txt and train_ratings.txt. zip() stops at the shortest.
    for u_review, i_review, rating in zip(data[0], data[1], data[4]):
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    'user_review': wrap_int_list(
                        [word_2_idx.get(word, unk_idx) for word in u_review.split()]),
                    'item_review': wrap_int_list(
                        [word_2_idx.get(word, unk_idx) for word in i_review.split()]),
                    'rating': wrap_float_value(float(rating))
                }
            )
        )

        writer.write(example.SerializeToString())

In [12]:
# Serialize the test split to a TFRecord file.
# One tf.train.Example is written per aligned (user review, item review,
# rating) triple with the features:
#   'user_review' / 'item_review' : int64 list of word ids
#   'rating'                      : single float
tfrecords_filename = "../data/test.tfrecords"

# Words missing from the vocabulary map to the "UNK" id instead of raising
# KeyError part-way through and leaving a truncated file behind.
unk_idx = word_2_idx["UNK"]

# The context manager closes the writer when the loop ends (or fails), so
# buffered records are flushed to disk and the file handle is released.
with tf.python_io.TFRecordWriter(tfrecords_filename) as writer:
    # data[2], data[3], data[5] are the lines of test_u_reviews.txt,
    # test_i_reviews.txt and test_ratings.txt. zip() stops at the shortest.
    for u_review, i_review, rating in zip(data[2], data[3], data[5]):
        example = tf.train.Example(
            features=tf.train.Features(
                feature={
                    'user_review': wrap_int_list(
                        [word_2_idx.get(word, unk_idx) for word in u_review.split()]),
                    'item_review': wrap_int_list(
                        [word_2_idx.get(word, unk_idx) for word in i_review.split()]),
                    'rating': wrap_float_value(float(rating))
                }
            )
        )

        writer.write(example.SerializeToString())