In [1]:
import ast

import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the random draws made through np.random
# are repeatable from run to run.
SEED = 42
np.random.seed(SEED)

In [2]:
# Load the review table that the train/test split is built from.
raw_data = pd.read_csv(filepath_or_buffer="../data/unembedded_grouped_cleaned_data.csv")

In [3]:
# Train/test split. A plain row-wise split is not enough for this model: we
# hold out a set of users together with every movie those users reviewed, so
# that the network never sees a test user or a test movie during training.
test_size = 0.005  # fraction of the unique users held out for testing

# Sample test_size of the unique users, without replacement, using the
# global NumPy RNG.
unique_users = raw_data.loc[:, "reviewerID"].unique()
users_size = len(unique_users)
test_idx = np.random.choice(users_size,
                            size=int(users_size * test_size),
                            replace=False)

# The sampled positions are the test users; everyone else is a training user.
test_users = unique_users[test_idx]
train_users = np.delete(unique_users, test_idx)

test = raw_data[raw_data["reviewerID"].isin(test_users)]
unique_test_movies = test["asin"].unique()

# A training row must belong to a training user AND must not involve a movie
# that also appears in the test set, so some data is discarded entirely.
# A boolean mask is used rather than DataFrame.where(...).dropna(): where()
# fills rejected rows with NaN, which upcasts integer columns to float and
# makes dropna() also throw away kept rows that merely have a missing value
# in some column.
is_train_user = raw_data["reviewerID"].isin(train_users)
is_test_movie = raw_data["asin"].isin(unique_test_movies)
train = raw_data[is_train_user & ~is_test_movie]

In [4]:
def to_single_sentence(str_of_lists):
    """Flatten a string of back-to-back list literals into one sentence.

    The input is text such as ``"['a', 'b']['c']"``: one or more Python list
    literals of strings written one after another. Every space is removed
    first (so a space inside a token is lost as well), the text is cut at
    each ``][`` boundary, each piece is parsed as a list, and all tokens are
    joined with single spaces.

    Args:
        str_of_lists: String holding one or more concatenated list literals.

    Returns:
        One string with every token separated by a single space.

    Raises:
        ValueError, SyntaxError: If a piece is not a valid Python literal.
        TypeError: If a parsed piece contains non-string items.
    """
    # Mark each list boundary with ":|:" so the text can be split into the
    # individual list literals.
    pieces = str_of_lists.replace(" ", "").replace("][", "]:|:[").split(":|:")
    # ast.literal_eval rather than eval: this text is read from a data file,
    # and literal_eval only accepts literals, so it can never execute code.
    return " ".join(" ".join(ast.literal_eval(piece)) for piece in pieces)

def write_list_to_lines(lst, filename, encoding="utf-8"):
    """Write each item of ``lst`` to ``filename`` on its own line.

    Items are converted with ``str()`` and terminated with a newline. Any
    existing file at ``filename`` is overwritten.

    Args:
        lst: Iterable of items to write.
        filename: Path of the text file to create or overwrite.
        encoding: Text encoding for the file. Defaults to UTF-8 so the
            output does not depend on the platform's locale encoding.
    """
    with open(filename, "wt", encoding=encoding) as f:
        f.writelines(str(item) + "\n" for item in lst)

In [None]:
# Flatten the userReviews and movieReviews columns of the training rows
# into one sentence per row.
u_reviews = list(map(to_single_sentence, train["userReviews"]))
i_reviews = list(map(to_single_sentence, train["movieReviews"]))

In [None]:
# Flatten the userReviews and movieReviews columns of the test rows
# into one sentence per row.
u_reviews_test = list(map(to_single_sentence, test["userReviews"]))
i_reviews_test = list(map(to_single_sentence, test["movieReviews"]))

In [None]:
# Write the flattened reviews, one per line, for each split.
write_list_to_lines(u_reviews, "../data/train_u_reviews.txt")
# train_i_reviews.txt must hold the movie-side reviews (i_reviews), not u_reviews.
write_list_to_lines(i_reviews, "../data/train_i_reviews.txt")
write_list_to_lines(u_reviews_test, "../data/test_u_reviews.txt")
write_list_to_lines(i_reviews_test, "../data/test_i_reviews.txt")

In [None]:
# Write the "overall" column of each split, one value per line.
for ratings_path, split_frame in (
    ("../data/train_ratings.txt", train),
    ("../data/test_ratings.txt", test),
):
    write_list_to_lines(list(split_frame["overall"]), ratings_path)

In [None]:
# Save both splits as CSV files.
for csv_path, split_frame in (
    ("../data/train.csv", train),
    ("../data/test.csv", test),
):
    split_frame.to_csv(csv_path)