In [20]:
import numpy as np
import random
import scipy.sparse as sp

In [21]:
def make_test_ratings(path):
    """Pick each user's most recent rating as that user's held-out test item.

    Reads a ``user::item::rating::timestamp`` file line by line. User and item
    ids are shifted down by one so that they become zero-based.

    Args:
        path: Path of the ``::``-separated ratings file.

    Returns:
        A ``(rating_list, test_ratings)`` pair. ``test_ratings`` maps each
        zero-based user id to the ``(item, timestamp)`` tuple of the rating
        with the largest timestamp; on a timestamp tie the record that appears
        first in the file is kept. ``rating_list`` holds one ``[user, item]``
        pair per user, in order of each user's first appearance in the file.
    """
    test_ratings = {}
    with open(path, 'r') as f:
        for line in f:
            arr = line.split('::')
            user, item, timestamp = int(arr[0]) - 1, int(arr[1]) - 1, int(arr[3])
            latest = test_ratings.get(user)
            # Entries are (item, timestamp): compare against the stored
            # timestamp at index 1, not the item id at index 0.
            if latest is None or latest[1] < timestamp:
                test_ratings[user] = (item, timestamp)

    rating_list = [[user, held_out[0]]
                   for user, held_out in test_ratings.items()]

    return rating_list, test_ratings

In [1]:
def make_train_ratings(path, test_ratings_dict):
    """Build the training rows and the binary interaction matrix.

    The ``user::item::rating::timestamp`` file is read twice: once to find the
    largest raw user and item ids (which size the matrix), and once to collect
    the rows. User and item ids are shifted down by one in the output.

    Args:
        path: Path of the ``::``-separated ratings file.
        test_ratings_dict: Mapping from zero-based user id to a sequence whose
            first element is that user's held-out item; rows matching it are
            left out of both return values.

    Returns:
        A ``(ratings, matrix)`` pair. ``ratings`` is a list of
        ``[user, item, rating, timestamp]`` rows. ``matrix`` is a float32
        ``dok_matrix`` with 1.0 at ``[user, item]`` for every kept row whose
        rating is greater than zero.
    """
    # Pass 1: the largest raw (one-based) ids give the matrix dimensions.
    max_user_id, max_item_id = 0, 0
    with open(path, 'r') as handle:
        for row in handle:
            fields = row.split('::')
            raw_user, raw_item = fields[0], fields[1]
            max_user_id = max(max_user_id, int(raw_user))
            max_item_id = max(max_item_id, int(raw_item))

    matrix = sp.dok_matrix((max_user_id, max_item_id), dtype=np.float32)

    # Pass 2: collect every row except each user's held-out item.
    ratings = []
    with open(path, 'r') as handle:
        for row in handle:
            fields = row.split('::')
            user = int(fields[0]) - 1
            item = int(fields[1]) - 1
            rating = int(fields[2])
            timestamp = int(fields[3])
            if test_ratings_dict[user][0] == item:
                continue
            if rating > 0:
                matrix[user, item] = 1.0
            ratings.append([user, item, rating, timestamp])

    return ratings, matrix

In [44]:
def make_test_negatives(train_matrix, test_ratings, num_negatives=99):
    """Sample, for every user, items to rank against the held-out test item.

    Args:
        train_matrix: Sparse user-by-item matrix; a user's non-zero columns
            are treated as items the user already interacted with.
        test_ratings: Sequence indexed by user id whose entries hold the
            held-out item at position 1 (read as ``test_ratings[user][1]``).
        num_negatives: How many negative items to draw per user.

    Returns:
        One ``[(user, test_item), negatives]`` entry per matrix row, in user
        order, where ``negatives`` is a list of ``num_negatives`` distinct
        item ids taken from the matrix's column range.

    Raises:
        ValueError: If a user has fewer than ``num_negatives`` candidate items.
    """
    num_users, num_items = train_matrix.shape
    test_negatives = []
    for user in range(num_users):
        test_item = test_ratings[user][1]
        excluded_items = set(train_matrix[user].nonzero()[1])
        # make_train_ratings leaves the held-out item out of the matrix, so it
        # must be excluded here or it could be drawn as its own negative.
        excluded_items.add(test_item)
        candidates = [
            item for item in range(num_items) if item not in excluded_items]
        if len(candidates) < num_negatives:
            raise ValueError(
                'user %d has only %d candidate items, cannot sample %d negatives'
                % (user, len(candidates), num_negatives))
        negatives = random.sample(candidates, num_negatives)

        test_negatives.append([(user, test_item), negatives])
    return test_negatives

In [62]:
# Read the raw ratings file into four parallel integer columns.
users, items, ratings, timestamps = [], [], [], []
with open('../Data/ml-1m/ratings.dat', 'r') as f:
    for line in f:
        # Every record has to split into exactly four '::'-separated fields.
        user, item, rating, timestamp = line.split('::')
        users.append(int(user))
        items.append(int(item))
        ratings.append(int(rating))
        timestamps.append(int(timestamp))

In [45]:
# Keep both results: the [user, item] list and the per-user lookup dict.
test_ratings, test_ratings_dict = make_test_ratings('../Data/ml-1m/ratings.dat')

In [46]:
# Training rows and interaction matrix, built from the same ratings file.
ratings, train_matrix = make_train_ratings('../Data/ml-1m/ratings.dat',
                                           test_ratings_dict)

In [53]:
# Count how many rows in `ratings` reference each item id (row position 1).
items = {}
for i in ratings:
    items[i[1]] = items.get(i[1], 0) + 1

In [47]:
# Seed the sampler so the negatives written to disk are the same on every run.
NEGATIVE_SAMPLING_SEED = 42
random.seed(NEGATIVE_SAMPLING_SEED)
test_negatives = make_test_negatives(train_matrix, test_ratings)

In [48]:
# One training row per line, fields separated by tabs.
with open('./output/ml-1m.train.rating', 'w') as f:
    for i in ratings:
        print(*i, sep='\t', file=f)

In [49]:
# One test entry per line, fields separated by tabs.
with open('./output/ml-1m.test.rating', 'w') as f:
    for i in test_ratings:
        print(*i, sep='\t', file=f)

In [50]:
with open('./output/ml-1m.test.negative', 'w') as f:
    for i in test_negatives:
        # Line layout: str() of the first element, a tab, then the second
        # element's items joined by tabs.
        f.write(str(i[0]) + '\t' + '\t'.join(map(str, i[1])) + '\n')