In [1]:
import numpy as np
import pandas as pd

In [2]:
import random
from scipy import sparse

In [3]:
import csv
from collections import defaultdict

In [4]:
# !pip install tensorrec

In [9]:
import tensorrec

In [6]:
# Directory containing the processed CSV inputs. File names are appended to it
# by plain string concatenation, so the trailing slash is required.
PATH_TO_DATA = '../data/processed/'

In [7]:
# %%time

# df_train = pd.read_csv(PATH_TO_DATA + "train.csv")
# df_test = pd.read_csv(PATH_TO_DATA + "test.csv")

In [8]:
# df_train.head()

In [None]:
# df_full = pd.read_csv(PATH_TO_DATA + "full_rating.csv")

In [None]:
# df_full.head()

### Trying out tensorrec

In [None]:
# List the configured data directory instead of a machine-specific absolute
# path, so this cell works wherever PATH_TO_DATA is valid.
!ls {PATH_TO_DATA}

In [10]:
%%time
print('Loading ratings')
# newline='' hands line-ending handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open(PATH_TO_DATA + "full_rating.csv", 'r', newline='') as ratings_file:
    ratings_file_reader = csv.reader(ratings_file)
    # Read the header row first rather than popping it off the front of the
    # full list afterwards (list.pop(0) shifts every remaining row).
    raw_ratings_header = next(ratings_file_reader, None)
    if raw_ratings_header is None:
        raise ValueError("full_rating.csv is empty: expected a header row")
    # Every remaining row is a list of strings, one per CSV field.
    raw_ratings = list(ratings_file_reader)

Loading ratings
CPU times: user 2.84 s, sys: 184 ms, total: 3.03 s
Wall time: 2.97 s


In [11]:
# Maps from the original yoochoose IDs to dense, zero-based internal IDs.
# Looking up an unseen key makes the defaultdict call its factory, which returns
# the dict's current size, so IDs are handed out as 0, 1, 2, ... in first-seen
# order. Each lambda refers to its dict by name, so these variables must keep
# exactly these names.

yoochoose_to_internal_user_ids = defaultdict(lambda: len(yoochoose_to_internal_user_ids))
yoochoose_to_internal_item_ids = defaultdict(lambda: len(yoochoose_to_internal_item_ids))

In [12]:
%%time
for row in raw_ratings:
    # csv.reader yields every field as a string; once a row has been converted
    # its first field is an int. Skipping converted rows keeps this cell
    # idempotent: re-running it would otherwise push internal IDs back through
    # the maps and mint bogus new IDs (inflating n_users / n_items).
    if not isinstance(row[0], str):
        continue
    row[0] = yoochoose_to_internal_user_ids[int(row[0])]
    row[1] = yoochoose_to_internal_item_ids[int(row[1])]
    row[2] = float(row[2])
    # row[3] (the time field) is left as the original string.

CPU times: user 1.68 s, sys: 4 ms, total: 1.69 s
Wall time: 1.69 s


In [13]:
# Each map holds one entry per distinct ID it has been asked for, so its size
# is the number of users / items.
n_users, n_items = map(len, (yoochoose_to_internal_user_ids,
                             yoochoose_to_internal_item_ids))

# Show the CSV header next to the first converted row as a sanity check
print(
    "Raw ratings example:\n{}\n{}".format(
        raw_ratings_header,
        raw_ratings[0],
    )
)

Raw ratings example:
['SessionId', 'ItemId', 'ActionScore', 'Time']
[0, 0, 5.0, '1396851560.9789999']


In [14]:
# Shuffle the ratings and split them into train/test sets (70%/30%).
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts. Sampling a shuffled copy (instead of shuffling raw_ratings in place)
# means re-running this cell yields the same split every time.

SPLIT_SEED = 42
TRAIN_FRACTION = .7

# Random.sample with k == len(population) returns a new list holding every
# element in random order; raw_ratings itself is left untouched.
shuffled_ratings = random.Random(SPLIT_SEED).sample(raw_ratings, len(raw_ratings))
cutoff = int(TRAIN_FRACTION * len(shuffled_ratings))
train_ratings = shuffled_ratings[:cutoff]
test_ratings = shuffled_ratings[cutoff:]
print("{} train ratings, {} test ratings".format(len(train_ratings), len(test_ratings)))

876967 train ratings, 375844 test ratings


In [43]:
# Peek at the first five training rows: [user id, item id, score, time string].
train_ratings[:5]

[[16476, 3440, 1.0, '1402379948.725'],
 [40385, 5024, 1.0, '1411418028.4220002'],
 [19677, 555, 1.0, '1404583144.726'],
 [17754, 857, 1.0, '1403099825.221'],
 [17689, 2746, 1.0, '1403432447.2779999']]

In [58]:
# Take two training rows as a tiny fixture for trying out the sparse-matrix
# construction in the following cells.
temp_interaction = train_ratings[0:2]
temp_interaction

[[16476, 3440, 1.0, '1402379948.725'],
 [40385, 5024, 1.0, '1411418028.4220002']]

In [59]:
# Transpose the rows into one tuple per column; the fourth column (time) is discarded.
users_column, items_column, ratings_column, _ = zip(*temp_interaction)

In [61]:
# Inspect the three column tuples produced by the transpose.
users_column, items_column, ratings_column

((16476, 40385), (3440, 5024), (1.0, 1.0))

In [67]:
# Build a COO matrix from the two sample interactions and densify it for display.
# NOTE(review): .toarray() materialises the full n_users x n_items array just to
# print a truncated view; displaying the sparse matrix itself would be cheaper.
sparse.coo_matrix((ratings_column, (users_column, items_column)),
                             shape=(n_users, n_items)).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
# Number of distinct users and items, i.e. the shape used for the interaction matrices.
n_users, n_items

(42144, 5120)

In [15]:
%%time
# Convert a list of (user, item, rating, time) rows into a sparse matrix.

def interactions_list_to_sparse_matrix(interactions, shape=None):
    """Build a user-by-item COO rating matrix from interaction rows.

    Args:
        interactions: Iterable of 4-item rows ``(user, item, rating, time)``,
            where ``user`` and ``item`` are zero-based integer indices into
            the matrix. The ``time`` field is ignored.
        shape: Optional ``(n_rows, n_cols)`` for the result. When omitted, the
            notebook-level ``(n_users, n_items)`` is used, looked up at call
            time exactly as before.

    Returns:
        A ``scipy.sparse.coo_matrix`` with ``rating`` stored at
        ``[user, item]`` for every row; an all-empty matrix of the requested
        shape when ``interactions`` is empty.
    """
    if shape is None:
        shape = (n_users, n_items)

    # Transpose rows into columns. An empty input produces no columns at all,
    # which would make the 4-way unpacking below fail, so handle it first.
    columns = list(zip(*interactions))
    if not columns:
        return sparse.coo_matrix(tuple(shape))

    users_column, items_column, ratings_column, _ = columns
    return sparse.coo_matrix((ratings_column, (users_column, items_column)),
                             shape=shape)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


In [16]:
%%time
# Turn both interaction lists into user-by-item sparse matrices (train first,
# then test)

sparse_train_ratings, sparse_test_ratings = map(
    interactions_list_to_sparse_matrix,
    (train_ratings, test_ratings),
)

CPU times: user 5.45 s, sys: 0 ns, total: 5.45 s
Wall time: 5.43 s


In [17]:
%%time
# Indicator features: one identity matrix sized by the user count and one sized
# by the item count, so every user / item gets its own feature column.

user_indicator_features, item_indicator_features = (
    sparse.identity(size) for size in (n_users, n_items)
)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 369 µs


In [18]:
%%time
# Build a matrix factorization collaborative filter model.
# n_components is hard-coded to 5 here; the WMRB model built later in this
# notebook uses the same value.

cf_model = tensorrec.TensorRec(n_components=5)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 47.2 µs


In [69]:
%%time
# Dense copies of the training matrix and both indicator-feature matrices, made
# to try fitting on dense input (see the commented-out cell that follows).
# NOTE(review): the user indicator matrix alone densifies to n_users x n_users,
# and none of these arrays is used by any live cell — consider deleting this cell.
dense_train_ratings = sparse_train_ratings.toarray()
dense_user_indicator_features = user_indicator_features.toarray()
dense_item_indicator_features = item_indicator_features.toarray()

CPU times: user 0 ns, sys: 4.32 s, total: 4.32 s
Wall time: 4.32 s


In [74]:
# %%time
# # Fit the collaborative filter model

# print("Training collaborative filter")

# cf_model.fit(interactions=dense_train_ratings,
#              user_features=dense_user_indicator_features,
#              item_features=dense_item_indicator_features)

# # ValueError: Input must be a scipy sparse matrix, an iterable of scipy sprase matrices, or a TensorFlow Dataset


In [19]:
%%time
# Fit the collaborative filter model on the sparse training interactions,
# using the identity (indicator) features for both users and items.
# The inputs are scipy sparse matrices: the dense attempt recorded in the
# commented-out cell above was rejected with a ValueError.

print("Training collaborative filter")

cf_model.fit(interactions=sparse_train_ratings,
             user_features=user_indicator_features,
             item_features=item_indicator_features)

Training collaborative filter


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


CPU times: user 37.9 s, sys: 2.48 s, total: 40.4 s
Wall time: 11.4 s


In [20]:
# # Create sets of train/test interactions that are only ratings >= 4.0

# sparse_train_ratings_4plus = sparse_train_ratings.multiply(sparse_train_ratings >= 4.0)
# sparse_test_ratings_4plus = sparse_test_ratings.multiply(sparse_test_ratings >= 4.0)

In [30]:
# Print recall@k on the train and test interactions for a set of predicted item ranks.

def check_results(ranks, k=10):
    """Print the mean recall@k of ``ranks`` against the train and test ratings.

    Args:
        ranks: Predicted item ranks, as returned by a model's
            ``predict_rank`` call elsewhere in this notebook.
        k: Recall cut-off passed to ``tensorrec.eval.recall_at_k``. Defaults
            to 10, which reproduces the original "Recall at 10" report.

    Returns:
        None. The two scores are printed on a single line.

    Reads the notebook-level ``sparse_train_ratings`` and
    ``sparse_test_ratings`` at call time.
    """
    def mean_recall(interactions):
        # Same evaluation for both splits; .mean() collapses the values
        # returned by recall_at_k into a single score.
        return tensorrec.eval.recall_at_k(
            test_interactions=interactions,
            predicted_ranks=ranks,
            k=k
        ).mean()

    train_recall = mean_recall(sparse_train_ratings)
    test_recall = mean_recall(sparse_test_ratings)

    print("Recall at {}: Train: {:.4f} Test: {:.4f}".format(k,
                                                            train_recall,
                                                            test_recall))

#    print("Recall at 10: Test: {:.4f}".format(test_recall_at_10))


In [31]:
%%time
# Check the results of the MF CF model.
# This cell only computes the ranks; they are scored by check_results in the
# next cell.

print("Matrix factorization collaborative filter:")

predicted_ranks = cf_model.predict_rank(user_features=user_indicator_features,
                                        item_features=item_indicator_features)

Matrix factorization collaborative filter:
CPU times: user 38.6 s, sys: 1.2 s, total: 39.8 s
Wall time: 5.18 s


In [32]:
check_results(predicted_ranks)

Recall at 10: Train: 0.0120 Test: 0.0150


In [33]:
# Let's try a new loss function: WMRB.
# Note: despite the printed message, this cell only constructs the model;
# the actual fit happens in the next cell.

print("Training collaborative filter with WMRB loss")

ranking_cf_model = tensorrec.TensorRec(n_components=5,
                                       loss_graph=tensorrec.loss_graphs.WMRBLossGraph())

Training collaborative filter with WMRB loss


In [35]:
%%time
# Fit the WMRB model on the same inputs as the first model.
# n_sampled_items is set to 1% of the item count (truncated to an int).
# The recorded run of this cell took roughly 16 minutes of wall time.
ranking_cf_model.fit(interactions=sparse_train_ratings,
                     user_features=user_indicator_features,
                     item_features=item_indicator_features,
                     n_sampled_items=int(n_items * .01))

CPU times: user 17min 43s, sys: 1min 23s, total: 19min 6s
Wall time: 15min 54s


In [36]:
%%time
# Check the results of the WMRB MF CF model.
# NOTE: this rebinds predicted_ranks, replacing the ranks computed earlier for
# the first model, so any later cell sees the WMRB ranks.

print("WMRB matrix factorization collaborative filter:")
predicted_ranks = ranking_cf_model.predict_rank(user_features=user_indicator_features,
                                                item_features=item_indicator_features)

WMRB matrix factorization collaborative filter:
CPU times: user 37.4 s, sys: 1.19 s, total: 38.6 s
Wall time: 5.04 s


In [37]:
%%time
# Score whatever predicted_ranks currently holds (the WMRB model's ranks when
# the cells are run in order).
check_results(predicted_ranks)

Recall at 10: Train: 0.1745 Test: 0.1732
CPU times: user 2.62 s, sys: 440 ms, total: 3.06 s
Wall time: 3.06 s
