In [28]:
from collections import defaultdict
import pandas as pd

from surprise import SVD
from surprise import Dataset, Reader

In [29]:
def get_top_n(predictions, n=10):
    '''Build per-user top-N recommendation lists from rating predictions.

    Args:
        predictions(iterable of 5-item records): Each record unpacks to
            (user id, item id, true rating, estimated rating, details). Only
            the user id, item id and estimated rating are used.
        n(int): How many items to keep for each user. Default is 10.

    Returns:
    A defaultdict(list) whose keys are user ids and whose values are lists of
        (item id, estimated rating) tuples, ordered from the highest estimate
        to the lowest and cut to at most n entries. Items with equal
        estimates keep the order in which they appeared in the input.
    '''

    # Bucket every (item, estimate) pair under the user it was predicted for.
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each bucket by estimate, best first, and keep the leading n entries.
    # Iterate over a snapshot of the keys because values are reassigned.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user



In [53]:
# Read item info (movie metadata).
movies = pd.read_csv('ml-latest-small/movies.csv', low_memory=False)
# Select the row(s) for movieId 4 and show the second column of the first
# match, which is where the movie title lives in this file.
my_movie = movies[movies['movieId'] == 4]
my_movie.iloc[0, 1]

'Waiting to Exhale (1995)'

In [31]:
# First train an SVD algorithm on the movielens dataset.
# Each line of ratings.csv is split on ',' into user, item, rating and
# timestamp fields.
# NOTE(review): no skip_lines argument is given to Reader, so this presumably
# expects ratings.csv to have no header row — confirm against the data file.
reader = Reader(line_format='user item rating timestamp', sep=',')
data = Dataset.load_from_file('ml-latest-small/ratings.csv', reader)
# Train on every rating in the file; nothing is held out for evaluation.
trainset = data.build_full_trainset()
# NOTE(review): SVD is built with default arguments and no seed is set here,
# so results may differ between runs — TODO confirm whether that matters.
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcdc8c5fcc0>

In [32]:
# Then predict ratings for all pairs (u, i) that are NOT in the training set.
# This covers every user, so the resulting list is large (millions of entries
# for this dataset).
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [33]:
# Number of (user, item) predictions returned by algo.test.
len(predictions)

5830804

In [34]:
# Keep only the predictions made for the user whose raw id is the string '2',
# then show how many there are.
user_filtered = [prediction for prediction in predictions if prediction.uid == '2']
len(user_filtered)

9695

In [35]:
# Keep the 10 highest-estimated items per user; user_filtered only holds
# predictions for user '2', so top_n ends up with that single key.
top_n = get_top_n(user_filtered, n=10)

In [58]:
# Print the recommended movie titles for each user.
# The loop previously had only a commented-out body, which is a syntax error.
# Map movieId -> title once instead of scanning the frame per item; the title
# is the second column of `movies`.
title_by_movie_id = dict(zip(movies['movieId'], movies.iloc[:, 1]))
for uid, user_ratings in top_n.items():
    # Raw ids in the predictions are strings (e.g. uid '2'), while `movies`
    # is matched against integer movieIds, so cast before the lookup.
    # Fall back to the raw id when a movie is missing from the metadata.
    print(uid, [title_by_movie_id.get(int(iid), iid) for (iid, _) in user_ratings])


2 ['Forrest Gump (1994)', 'Lawrence of Arabia (1962)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Seven Samurai (Shichinin no samurai) (1954)', 'Cinema Paradiso (Nuovo cinema Paradiso) (1989)', 'Goodfellas (1990)', 'Rear Window (1954)', "Rosemary's Baby (1968)", 'Pulp Fiction (1994)']
