# Build a recommender system via collaborative filtering

In [31]:
from collections import defaultdict
from functools import partial
import re
import os
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split

sys.path.append("../src")
from manipulate_data import parse_date_in_title, get_value_for_key

In [34]:
def get_top_k(predictions, k=10):
    """Group predictions by user and keep each user's k best-estimated items.

    Args:
        predictions (list of Prediction objects): Predictions as returned by
            the test method of an algorithm. Each one unpacks into five
            fields: user id, item id, true rating, estimate, details.
        k (int): The number of recommendations to keep for each user.
            Default is 10.

    Returns:
        A dict where keys are user (raw) ids and values are lists of at most
        k tuples ``(raw item id, rating estimation)``, ordered from the
        highest estimate to the lowest.
    """
    # Bucket every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each user's pairs by estimate (descending) and truncate to k.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:k]

    return per_user


## Load and prepare data

In [2]:
# Load the movie catalogue and derive a `date` column by parsing each title.
df_movies = (
    pd.read_csv("../data/movies_small.csv")
    .assign(date=lambda movies: movies["title"].apply(parse_date_in_title))
)
df_movies.head()

Unnamed: 0,movieId,title,genres,date
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [3]:
# Sanity check performed earlier: sum(<movies frame>.movieId.duplicated()) == 0,
# i.e. each movieId is unique (the original note used `df`, presumably df_movies).
# Look up the title stored for movieId 1.
get_value_for_key(df_movies, key=1, col_key='movieId', col_value='title')

'Toy Story (1995)'

In [4]:
# Load the user ratings (userId, movieId, rating, timestamp): the main input
# for the collaborative filtering.
df_ratings = pd.read_csv("../data/ratings_small.csv")
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
# add date to rating data
# Drop any `date` column left over from a previous execution first, so that
# re-running this cell does not create `date_x` / `date_y` duplicates.
df_ratings = df_ratings.drop(columns='date', errors='ignore').merge(
    df_movies[['movieId', 'date']],
    how='left',
    on='movieId',
    # Fail loudly if a movieId appears more than once in df_movies, which
    # would otherwise silently duplicate rating rows.
    validate='many_to_one',
)

## Very quick EDA on the ratings

In [6]:
# Count how many ratings each movie received (one row per movieId).
df_ratings_by_movie = (
    df_ratings.groupby("movieId")["userId"]
    .count()
    .to_frame(name="nb_ratings")
)

In [7]:
# Summary statistics of the number of ratings per movie.
df_ratings_by_movie.describe()

Unnamed: 0,nb_ratings
count,9724.0
mean,10.369807
std,22.401005
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,329.0


In [8]:
# Count how many ratings each user submitted (one row per userId), then
# summarise the distribution of those counts.
df_ratings_by_user = (
    df_ratings.groupby("userId")["movieId"]
    .count()
    .to_frame(name="nb_ratings")
)
df_ratings_by_user.describe()

Unnamed: 0,nb_ratings
count,610.0
mean,165.304918
std,269.480584
min,20.0
25%,35.0
50%,70.5
75%,168.0
max,2698.0


## Build recommender

In [9]:
# Seed both the train/test split and the SVD initialisation so that a
# Restart & Run All reproduces the same model and metrics.
SEED = 42

# Derive the rating scale from the data rather than relying on Reader's
# default of (1, 5): estimates are clipped to this range.
reader = Reader(rating_scale=(df_ratings['rating'].min(), df_ratings['rating'].max()))
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
data_train, data_test = train_test_split(data, test_size=.33, random_state=SEED)
svd = SVD(random_state=SEED)
#TODO: add hyperparameter selection
svd.fit(data_train)
# Predictions are computed on the held-out split only.
predictions = svd.test(data_test)
accuracy.rmse(predictions) #TODO: use another metric that takes ranking into account

RMSE: 0.8815


0.8815030957754424

In [35]:
# Top-10 (default k) estimated ratings per user. `predictions` comes from
# svd.test(data_test), so only held-out (user, movie) pairs are ranked here.
get_top_k(predictions)

defaultdict(list,
            {356: [(1197, 4.8895588260571525),
              (4226, 4.654940962116936),
              (720, 4.632539204694965),
              (2968, 4.616073129140951),
              (1732, 4.585256211721486),
              (50, 4.57649008199468),
              (7438, 4.56039035632811),
              (1270, 4.537009767557239),
              (4993, 4.5064039764482455),
              (1148, 4.503072920897152)],
             477: [(608, 4.600234619043931),
              (589, 4.560384032605661),
              (1193, 4.5512648277720045),
              (593, 4.528543926741909),
              (1225, 4.512658048248036),
              (5618, 4.500867672454832),
              (1276, 4.491642879398761),
              (1136, 4.471128063808701),
              (1036, 4.416003647761196),
              (356, 4.398173586487599)],
             469: [(904, 4.819443088971735),
              (260, 4.607118482281434),
              (858, 4.475714745112129),
              (1214, 4.47397000

## Evaluate

In [10]:
# select a sample of distinct users to evaluate the model
# NOTE: the rows are taken from the full df_ratings table, which also contains
# the ratings the model was fitted on, so this comparison is optimistic.
rng = np.random.RandomState(42)  # fixed seed: same sample on every run
all_users = df_ratings_by_user.index.values
test_users = rng.choice(
    all_users,
    size=min(100, len(all_users)),
    replace=False,  # without replacement, otherwise fewer distinct users are drawn
)
df_test = df_ratings[df_ratings['userId'].isin(test_users)].copy()
# predict ratings
df_test['predicted_rating'] = [
    svd.predict(uid=user_id, iid=movie_id).est
    for user_id, movie_id in zip(df_test['userId'], df_test['movieId'])
]

In [11]:
# Distribution of the predicted rating for each actual rating value.
fig = px.violin(
    df_test,
    x="rating",
    y="predicted_rating",
    title="Predicted vs. actual ratings (sampled users)",
    labels={"rating": "Actual rating", "predicted_rating": "Predicted rating"},
)
fig.show()

## Give recommendations

In [12]:
k = 10
uid = 5
# Ratings of the selected user, best-rated first.
df_user = (
    df_ratings[df_ratings['userId'] == uid]
    .sort_values('rating', ascending=False)
    .reset_index(drop=True)
)
# find top k favorite movies
# Positional slice: label-based `.loc[:k]` is inclusive and returns k + 1 rows.
actual_top_k = df_user['movieId'].iloc[:k].values

# predict top k favorite movies (use `uid`, not a hard-coded user id)
df_user['predicted_rating'] = df_user['movieId'].apply(
    lambda movie_id: svd.predict(uid=uid, iid=movie_id).est
)
predicted_top_k = df_user.nlargest(k, 'predicted_rating')['movieId'].values

# Rich display of the user's actual and predicted ratings.
df_user

    userId  movieId  rating  timestamp  date  predicted_rating
0        5      590     5.0  847434747  1990          4.124455
1        5      475     5.0  847435311  1993          4.095349
2        5      527     5.0  847434960  1993          4.513377
3        5       58     5.0  847435238  1994          4.117533
4        5      596     5.0  847435292  1940          3.785320
5        5      595     5.0  847434832  1991          3.687081
6        5      247     5.0  847435337  1994          4.065591
7        5      594     5.0  847435238  1937          3.566950
8        5      296     5.0  847434748  1994          4.412580
9        5      290     5.0  847435311  1994          4.149472
10       5        1     4.0  847434962  1995          3.872513
11       5      367     4.0  847435129  1994          3.467039
12       5       21     4.0  847435238  1995          3.763320
13       5      531     4.0  847435258  1993          3.414757
14       5      588     4.0  847434801  1992          3

In [13]:
# Predicted rating against actual rating for the selected user's movies.
fig = px.scatter(
    df_user,
    x="rating",
    y="predicted_rating",
    title="Predicted vs. actual ratings for the selected user",
    labels={"rating": "Actual rating", "predicted_rating": "Predicted rating"},
)
fig.show()

In [14]:
# Look up the title stored for movieId 515.
get_value_for_key(df_movies, key=515, col_key='movieId', col_value='title')

'Remains of the Day, The (1993)'