# Build a recommender system via collaborative filtering

In [1]:
from collections import defaultdict
from functools import partial
import json
import re
import os
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split

sys.path.append("../src")
from manipulate_data import parse_date_in_title, get_value_for_key

In [2]:
def get_top_k(predictions, k=10):
    """Return the top-k recommendation for each user from a set of predictions.

    Args:
        predictions (list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        k (int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size k.
    """

    # First map the predictions to each user.
    top_k = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_k[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_k.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k[uid] = user_ratings[:k]

    return top_k


In [3]:
year_recommendations = 2002

## Load and prepare data

In [4]:
df_movies = pd.read_csv("../data/movies_small.csv")
# parse date
df_movies['date'] = df_movies['title'].apply(parse_date_in_title)
df_movies.head(5)

Unnamed: 0,movieId,title,genres,date
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II (1995),Comedy,1995.0


In [5]:
#sum(df.movieId.duplicated()) # each movieId is unique
get_value_for_key(df_movies, key=1, col_key='movieId', col_value='title')

'Toy Story (1995)'

In [6]:
# open csv with the users and ratings, main data for the collaborative filtering
df_ratings = pd.read_csv("../data/ratings_small.csv")
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
# add date to rating data
df_ratings = df_ratings.merge(
    df_movies[['movieId', 'date']], 
    how='left', 
    on='movieId'
)

In [8]:
ratings_per_year = pd.value_counts(df_ratings.date).reset_index()
ratings_per_year.columns = ['year', 'nb_ratings']
fig = px.bar(ratings_per_year, x='year', y='nb_ratings')
fig.show()

In [9]:
# find most rated movies in 2002
df_ratings[df_ratings.date == year_recommendations]

Unnamed: 0,userId,movieId,rating,timestamp,date
288,3,5048,0.5,1306464284,2002.0
952,7,5218,3.5,1106712866,2002.0
953,7,5349,3.5,1106636606,2002.0
954,7,5378,0.5,1106635735,2002.0
955,7,5445,4.0,1106636707,2002.0
...,...,...,...,...,...
100003,610,7090,3.0,1493845971,2002.0
100037,610,7981,4.0,1493847542,2002.0
100098,610,27592,5.0,1479545128,2002.0
100415,610,72424,0.5,1493849030,2002.0


## Very quick EDA on the ratings

In [10]:
df_ratings_by_movie = df_ratings[['movieId', 'userId']].groupby("movieId").agg({
    'userId': 'count'
})
df_ratings_by_movie.columns = ['nb_ratings']

In [11]:
df_ratings_by_movie.describe()

Unnamed: 0,nb_ratings
count,9724.0
mean,10.369807
std,22.401005
min,1.0
25%,1.0
50%,3.0
75%,9.0
max,329.0


In [12]:
df_ratings_by_user = df_ratings[['movieId', 'userId']].groupby("userId").agg({
    'movieId': 'count'
})
df_ratings_by_user.columns = ['nb_ratings']
df_ratings_by_user.describe()

Unnamed: 0,nb_ratings
count,610.0
mean,165.304918
std,269.480584
min,20.0
25%,35.0
50%,70.5
75%,168.0
max,2698.0


## Build recommender

In [13]:
reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
data_train, data_test = train_test_split(data, test_size=.33)
svd = SVD()
#TODO: add hyperparameter selection
svd.fit(data_train)
predictions = svd.test(data_test)
accuracy.rmse(predictions) #TODO: use other metric to taking ranking into account

RMSE: 0.8762


0.8761530865974416

In [14]:
get_top_k(predictions)

defaultdict(list,
            {186: [(750, 4.77976247658679),
              (1210, 4.61603926121402),
              (1259, 4.612854820289875),
              (260, 4.577949278551107),
              (1214, 4.571718156503191),
              (593, 4.5670609678254515),
              (2529, 4.565069177282738),
              (2186, 4.53741795428313),
              (1265, 4.521761168764964),
              (3740, 4.5102426994984155)],
             387: [(318, 4.377041416929698),
              (1079, 4.201325930918941),
              (1262, 4.130130469804977),
              (904, 4.074397008839559),
              (866, 4.072017500040394),
              (1223, 4.06548638591659),
              (260, 4.05480069049752),
              (1204, 4.050710641375378),
              (1136, 4.048084337024931),
              (1242, 4.043941985996298)],
             460: [(2959, 4.595313574178711),
              (68954, 4.54335655188893),
              (60069, 4.500735397587651),
              (1704, 4.49722224

## Evaluate

In [15]:
df_test = pd.read_json(json.dumps(predictions), orient='records')
df_test.columns = ['userId', 'movieId', 'rating', 'predicted_rating', 'log']
df_test = df_test.merge(df_movies[['movieId', 'date']], how='left')

In [16]:
# the segmentation could really be improved #TODO: hyperparameter tuning
fig = go.Figure()
fig.add_trace(go.Violin(
    x=df_test['rating'],
    y=df_test['predicted_rating'],
    name='Distribution of predictions'
))    
fig.add_trace(go.Scatter(
    x=np.arange(0.5, 5.5, 0.5),
    y=np.arange(0.5, 5.5, 0.5),
    name='y=x',
    line=dict(color='grey', width=4, dash='dash') 
))

fig.show()

In [17]:
# Within 2002, the segmentation seems actually slightly better, except for very low ratings (#TODO EDA on low ratings)
fig = go.Figure()
fig.add_trace(go.Violin(
    x=df_test.loc[df_test.date == year_recommendations, 'rating'],
    y=df_test.loc[df_test.date == year_recommendations, 'predicted_rating'],
    name='Distribution of predictions'
))    
fig.add_trace(go.Scatter(
    x=np.arange(0.5, 5.5, 0.5),
    y=np.arange(0.5, 5.5, 0.5),
    name='y=x',
    line=dict(color='grey', width=4, dash='dash') 
))

fig.show()

## Give recommendations

In [18]:
# predicted ratings for an user for the year
userId = 169
k = 3
df_user = df_test[
    (df_test['userId'] == userId)
    & (df_test['date'] == year_recommendations)
].sort_values('rating', ascending=False).reset_index(drop=True)
df_user = df_user.merge(df_movies[['movieId', 'title']], how='left')

In [19]:
fig = px.scatter(df_user, x="rating", y="predicted_rating", hover_data=['title'])
fig.show()

## Interprete

In [20]:
# https://github.com/NicolasHug/Surprise/issues/119 latent user factors with algo.pu and item factors with algo.qi
svd.pu.shape

(610, 100)

In [21]:
data_train.all_users()

range(0, 610)

In [22]:
svd.qi.shape

(8404, 100)

In [23]:
data_train.all_items()

range(0, 8404)