# Build a recommender system via collaborative filtering

In [1]:
from collections import defaultdict
from functools import partial
import json
import re
import os
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split

sys.path.append("../src")
from manipulate_data import parse_date_in_title, get_value_for_key

In [2]:
def match_word_in_str(str_to_check, word, list_separator="|"):
    list_to_check = str_to_check.split(list_separator)
    if word in list_to_check:
        return 1
    return 0

def get_top_k(predictions, k=10):
    """Return the top-k recommendation for each user from a set of predictions.

    Args:
        predictions (list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        k (int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size k.
    """

    # First map the predictions to each user.
    top_k = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_k[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_k.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k[uid] = user_ratings[:k]

    return top_k


In [3]:
year_recommendations = 2002

## Load and prepare data

In [4]:
df_movies = pd.read_csv("../data/movies_small.csv")
# parse date
df_movies['date'] = df_movies['title'].apply(parse_date_in_title)
df_movies.head(5)

Unnamed: 0,movieId,title,genres,date
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0
4,5,Father of the Bride Part II (1995),Comedy,1995.0


In [5]:
list_genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
for genre in list_genres:
    func_find_genre = partial(match_word_in_str, word=genre, list_separator="|")
    df_movies["is_" + genre] = df_movies['genres'].apply(func_find_genre)

In [6]:
df_movies.sum()

movieId                                                   411115845
title             Toy Story (1995)Jumanji (1995)Grumpier Old Men...
genres            Adventure|Animation|Children|Comedy|FantasyAdv...
date                                                     19383561.0
is_Action                                                      1828
is_Adventure                                                   1263
is_Animation                                                    611
is_Children's                                                     0
is_Comedy                                                      3756
is_Crime                                                       1199
is_Documentary                                                  440
is_Drama                                                       4361
is_Fantasy                                                      779
is_Film-Noir                                                     87
is_Horror                                       

In [7]:
# open csv with the users and ratings, main data for the collaborative filtering
df_ratings = pd.read_csv("../data/ratings_small.csv")
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [8]:
# add date to rating data
df_ratings = df_ratings.merge(
    df_movies[['movieId', 'date']], 
    how='left', 
    on='movieId'
)

In [9]:
ratings_per_year = pd.value_counts(df_ratings.date).reset_index()
ratings_per_year.columns = ['year', 'nb_ratings']
fig = px.bar(ratings_per_year, x='year', y='nb_ratings')
fig.show()

In [10]:
# find most rated movies in 2002
df_ratings[df_ratings.date == year_recommendations]

Unnamed: 0,userId,movieId,rating,timestamp,date
288,3,5048,0.5,1306464284,2002.0
952,7,5218,3.5,1106712866,2002.0
953,7,5349,3.5,1106636606,2002.0
954,7,5378,0.5,1106635735,2002.0
955,7,5445,4.0,1106636707,2002.0
...,...,...,...,...,...
100003,610,7090,3.0,1493845971,2002.0
100037,610,7981,4.0,1493847542,2002.0
100098,610,27592,5.0,1479545128,2002.0
100415,610,72424,0.5,1493849030,2002.0


## Very quick EDA on the ratings

In [11]:
df_ratings_by_movie = df_ratings[['movieId', 'userId']].groupby("movieId").agg({
    'userId': 'count'
})
df_ratings_by_movie = df_ratings_by_movie.reset_index()
df_ratings_by_movie.columns = ['movieId', 'nb_ratings']

In [12]:
df_ratings_by_movie.describe()

Unnamed: 0,movieId,nb_ratings
count,9724.0,9724.0
mean,42245.024373,10.369807
std,52191.13732,22.401005
min,1.0,1.0
25%,3245.5,1.0
50%,7300.0,3.0
75%,76739.25,9.0
max,193609.0,329.0


In [13]:
df_movies = df_movies.merge(df_ratings_by_movie, how='left', on='movieId')
df_movies['nb_ratings'] = df_movies['nb_ratings'].fillna(0)

In [14]:
df_ratings_by_user = df_ratings[['movieId', 'userId']].groupby("userId").agg({
    'movieId': 'count'
})
df_ratings_by_user.columns = ['nb_ratings']
df_ratings_by_user.describe()

Unnamed: 0,nb_ratings
count,610.0
mean,165.304918
std,269.480584
min,20.0
25%,35.0
50%,70.5
75%,168.0
max,2698.0


## Build recommender

In [15]:
reader = Reader()
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
data_train, data_test = train_test_split(data, test_size=.25, random_state=42)
svd = SVD(random_state=42)
#TODO: add hyperparameter selection
svd.fit(data_train)
predictions = svd.test(data_test)
accuracy.rmse(predictions) #TODO: use other metric to taking ranking into account

RMSE: 0.8820


0.8820442070964672

In [16]:
get_top_k(predictions)

defaultdict(list,
            {50: [(1204, 3.7267437830524823),
              (2959, 3.459895749007173),
              (914, 3.418147342447325),
              (5618, 3.4048871858261127),
              (1136, 3.3373550357747193),
              (1281, 3.275875850600982),
              (1148, 3.253110511033512),
              (158238, 3.2520702098759475),
              (111, 3.233054950307342),
              (909, 3.1846405463616527)],
             603: [(1221, 4.737371397513856),
              (1172, 4.479717393349792),
              (2858, 4.454986308470075),
              (290, 4.351162613246912),
              (1193, 4.307413517684094),
              (858, 4.297098176278411),
              (1248, 4.228965373412181),
              (293, 4.1242697693189205),
              (223, 4.117830368251335),
              (1196, 4.097505202509978)],
             140: [(2947, 4.236231270538797),
              (1036, 4.19489384262814),
              (2067, 4.147739008122713),
              (1408, 4.

## Evaluate

In [17]:
df_test = pd.read_json(json.dumps(predictions), orient='records')
df_test.columns = ['userId', 'movieId', 'rating', 'predicted_rating', 'log']
df_test = df_test.merge(df_movies[['movieId', 'date']], how='left')

In [18]:
# the segmentation could really be improved #TODO: hyperparameter tuning
fig = go.Figure()
fig.add_trace(go.Violin(
    x=df_test['rating'],
    y=df_test['predicted_rating'],
    name='Distribution of predictions'
))    
fig.add_trace(go.Scatter(
    x=np.arange(0.5, 5.5, 0.5),
    y=np.arange(0.5, 5.5, 0.5),
    name='y=x',
    line=dict(color='grey', width=4, dash='dash') 
))

fig.show()

In [19]:
# Within 2002, the segmentation seems actually slightly better, except for very low ratings (#TODO EDA on low ratings and fine tune to get better performance)
fig = go.Figure()
fig.add_trace(go.Violin(
    x=df_test.loc[df_test.date == year_recommendations, 'rating'],
    y=df_test.loc[df_test.date == year_recommendations, 'predicted_rating'],
    name='Distribution of predictions'
))    
fig.add_trace(go.Scatter(
    x=np.arange(0.5, 5.5, 0.5),
    y=np.arange(0.5, 5.5, 0.5),
    name='y=x',
    line=dict(color='grey', width=4, dash='dash') 
))

fig.show()

## Give recommendations

### To a user in the dataset

In [20]:
# predicted ratings for an user for the year
userId = 169
k = 3
df_user = df_test[
    (df_test['userId'] == userId)
    & (df_test['date'] == year_recommendations)
].sort_values('rating', ascending=False).reset_index(drop=True)
df_user = df_user.merge(df_movies[['movieId', 'title']], how='left')

In [21]:
# There seem to be a positive relation between actual and predicted ratings, which is encouraging
fig = px.scatter(df_user, x="rating", y="predicted_rating", hover_data=['title'])
fig.show()

### To a fictitious user

In [22]:
find_movie_id = partial(get_value_for_key, df=df_movies, col_key='title', col_value='movieId')

In [23]:
# Create an user who rated the 10 most seen movies only
movies = [
    'Forrest Gump (1994)', 'Shawshank Redemption, The (1994)',
    'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)',
    'Matrix, The (1999)', 'Star Wars: Episode IV - A New Hope (1977)',
    'Jurassic Park (1993)', 'Braveheart (1995)',
    'Terminator 2: Judgment Day (1991)', "Schindler's List (1993)"
]
ratings = [10, 9, 8, 3, 3, 4, 5, 6, 4, 7]
df_fake_user = pd.DataFrame()
df_fake_user['title'] = movies
df_fake_user['rating'] = ratings
df_fake_user['movieId'] = df_fake_user['title'].apply(find_movie_id)
#df_ratings.userId.max()
df_fake_user['userId'] = 611
df_fake_user.sort_values('rating')


Unnamed: 0,title,rating,movieId,userId
3,"Silence of the Lambs, The (1991)",3,593,611
4,"Matrix, The (1999)",3,2571,611
5,Star Wars: Episode IV - A New Hope (1977),4,260,611
8,Terminator 2: Judgment Day (1991),4,589,611
6,Jurassic Park (1993),5,480,611
7,Braveheart (1995),6,110,611
9,Schindler's List (1993),7,527,611
2,Pulp Fiction (1994),8,296,611
1,"Shawshank Redemption, The (1994)",9,318,611
0,Forrest Gump (1994),10,356,611


In [24]:
df_fake_user = df_fake_user[['userId', 'movieId', 'rating']]
fake_predictions = svd.test(df_fake_user.values)
accuracy.rmse(fake_predictions) 
#that's a really bad score... Random guess might do better.
# Hyperparameter tuning should help, or maybe such a user would be really different from the rest of users

RMSE: 2.9202


2.9201539362120887

In [25]:
# TODO predict for all movies and get top K recommendations for year 2002

## Interprete

Let's plot the two first dimensions of the latent movie space to interprete the SVD a little bit.

In [26]:
# Access the latent item factors
# https://github.com/NicolasHug/Surprise/issues/119 latent user factors with algo.pu and item factors with algo.qi
svd.pu.shape, svd.qi.shape
# ie number of users in train set * number of factors, number of items * nb factors

((610, 100), (8731, 100))

In [27]:
# Recap in a DataFrame the first two factors and movie information
df_factors = pd.DataFrame()
df_factors['1st'] = svd.qi[:, 0]
df_factors['2nd'] = svd.qi[:, 1]
df_factors['uid'] = data_train.all_items() 
df_factors['movieId'] = df_factors['uid'].apply(data_train.to_raw_iid)
df_factors = df_factors.merge(df_movies, how="left")

In [28]:
# Plot first two dimensions of the latent movie space
# Good sign: on the first two dimensions, there doesn't seem to be a clear pattern with year
threshold_ratings = 100
df_factors_filtered = df_factors.loc[df_factors.nb_ratings >= threshold_ratings]

sizeref = 2. * max(df_factors_filtered['nb_ratings']) / (40 ** 2)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df_factors_filtered['1st'],
    y=df_factors_filtered['2nd'],
    mode='markers',
    text=df_factors_filtered['title'],
    marker=dict(
        color=df_factors_filtered['is_Comedy'],
        #opacity=ratings_scaled,
        size=df_factors_filtered['nb_ratings'],
        sizemode='area',
        sizeref=2. * max(df_factors_filtered['nb_ratings']) / (40. ** 2),
        sizemin=4
    ),
))
fig.show()

In [29]:
df_movies.sort_values('nb_ratings', ascending=False)[:10]

Unnamed: 0,movieId,title,genres,date,is_Action,is_Adventure,is_Animation,is_Children's,is_Comedy,is_Crime,...,is_Film-Noir,is_Horror,is_Musical,is_Mystery,is_Romance,is_Sci-Fi,is_Thriller,is_War,is_Western,nb_ratings
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1994.0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,329.0
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,1994.0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,317.0
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1994.0,0,0,0,0,1,1,...,0,0,0,0,0,0,1,0,0,307.0
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,1991.0,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,279.0
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1999.0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,278.0
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1977.0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,251.0
418,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,1993.0,1,1,0,0,0,0,...,0,0,0,0,0,1,1,0,0,238.0
97,110,Braveheart (1995),Action|Drama|War,1995.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,237.0
507,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,1991.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,224.0
461,527,Schindler's List (1993),Drama|War,1993.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,220.0
