# Colaborative Filtering Recommender System


## Surprise

With surprise Library, we will benchmark the following algorithms. We use "rmse" as our accuracy metric for the predictions

In [9]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, \
    KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering


### Preparing the dataset

In [10]:
ratings_df = pd.read_csv('./data/ratings_cleaned.csv')
ratings_df

Unnamed: 0,movieId,userId,rating
0,42594,0,6.0
1,42594,1,5.0
2,65891,2,7.0
3,11202,3,9.0
4,11202,4,9.0
...,...,...,...
11669,850165,11669,7.0
11670,823452,11670,7.0
11671,365620,11671,6.0
11672,365620,11672,6.0


In [11]:
movies_df = pd.read_csv('./data/movies.csv')
movies_df['title_year'] = movies_df['title'] + ' (' + movies_df['year'].astype(str) + ')'
movies_df

Unnamed: 0,movieId,title,cast_and_crew,year,poster_path,genre,title_year
0,42594,Scream and Scream Again,Gordon Hessler (director); Christopher Wicking...,1970,/49m9QJ2ubKuVtdDj9B7XqgCyriv.jpg,"Horror, Science Fiction",Scream and Scream Again (1970)
1,280133,Jenny,George Bloomfield (director/screenplay); Marti...,1970,/iaJ0PyRh17KoMcUlXxTVu6Vi85s.jpg,"Drama, Romance",Jenny (1970)
2,225155,The Adventures of Gerard,Jerzy Skolimowski (director/screenplay); Arthu...,1970,/aoOV7vWvGiNjaUTHCSIEAeHSvOw.jpg,"Adventure, Comedy, Drama",The Adventures of Gerard (1970)
3,85255,...tick...tick...tick...,Ralph Nelson (director); James Lee Barrett (sc...,1970,/rBNgytHXZEXgOl805pYWKiBQe7s.jpg,"Drama, Action",...tick...tick...tick... (1970)
4,117999,Last of the Mobile Hot Shots,Sidney Lumet (director); Gore Vidal (screenpla...,1970,/xj5Rub6H5B9yMh6GL3oomOM1SLg.jpg,Drama,Last of the Mobile Hot Shots (1970)
...,...,...,...,...,...,...,...
11500,979097,Memory,Michel Franco (director/screenplay); Jessica C...,2023,/cZgTA5ZOKOIRGyYClJMu02VUNcE.jpg,Drama,Memory (2023)
11501,558915,The Color Purple,"Blitz Bazawule (director), Marcus Gardley (scr...",2023,/3Jc93sCl0DqkePYjw47zHpqj7YS.jpg,Drama,The Color Purple (2023)
11502,823452,The Boys in the Boat,"George Clooney (director), Mark L. Smith (scre...",2023,/ncJMztHprw3gLRAnDjNnnT23CIt.jpg,"Drama, History",The Boys in the Boat (2023)
11503,365620,Ferrari,"Michael Mann (director), Troy Kennedy Martin (...",2023,/nNMoJMDCeF4Q5wpWvKuh5b8K2sX.jpg,"Drama, History",Ferrari (2023)


In [13]:
reader = Reader(line_format='user item rating', rating_scale=(0.5,10))

In [14]:
data = Dataset.load_from_df(df=ratings_df[['userId', 'movieId', 'rating']], reader=reader)

### Benchmarking

In [35]:
algo_list = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]
benchmark = []

for algo in algo_list:
    results = cross_validate(algo=algo, data=data, measures=["rmse"], cv=3, n_jobs=-1, verbose=False)
    algo_name = str(algo).split(' ')[0].split('.')[-1]
    print(algo_name)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp['Algorithm'] = algo_name
    benchmark.append(tmp)

SVD
SVDpp
SlopeOne
NMF
NormalPredictor
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
KNNBaseline
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
KNNBasic
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
KNNWithMeans
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Computing the msd similarity matrix...
Done computing similarity matrix.
Done computing similarity 

In [36]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,2.050737,0.181432,0.054385
SVD,2.05349,0.232434,0.03635
KNNBaseline,2.054142,1.813233,0.03685
BaselineOnly,2.057744,0.047688,0.039441
NMF,2.10332,1.140366,0.034578
KNNWithZScore,2.103476,2.439715,0.034581
KNNBasic,2.103563,1.5024,0.035521
KNNWithMeans,2.103637,1.912575,0.041369
CoClustering,2.103744,2.178326,0.048279
SlopeOne,2.103758,0.645761,0.041934


## Training and Prediction

Since the `SVDpp` algorithm yielded the best RMSE, we will utilize it for training and prediction using a simple grid search.

In [37]:
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [10, 15, 20, 25, 30], 
              'n_epochs': [25, 30, 35, 40], 
              'lr_all': [0.02, 0.03],
              'reg_all': [0.01, 0.02, 0.03],
              'random_state': [0]}

gs = GridSearchCV(algo_class=SVDpp, param_grid=param_grid, measures=['rmse'], cv=3, n_jobs=-1 )

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

2.035417048161671
{'n_factors': 10, 'n_epochs': 25, 'lr_all': 0.02, 'reg_all': 0.01, 'random_state': 0}


In [None]:
# 2.0321557585693815
# {'n_factors': 30, 'n_epochs': 40, 'lr_all': 0.03, 'reg_all': 0.02, 'random_state': 0}

You can now use these optimal hyperparameters to train your SVDpp model on the entire dataset and make predictions.

In [16]:
trainset = data.build_full_trainset()

best_parameters = {'n_factors': 30, 'n_epochs': 40, 'lr_all': 0.03, 'reg_all': 0.02, 'random_state': 0}
# best_parameters = gs.best_params['rmse']
# Create an SVD algorithm with the best hyperparameters
optimal_svd = SVDpp(**best_parameters)

# Train the algorithm on the training set
optimal_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fc7f655bc10>

In [39]:
trainset.all_items()

range(0, 5243)

In [40]:
USER_ID = 463 # raw
print(f"RAW USER: {USER_ID}")
inner_uid = optimal_svd.trainset.to_inner_uid(USER_ID) # inner
print(f"INNER USER: {inner_uid}")
movies_rated_inner = [int(item[0]) for item in optimal_svd.trainset.ur[inner_uid]] # inner
movies_rated = [int(optimal_svd.trainset.to_raw_iid(item)) for item in movies_rated_inner] # inner -> raw
list_rated = movies_df[movies_df['movieId'].isin(movies_rated)]['title'].tolist() # raw

print()
print("RATED:")
for e in list_rated[:10]:
    print(e)

unrated_movies_inner = [id for id in optimal_svd.trainset.all_items() if id not in movies_rated_inner] # inner
predictions = [optimal_svd.predict(uid=USER_ID, iid=optimal_svd.trainset.to_raw_iid(item)) for item in unrated_movies_inner] # raw -> raw

# Sort predictions by estimated rating in descending order
sorted_predictions = sorted(predictions, key=lambda x: x.est, reverse=True) # raw

# Predictions for USER
top_n = []
for _, iid, _, est, _ in sorted_predictions[:20]:
    top_n.append(int(iid))

list_movies = movies_df[movies_df['movieId'].isin(top_n)]['title'].tolist()

print()
print("RECOMMENDED:")
for e in list_movies:
    print(e)

RAW USER: 463
INNER USER: 463

RATED:
Jaws

RECOMMENDED:
Deliverance
Hickey & Boggs
Monty Python and the Holy Grail
The Princess Bride
Bert Rigby, You're a Fool
The Adventures of Baron Munchausen
Dead Bang
The Ballad of Little Jo
Forrest Gump
Ride with the Devil
The Lord of the Rings: The Two Towers
How to Lose a Guy in 10 Days
The Best Exotic Marigold Hotel
Nightcrawler
American Made
Coco
Alpha
Sound of Metal
Wish Dragon
Spider-Man: No Way Home


This result is not exceptional, likely due to the highly sparse nature of the dataset. However, what could we do next? An immediate solution is to explore content-based filtering instead of collaborative filtering.

## Content-Base Filtering

**Under Development**