In [3]:
# libraries

import pandas as pd
import numpy as np 
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from surprise.model_selection import cross_validate
from surprise import SVD, Dataset, Reader
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from sklearn.model_selection import ParameterGrid

In [ ]:
!pip install surprise

## Collaborative Filtering

Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences from many users (collaborating). The underlying assumption of the collaborative filtering approach is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.

There are two types of collaborative filtering: user-based and item-based. User-based collaborative filtering is based on the similarity between users and item-based collaborative filtering is based on the similarity between items. For our recommender system we chose an item-based approach. The reasons for that are many. Item-based collaborative filtering is often preferred over user-based collaborative filtering, particularly in environments where the item catalog is relatively stable and doesn't grow as quickly as the user base. Item-based systems have a better scalability and efficiency, especially with large user bases. Unlike user preferences, which can change rapidly and complicate similarity calculations, the characteristics of movies remain constant, making it easier to calculate and store the item similarities as their relationship are stable. An item-based approach sidesteps the complexity and computational demand of constantly updating user similarities, making it a more straightforward choice for delivering recommendations also for new users and less popular items.



### Item-based Collaborative Filtering

To build an item-based collaborative filtering system, we need to calculate the similarity between items based on the ratings users have given to those items. We will use the cosine similarity to calculate the similarity between items. 



In [4]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')



In [5]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24848104 entries, 0 to 24848103
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userId            int64         
 1   movieId           int64         
 2   rating            Float64       
 3   timestamp         datetime64[ns]
 4   user_mean_rating  Float64       
 5   liked_by_user     boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 1.0 GB


In [6]:
df_ratings = df_ratings.drop(columns=['user_mean_rating', 'liked_by_user'])

In [7]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,24848100.0,24848100.0,24848104.0,24848104
mean,135003.6,16211.73,3.528737,2007-02-21 23:13:44.095162112
min,1.0,1.0,0.5,1995-01-09 11:46:44
25%,67126.0,1088.0,3.0,2001-06-09 05:22:35
50%,135134.0,2670.0,3.5,2006-06-16 19:53:25.500000
75%,202642.0,6711.0,4.0,2013-02-19 17:38:24.249999872
max,270896.0,176275.0,5.0,2017-08-04 06:57:50
std,78175.12,31358.02,1.060048,


To safe computational time, we will use a subset of the data. We will only use ratings from 2016 onwards. A final implementation could use the entire dataset to improve accuracy.

In [8]:
#df_ratings['timestamp'] = pd.to_datetime(df_ratings['timestamp'])
df_ratings = df_ratings.sort_values('timestamp')
df_ratings_subset = df_ratings[df_ratings['timestamp'] > '2016-01-01']
df_ratings_subset = df_ratings_subset.drop(columns=['timestamp'])
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3321925 entries, 1319144 to 16502857
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   Float64
dtypes: Float64(1), int64(2)
memory usage: 104.5 MB


In [9]:
# memory usage of subset / original
(df_ratings_subset.memory_usage() / df_ratings.memory_usage()) * 100

Index        13.368927
movieId      13.368927
rating       13.368927
timestamp          NaN
userId       13.368927
dtype: float64

For the train/test split we will split the data historically. Temporal splitting ensures that the training data contains information from the past, and the test data contains information from the future. This reflects a real-world scenario better, where the system is trained on historical data and evaluated on more recent/future data to assess its performance. We also tried a random split that resulted in better RMSE values than the temporal split. However, we decided to use the temporal split for the sake of a more realistic approach and to align with industry standards. As a model deployment is not possible we can ensure a better real-world performance by that, at this stage.  

We will use 80% of the data for training and 20% for testing.

In [10]:
# Temporal Train/Test Split
split_index = int(len(df_ratings_subset) * 0.8)

train_data = df_ratings_subset[:split_index]
test_data = df_ratings_subset[split_index:]

We will now create a similarity matrix. The matrix will contain the similarity between each pair of items. We will use the cosine similarity to calculate the similarity between items.

In [11]:
# User-Item Matrix for Training
user_item_matrix_train = train_data.pivot_table(index='userId', columns='movieId', values='rating')

# Item-Item Similarity Matrix
item_similarity = cosine_similarity(user_item_matrix_train.fillna(0).T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix_train.columns, columns=user_item_matrix_train.columns)

print(item_similarity_df)
print(item_similarity_df.info())


movieId    1         2         3         4         5         6         7       \
movieId                                                                         
1        1.000000  0.379882  0.132247  0.033639  0.164564  0.237477  0.122782   
2        0.379882  1.000000  0.177338  0.053671  0.175422  0.191025  0.113127   
3        0.132247  0.177338  1.000000  0.048691  0.283075  0.120667  0.173906   
4        0.033639  0.053671  0.048691  1.000000  0.114062  0.031977  0.036082   
5        0.164564  0.175422  0.283075  0.114062  1.000000  0.109077  0.268335   
...           ...       ...       ...       ...       ...       ...       ...   
170747   0.010918  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
170749   0.000000  0.000000  0.064965  0.000000  0.000000  0.000000  0.000000   
170751   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
170753   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
170755   0.000000  0.020036 

Looking at our output we encounter a first problem: the matrix size. By only using roughly 13 % of the original data (calculated in memory usage) we end up with a similarity matrix of almost 10 GB in size. This is not feasible for our use case. Consequently, we will implement a Singular Value Decomposition (SVD) to reduce the dimensionality of the matrix.

### Singular Value Decomposition (SVD)


SVD helps in extracting latent factors that explain observed ratings, efficiently reducing data dimensionality while preserving essential information. This significantly speeds up calculations, making the process of predicting ratings more efficient, especially when dealing with a large dataset like ours. Additionally, by focusing on these latent factors, SVD enables a deeper understanding of user preferences and item characteristics, promising more personalized and accurate recommendations.

In [12]:
# create matrix
user_item_matrix_train = train_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)
user_item_matrix_sparse = csr_matrix(user_item_matrix_train.values.astype(float))

# mean centering
mean_user_rating = user_item_matrix_sparse.mean(axis=1)
user_item_matrix_centered = user_item_matrix_sparse - mean_user_rating

# SVD 
U, sigma, Vt = svds(user_item_matrix_centered, k=50) # k selected manually at this stage
sigma_matrix = np.diag(sigma)

# Predict ratings for all users
all_user_predicted_ratings = np.dot(np.dot(U, sigma_matrix), Vt) + mean_user_rating.A1.reshape(-1, 1)

# Create a DataFrame with the predicted ratings
preds_df = pd.DataFrame(all_user_predicted_ratings, index=user_item_matrix_train.index, columns=user_item_matrix_train.columns)

# Predict ratings for the test set
def safe_get_prediction(row):
    try:
        return preds_df.loc[row['userId'], row['movieId']]
    except KeyError:
        return np.nan

test_data['predicted'] = test_data.apply(safe_get_prediction, axis=1)

# filter only rows where we have a prediction
filtered_test_data = test_data.dropna(subset=['predicted'])

# RMSE 
rmse = sqrt(mean_squared_error(filtered_test_data['rating'], filtered_test_data['predicted']))
print(f'RMSE: {rmse}')


RMSE: 3.1650502206124087


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted'] = test_data.apply(safe_get_prediction, axis=1)


Given a 1-to-5 scale, an RMSE of 3.165 is quite high, indicating that the predictions can be quite far off from the actual ratings. Let's try to improve our model.

For that, we will use the surprise library. Surprise automatically handles normalization and scaling of the data as well as the handling of cold start and sparsity issues.


In [13]:
reader = Reader()
data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

svd = SVD()

# Fit the model 
svd.fit(data.build_full_trainset())

# Predict ratings for the test set
testset = list(zip(test_data['userId'].values, test_data['movieId'].values, test_data['rating'].values))
predictions = svd.test(testset)

print(accuracy.rmse(predictions))
print(accuracy.mae(predictions))

RMSE: 0.9694
0.9693988323484404
MAE:  0.7421
0.7421251681528029


We can see a major improvement of the metrics when using the Surprise library compared to our previous approach!

A Root Mean Square Error (RMSE) of approximately 0.9694 suggests that, on average, our predicted ratings deviate from the actual ratings by around 0.97 units on a scale of 1 to 5. Without considering their direction, they deviate around around 0.7420 units (MAE). We consider this level of error as moderate to good. 


Let's also compute the RMSE and MAE with a random split for illustrative purposes before fine tuning the model on a temporal split.

In [14]:
reader_random = Reader()

data_random_split = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader_random)

svd_random = SVD()

cross_validate(svd_random, data_random_split, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7848  0.7854  0.7840  0.7847  0.7846  0.7847  0.0005  
MAE (testset)     0.5841  0.5849  0.5834  0.5838  0.5838  0.5840  0.0005  
Fit time          18.08   18.23   18.77   18.17   18.26   18.30   0.24    
Test time         2.99    4.10    2.73    2.45    2.66    2.99    0.58    


{'test_rmse': array([0.78475205, 0.78539078, 0.78395027, 0.78474373, 0.78458848]),
 'test_mae': array([0.58412186, 0.58493104, 0.58343147, 0.58379567, 0.58384229]),
 'fit_time': (18.07552695274353,
  18.23200798034668,
  18.76857304573059,
  18.16711926460266,
  18.25535297393799),
 'test_time': (2.991689920425415,
  4.100869178771973,
  2.7325620651245117,
  2.454019784927368,
  2.6558640003204346)}

The superior performance of the random split (model RMSE 0.7847) suggests that it may offer a more balanced and varied dataset for both training and testing phases, potentially leading to a model that is better at generalizing across the entire dataset. 

Yet, as already mentioned for a real-world recommender systems, a temporal split is often preferred to account for evolving preferences and trends over time. For a movie recommender system, especially one like DreamStream that might experience frequent updates to its movie catalog and shifts in user preferences, we  suggest a temporal split. This approach acknowledges the evolving nature of both movies and user tastes, preparing the system to adapt to real-world scenarios more effectively. It also allows the system to better handle cold start problems with new releases. 

Let's get back to our temporal split and try to optimize our model using a GridSearch to find the best combination of hyperparameter for the model. 

In [15]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()
testset = list(zip(test_data['userId'].values, test_data['movieId'].values, test_data['rating'].values))

# our grid of parameters
param_grid = {'n_factors': [50, 100],  # Number of factors
              'n_epochs': [20],         # Number of iterations
              'lr_all': [0.005, 0.01],      # Learning rate
              'reg_all': [0.02, 0.05]}      # Regularization term

svd = SVD()

best_rmse = float('inf')
best_params = None

# Loop through parameter combinations
for params in ParameterGrid(param_grid):
    svd = SVD(**params)
    svd.fit(trainset)


    predictions = svd.test(testset)

    # RMSE
    rmse = accuracy.rmse(predictions)

    # Update best RMSE and parameters if necessary
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = params

print("Best RMSE score obtained: ", best_rmse)
print("Best parameters: ", best_params)




RMSE: 0.9688
RMSE: 0.9688
RMSE: 0.9692
RMSE: 0.9687
RMSE: 0.9719
RMSE: 0.9701
RMSE: 0.9725
RMSE: 0.9699
Best RMSE score obtained:  0.9687174189065906
Best parameters:  {'lr_all': 0.005, 'n_epochs': 20, 'n_factors': 100, 'reg_all': 0.05}


The best RMSE score obtained is 0.9687 with the following parameters: 

lr_all: 0.005, n_epochs: 20, n_factors: 100, reg_all: 0.05

This is a slightly  better RMSE score as we obtained with the default parameters (RMSE 0.9698). With higher computational power and time, we could further optimize the model by testing more hyperparameters and combinations. At this stage we will stick with the selected parameters from our GridSearch.

Let us now train the best version of our model on the full subset and predict the top ten recommendations for a selected user.

In [18]:
svd = SVD(**best_params)
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0xbc4baecd0>

Prediction for user: 14204

In [19]:
selected_user_id = 14204
rated_movie_ids = df_ratings_subset[df_ratings_subset['userId'] == selected_user_id]['movieId'].unique()
all_movie_ids = df_ratings_subset['movieId'].unique()

# Predict ratings for all movies that the selected user has not rated yet
predicted_unrated_movies = []
for movie_id in all_movie_ids:
    if movie_id not in rated_movie_ids:
        prediction = svd.predict(uid=selected_user_id, iid=movie_id)
        predicted_unrated_movies.append((movie_id, prediction.est))

# sorting
sorted_predicted_unrated_movies = sorted(predicted_unrated_movies, key=lambda x: x[1], reverse=True)
top_10_unrated_movies = sorted_predicted_unrated_movies[:10]

# Top 10 predicted ratings for the selected user
print(f"Top 10 recommended movies for User {selected_user_id}:")
for i, (movie_id, predicted_rating) in enumerate(top_10_unrated_movies, start=1):
    print(f"Rank {i}: Movie ID {movie_id}, Predicted Rating: {predicted_rating}")


Top 10 recommended movies for User 14204:
Rank 1: Movie ID 93040, Predicted Rating: 4.209856235490476
Rank 2: Movie ID 137904, Predicted Rating: 4.120377594522206
Rank 3: Movie ID 159817, Predicted Rating: 4.104339770487309
Rank 4: Movie ID 8484, Predicted Rating: 4.071723551405211
Rank 5: Movie ID 3677, Predicted Rating: 4.056649558591241
Rank 6: Movie ID 97673, Predicted Rating: 4.046851602150687
Rank 7: Movie ID 105250, Predicted Rating: 4.038531369323463
Rank 8: Movie ID 82143, Predicted Rating: 4.027010837440242
Rank 9: Movie ID 69830, Predicted Rating: 4.01387029733196
Rank 10: Movie ID 54229, Predicted Rating: 4.011497108180964
