In [1]:
# libraries

import pandas as pd
import numpy as np 
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from surprise.model_selection import cross_validate
from surprise import SVD, Dataset, Reader
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from sklearn.model_selection import ParameterGrid

In [2]:
!pip install surprise



## Collaborative Filtering

Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences from many users (collaborating). The underlying assumption of the collaborative filtering approach is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.

There are two types of collaborative filtering: user-based and item-based. User-based collaborative filtering is based on the similarity between users and item-based collaborative filtering is based on the similarity between items. For our recommender system we chose an item-based approach. The reasons for that are many. Item-based collaborative filtering is often preferred over user-based collaborative filtering, particularly in environments where the item catalog is relatively stable and doesn't grow as quickly as the user base. Item-based systems have a better scalability and efficiency, especially with large user bases.



### Recapt Part 1: Item-based Collaborative Filtering

To build an item-based collaborative filtering system, we need to calculate the similarity between items based on the ratings users have given to those items. We will use the cosine similarity to calculate the similarity between items. 



In [12]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_3M.pkl')



In [13]:
df_ratings = df_ratings.drop(columns=['user_mean_rating', 'liked_by_user'])

In [23]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000001 entries, 11800835 to 7900459
Data columns (total 5 columns):
 #   Column           Dtype         
---  ------           -----         
 0   userId           int64         
 1   movieId          int64         
 2   rating           Float64       
 3   timestamp        datetime64[ns]
 4   rating_category  category      
dtypes: Float64(1), category(1), datetime64[ns](1), int64(2)
memory usage: 120.2 MB


In [14]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,3000001.0,3000001.0,3000001.0,3000001
mean,135100.7,16159.02,3.528979,2007-02-18 20:57:45.565002496
min,1.0,1.0,0.5,1996-01-29 00:00:00
25%,67352.0,1088.0,3.0,2001-06-07 13:51:18
50%,135309.0,2657.0,3.5,2006-06-12 15:55:57
75%,202713.0,6707.0,4.0,2013-02-11 17:27:34
max,270896.0,176267.0,5.0,2017-08-04 06:57:50
std,78146.7,31271.31,1.060178,


For the train/test split we will split the data randomly. We will use 80% of the data for training and 20% for testing.

In [15]:
from sklearn.model_selection import train_test_split
# randome Train/Test Split

train_data, test_data = train_test_split(df_ratings, test_size=0.2, random_state=42)


### Singular Value Decomposition (SVD)

We will use Singular Value Decomposition (SVD) to create the collaborative filtering model. SVD is a matrix factorization technique that is commonly used for recommendation systems. It decomposes the user-item interaction matrix into different matrices. SVD helps in extracting latent factors that explain observed ratings, efficiently reducing data dimensionality while preserving essential information. This significantly speeds up calculations, making the process of predicting ratings more efficient, especially when dealing with a large dataset like ours. Additionally, by focusing on these latent factors, SVD enables a deeper understanding of user preferences and item characteristics, promising more personalized and accurate recommendations.

Do compute SVD, we will use the surprise library. Surprise automatically handles normalization and scaling of the data as well as the handling of cold start and sparsity issues.


Let's also use a GridSearch to find the best combination of hyperparameter for the model. 

In [18]:
reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(train_data[['userId', 'movieId', 'rating']], reader)

trainset = data.build_full_trainset()
testset = list(zip(test_data['userId'].values, test_data['movieId'].values, test_data['rating'].values))

# our grid of parameters
param_grid = {'n_factors': [50, 100],  # Number of factors
              'n_epochs': [20],         # Number of iterations
              'lr_all': [0.005, 0.01],      # Learning rate
              'reg_all': [0.02, 0.05]}      # Regularization term

svd = SVD()

best_rmse = float('inf')
best_params = None

# Loop through parameter combinations
for params in ParameterGrid(param_grid):
    svd = SVD(**params)
    svd.fit(trainset)

    predictions = svd.test(testset)

    # RMSE
    rmse = accuracy.rmse(predictions)

    # Update best RMSE and parameters if necessary
    if rmse < best_rmse:
        best_rmse = rmse
        best_params = params

print("Best RMSE score obtained: ", best_rmse)
print("Best parameters: ", best_params)

RMSE: 0.8913
RMSE: 0.8829
RMSE: 0.8978
RMSE: 0.8840
RMSE: 0.9182
RMSE: 0.8831
RMSE: 0.9193
RMSE: 0.8844
Best RMSE score obtained:  0.8828869256424875
Best parameters:  {'lr_all': 0.005, 'n_epochs': 20, 'n_factors': 50, 'reg_all': 0.05}


The best RMSE score obtained is 0.88 with the following parameters: 

lr_all: 0.005, n_epochs: 20, n_factors: 50, reg_all: 0.05

Let us now train the best version of our model and evaluate it on the test set with precision as metric.

In [26]:
best_model = SVD(**best_params)
best_model.fit(trainset)

# Test the final model
final_predictions = best_model.test(testset)

# Function to calculate precision metric
def calculate_precision(predictions, threshold=3.5):
    true_positives = 0
    predicted_positives = 0

    for uid, iid, true_r, est, _ in predictions:
        if est >= threshold:
            predicted_positives += 1
            if true_r >= threshold:
                true_positives += 1

    if predicted_positives == 0:
        return 0

    precision = true_positives / predicted_positives
    return precision

# Calculate precision
precision = calculate_precision(final_predictions, threshold=4.0)
print(f"Precision: {precision:.4f}")

Precision: 0.8119
