In [1]:
# libraries

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics.pairwise import pairwise_distances
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


## Collaborative Filtering

Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences from many users (collaborating). The underlying assumption of the collaborative filtering approach is that if a person A has the same opinion as a person B on an issue, A is more likely to have B's opinion on a different issue than that of a randomly chosen person.

There are two types of collaborative filtering: user-based and item-based. User-based collaborative filtering is based on the similarity between users and item-based collaborative filtering is based on the similarity between items.

There are also two types of collaborative filtering algorithms: memory-based and model-based. Memory-based algorithms are based on statistical techniques and model-based algorithms are based on machine learning techniques. In this notebook, we will use model-based algorithms because they can scale with the number of users and items aligning with our goal of building a recommender system for a large user application.

## Item-based Collaborative Filtering

Let's start with an item-based collaborative filtering algorithm. Item-based collaborative filtering is often preferred over user-based collaborative filtering as it tend to perform better when there are many items and fewer users.

 We will calculate the similarity between items based on the ratings users have given to those items. We will use the cosine similarity to calculate the similarity between items. 



## Leo's Approach

In [2]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')

In [3]:
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24829828 entries, 0 to 24829827
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   userId            int64         
 1   movieId           int64         
 2   rating            Float64       
 3   timestamp         datetime64[ns]
 4   user_mean_rating  Float64       
 5   liked_by_user     boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 1.2 GB


In [4]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,user_mean_rating
count,24829830.0,24829830.0,24829828.0,24829828.0
mean,135069.2,16163.04,3.52898,3.52898
std,78166.41,31278.68,1.060177,0.467016
min,1.0,1.0,0.5,0.5
25%,67239.0,1088.0,3.0,3.252336
50%,135244.0,2664.0,3.5,3.550562
75%,202693.0,6708.0,4.0,3.837209
max,270896.0,176275.0,5.0,5.0


To safe computational time, we will use a subset of the data.

In [5]:
df_ratings['timestamp'] = pd.to_datetime(df_ratings['timestamp'])
df_ratings = df_ratings.sort_values('timestamp')
df_ratings_subset = df_ratings[df_ratings['timestamp'] > '2017-01-01']
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 0 entries
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   userId            0 non-null      int64         
 1   movieId           0 non-null      int64         
 2   rating            0 non-null      Float64       
 3   timestamp         0 non-null      datetime64[ns]
 4   user_mean_rating  0 non-null      Float64       
 5   liked_by_user     0 non-null      boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 0.0 bytes


In [17]:
# Train/Test Split
train_data, test_data = train_test_split(df_ratings_subset, test_size=0.2, random_state=42)

# User-Item Matrix for Training
user_item_matrix_train = train_data.pivot_table(index='userId', columns='movieId', values='rating')

# Item-Item Similarity Matrix
item_similarity = cosine_similarity(user_item_matrix_train.fillna(0).T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix_train.columns, columns=user_item_matrix_train.columns)

print(item_similarity_df)
print(item_similarity_df.info())


movieId    1         2         3         4         5         6         7       \
movieId                                                                         
1        1.000000  0.291622  0.104701  0.041914  0.137984  0.192833  0.083149   
2        0.291622  1.000000  0.137777  0.032426  0.167501  0.170788  0.118674   
3        0.104701  0.137777  1.000000  0.094457  0.272469  0.137315  0.150091   
4        0.041914  0.032426  0.094457  1.000000  0.072964  0.042718  0.024702   
5        0.137984  0.167501  0.272469  0.072964  1.000000  0.072802  0.205986   
...           ...       ...       ...       ...       ...       ...       ...   
176267   0.025938  0.033563  0.000000  0.000000  0.000000  0.000000  0.110065   
176269   0.000000  0.039157  0.000000  0.000000  0.000000  0.000000  0.000000   
176271   0.019454  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
176273   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
176275   0.000000  0.000000 

Let's use Singular Value Decomposition (SVD) on our user-item ratings matrix. SVD helps in extracting latent factors that explain observed ratings, efficiently reducing data dimensionality while preserving essential information. This significantly speeds up calculations, making the process of predicting ratings more efficient, especially when dealing with large dataset like ours. 

In [16]:
user_item_matrix_train = user_item_matrix_train.fillna(0).astype(float)
mean_user_rating = user_item_matrix_train.mean(axis=1)
R_demeaned = user_item_matrix_train.sub(mean_user_rating, axis='index')

R_demeaned_matrix = R_demeaned.values

# SVD 
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(R_demeaned_matrix, k=50)  

sigma_matrix = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma_matrix), Vt) + mean_user_rating.values.reshape(-1, 1)

preds_df = pd.DataFrame(all_user_predicted_ratings, index=user_item_matrix_train.index, columns=user_item_matrix_train.columns)

actual = []
predicted = []

for index, row in test_data.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    if user_id in preds_df.index and movie_id in preds_df.columns:
        actual_rating = row['rating']
        predicted_rating = preds_df.loc[user_id, movie_id]

        actual.append(actual_rating)
        predicted.append(predicted_rating)

rmse = np.sqrt(mean_squared_error(actual, predicted))
print(f'RMSE: {rmse}')


RMSE: 2.904032531436712


Given a 1-to-5 scale, an RMSE of 2.904 is relatively high, indicating that the predictions can be quite far off from the actual ratings. 

## Gilian's Approach

In [2]:
import pandas as pd
import numpy as np 
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [3]:
df_merged = pd.read_pickle('data/df_movies_cleaned.pkl')
df_ratings = pd.read_pickle('data/df_ratings_cleaned.pkl')

In [6]:
df_ratings_subset = df_ratings.sample(frac=0.01, random_state=42)
df_ratings_subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 248481 entries, 21920435 to 8204774
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   userId            248481 non-null  int64         
 1   movieId           248481 non-null  int64         
 2   rating            248481 non-null  Float64       
 3   timestamp         248481 non-null  datetime64[ns]
 4   user_mean_rating  248481 non-null  Float64       
 5   liked_by_user     248481 non-null  boolean       
dtypes: Float64(2), boolean(1), datetime64[ns](1), int64(2)
memory usage: 12.3 MB


In [8]:
reader = Reader()

# Prepare the data for Surprise
data = Dataset.load_from_df(df_ratings_subset[['userId', 'movieId', 'rating']], reader)

# Initialize the SVD algorithm
svd = SVD()

# Perform cross-validation
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9480  0.9484  0.9574  0.9484  0.9475  0.9499  0.0037  
MAE (testset)     0.7342  0.7328  0.7404  0.7340  0.7342  0.7351  0.0027  
Fit time          6.23    5.38    5.48    5.28    5.70    5.61    0.34    
Test time         0.76    0.69    0.66    0.62    0.70    0.69    0.05    


{'test_rmse': array([0.94795768, 0.94843935, 0.95740638, 0.94840534, 0.94750278]),
 'test_mae': array([0.7341601 , 0.73278977, 0.7404062 , 0.73400047, 0.73418453]),
 'fit_time': (6.232583999633789,
  5.377919912338257,
  5.476396322250366,
  5.275872468948364,
  5.703993320465088),
 'test_time': (0.7607684135437012,
  0.689399003982544,
  0.6592118740081787,
  0.6228635311126709,
  0.6965868473052979)}

In [11]:
# TODO GILIAN: Matrix geben lassen:


We get a mean Root Mean Sqaure Error of 0.95 approx which is more than good enough for our case. Let us now train on our dataset and arrive at predictions.

In [9]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1713438b790>

In [10]:
svd.predict(1, 302, 3)

Prediction(uid=1, iid=302, r_ui=3, est=3.610944654755088, details={'was_impossible': False})

In [None]:
# TODO: beispielhaft für einen User alles ausgeben lassen