In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
# we are using Surprise library which deals with explicit rating data
from surprise import Reader
from surprise import KNNWithMeans
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise import accuracy


In [10]:
ratings = pd.read_csv("ratings.csv")

In [11]:
print(ratings.head(5))
ratings.isnull().any()
#ratings dataset does not have any missing values


   userId  movieId  rating   timestamp
0       1      110     1.0  1425941529
1       1      147     4.5  1425942435
2       1      858     5.0  1425941523
3       1     1221     5.0  1425941546
4       1     1246     5.0  1425941556


userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [12]:
print(ratings.info()) 
#userID and movieID column values are all int64 type and ratings are float64 type
print(ratings.describe())
#ratings range from 0.5 to 5.0
print(ratings.nunique())
#there are 270,896 users and 45,115 different movies in this dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB
None
             userId       movieId        rating     timestamp
count  2.602429e+07  2.602429e+07  2.602429e+07  2.602429e+07
mean   1.350371e+05  1.584911e+04  3.528090e+00  1.171258e+09
std    7.817620e+04  3.108526e+04  1.065443e+00  2.052889e+08
min    1.000000e+00  1.000000e+00  5.000000e-01  7.896520e+08
25%    6.716400e+04  1.073000e+03  3.000000e+00  9.907545e+08
50%    1.351630e+05  2.583000e+03  3.500000e+00  1.151716e+09
75%    2.026930e+05  6.503000e+03  4.000000e+00  1.357578e+09
max    2.708960e+05  1.762750e+05  5.000000e+00  1.501830e+09
userId         270896
movieId         45115
rating             10
timestamp    20549435
dtype: int64


Since the rating dataset we have is clean enough for our purpose, we will proceed without much data cleaning and pre-processing.

In [13]:
ratings = ratings[:10000] #the dataset is too large and the code is running too slow, trying out the small subset

In [14]:
# Set up Reader object and load dataset into Surprise's Dataset object

reader = Reader(rating_scale=(0.5, 5))
rating_reader = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [15]:
# Split dataset into training and testing datasets
trainset, testset = train_test_split(rating_reader, test_size=0.2)

User-based collaborative filtering

In [16]:
# Train algorithm using cosine similarity and user-based approach
sim_options_user_based = {'name': 'cosine', 'user_based': True}
algo_user_based = KNNWithMeans(sim_options=sim_options_user_based)
algo_user_based.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7faa70c72d30>

In [17]:
# Predict rating for a specific user and movie (example)
user_id = 42 #random user
movie_id = 123 #random movie
predicted_rating = algo_user_based.predict(user_id, movie_id).est
print(f'Predicted rating for user {user_id} and movie {movie_id}: {predicted_rating}')

Predicted rating for user 42 and movie 123: 4.384852216748769


In [18]:
def recommend_movies(user_id, k=10, n=10):
    # Get k most similar users to given user
    similar_users = algo_user_based.get_neighbors(user_id, k=k)

    # Get movies rated highly by similar users that given user hasn't rated
    movies = {}
    for user in similar_users:
        for movie in ratings[ratings['userId'] == user]['movieId'].values:
            if movie not in ratings[ratings['userId'] == user_id]['movieId'].values:
                rating = ratings[(ratings['userId'] == user) & (ratings['movieId'] == movie)]['rating'].values[0]
                if movie in movies:
                    movies[movie]['rating'] += rating
                    movies[movie]['count'] += 1
                else:
                    movies[movie] = {'rating': rating, 'count': 1}

    # Sort recommended movies by average rating
    sorted_movies = sorted(movies.items(), key=lambda x: x[1]['rating']/x[1]['count'], reverse=True)

    # Return top n recommended movies
    return [movie[0] for movie in sorted_movies[:n]]

In [19]:
# Recommend 5 movies to a specific user (example)
user_id = 8
k = 10
n = 5
recommended_movies = recommend_movies(user_id, k, n)
print(f'{n} recommended movies for user {user_id}: {recommended_movies}')

5 recommended movies for user 8: [342, 1059, 2324, 2396, 4027]


Evaluating the performance

In [20]:
# Set up KFold cross-validation
kf = KFold(n_splits=5)

# Evaluate algorithm using RMSE metric
for trainset, testset in kf.split(rating_reader):
    algo_user_based.fit(trainset)
    predictions = algo_user_based.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(f'RMSE: {rmse}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0054
RMSE: 1.005350411971557
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0004
RMSE: 1.0003989971299625
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0136
RMSE: 1.013639590455153
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0014
RMSE: 1.00139951720365
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0071
RMSE: 1.0071343169758318


Tuning the parameters

In [23]:
# Set up parameter grid for grid search
param_grid = {'k': [5, 10, 15, 20], 'sim_options': {'name': ['cosine', 'pearson'], 'user_based': [False, True]}}

# Perform grid search
gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
gs.fit(rating_reader)

# Print best RMSE score and parameters
print(f'Best RMSE score: {gs.best_score["rmse"]}')
print(f'Best parameters: {gs.best_params["rmse"]}')

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing