In [3]:
#!pip install surprise
#!pip install pandas scikit-surprise

import heapq
import math

import pandas as pd
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [5]:
# Load the built-in 'ml-100k' ratings dataset
data = Dataset.load_builtin(name='ml-100k')

# Mirror the raw rating tuples in a DataFrame, one labelled column per field,
# so they can be inspected with pandas
df = pd.DataFrame(
    data=data.raw_ratings,
    columns=["user", "item", "rating", "timestamp"],
)

# Preview the first five rows
print(df.head(n=5))

  user item  rating  timestamp
0  196  242     3.0  881250949
1  186  302     3.0  891717742
2   22  377     1.0  878887116
3  244   51     2.0  880606923
4  166  346     1.0  886397596


We see the following columns:

* **user:** ID of the user who provided the rating

* **item:** ID of the rated movie

* **rating:** The rating the user gave the movie (1–5)

* **timestamp:** The time at which the rating was provided (Unix time, in seconds)



In [6]:
# Count missing entries per column (isna is the same check as isnull)
print(df.isna().sum())

user         0
item         0
rating       0
timestamp    0
dtype: int64


We see that there are no missing values.

In [7]:
# Search space for SVD: each key is a constructor argument, each list holds
# the values to try for it
param_grid = dict(
    n_factors=[50, 100, 150],
    n_epochs=[20, 30],
    lr_all=[0.005, 0.010],
    reg_all=[0.02, 0.1],
)

In [8]:
# Grid search over param_grid; each candidate is scored on RMSE and MAE
# using 3-fold cross-validation on the full dataset
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['RMSE', 'MAE'],
    cv=3,
)
gs.fit(data)

In [9]:
# Report the lowest cross-validated RMSE and the parameter combination behind it
print(
    f"Best RMSE: {gs.best_score['rmse']}",
    f"Best parameters: {gs.best_params['rmse']}",
    sep="\n",
)

Best RMSE: 0.9210106114794548
Best parameters: {'n_factors': 100, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}


In [10]:
# Select and use the best model based on optimal RMSE
# `best_estimator` is indexed by measure name; this takes the entry for 'rmse'.
# NOTE(review): presumably this instance is not fitted yet (it is fitted on the
# trainset in a later cell) — confirm against the GridSearchCV `refit` setting.
model = gs.best_estimator['rmse']

In [12]:
# Train and test split (75%-25%). The fixed random_state makes the split
# identical on every run, so the RMSE below is comparable between runs.
# NOTE(review): the grid search was fitted on all of `data`, so these test
# ratings already influenced the choice of hyperparameters; treat the RMSE
# below as optimistic. For an unbiased estimate, split before tuning.
# NOTE(review): model fitting itself may still be stochastic — confirm whether
# the estimator needs its own seed for fully repeatable numbers.
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Fit on the trainset
model.fit(trainset)

# Predict on the testset
pred = model.test(testset)

# RMSE of the predictions
accuracy.rmse(pred)

RMSE: 0.9125


0.9125467331939697

In [13]:
# Estimate the rating one particular user would give one particular item
# (both ids are given as strings)
user_id, item_id = '165', '122'
predicted_rating = model.predict(user_id, item_id)
print(f"Predicted rating for user {user_id} and item {item_id}: {predicted_rating.est}")

Predicted rating for user 165 and item 122: 2.5610470274707287


In [34]:
# Movie recommendation function based on predicted ratings

def recommend_movies(model, trainset, user_id, num_recommendations = 5):
    """Print the highest-predicted items that a user has not rated yet.

    Parameters
    ----------
    model
        Estimator used for scoring. Only ``model.predict(user_id, item_id)``
        is called, and each returned object must expose ``.iid`` and ``.est``.
    trainset
        Source of the id mappings and rating history. The function uses
        ``to_inner_uid``, ``ur``, ``all_items`` and ``to_raw_iid``.
    user_id
        Raw user id, in the form ``trainset.to_inner_uid`` accepts.
    num_recommendations : int, default 5
        Maximum number of items to print. Zero or a negative value prints
        only the header line.

    Returns
    -------
    None
        The recommendations are written to stdout, best first.

    Raises
    ------
    Any exception raised by ``trainset.to_inner_uid`` for an id it does not
    know is propagated unchanged.
    """
    # The trainset indexes ratings by inner id, so translate the raw id first
    inner_user_id = trainset.to_inner_uid(user_id)

    # Inner ids of every item this user has already rated
    rated_items = {iid for (iid, _) in trainset.ur[inner_user_id]}

    # Candidates are all remaining items, converted back to raw ids because
    # model.predict() is called with raw ids below
    unrated_movie_ids = [
        trainset.to_raw_iid(iid)
        for iid in trainset.all_items()
        if iid not in rated_items
    ]

    # Predict ratings for all movies that the user has not rated yet
    predictions = [model.predict(user_id, movie_id) for movie_id in unrated_movie_ids]

    # Keep only the N best predictions, highest estimate first. Ties keep
    # their original order, exactly as a stable descending sort would.
    # Unlike slicing a sorted list with `[:N]`, a negative N yields nothing
    # instead of "everything except the last |N| items".
    top_recommendations = heapq.nlargest(
        num_recommendations, predictions, key=lambda p: p.est
    )

    # Print the results in a list format, one under the other
    print(f"Recommended Movies for user {user_id}:")
    for pred in top_recommendations:
        print(f"{pred.iid} (Estimated Rating: {pred.est:.2f})")

In [35]:
# Recommend top 5 movies for a user of your choice
# Raw user ids are strings, so the typed value is passed on as text. Stray
# leading/trailing whitespace is stripped first, because it would make the
# id lookup inside recommend_movies fail.
user_id = input("User ID: ").strip()
recommend_movies(model, trainset, user_id)

134
Recommended Movies for user 134:
1449 (Estimated Rating: 4.78)
318 (Estimated Rating: 4.64)
64 (Estimated Rating: 4.61)
483 (Estimated Rating: 4.49)
169 (Estimated Rating: 4.47)
