## Install Libraries

In [1]:
%pip install scikit-surprise

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Import Libraries

In [2]:
from collections import defaultdict
from surprise import accuracy, Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
import pandas as pd


## Load Datasets

In [3]:
movies_df = pd.read_csv("ml-latest-small/movies.csv")
movies_df


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
reader = Reader(line_format="user item rating timestamp", sep=",", skip_lines=1)


In [5]:
data = Dataset.load_from_file("ml-latest-small/ratings.csv", reader=reader)


In [6]:
trainset, testset = train_test_split(data, test_size=0.2)


## Rating Prediction


In [7]:
algo = KNNBasic()


In [8]:
predictions = algo.fit(trainset).test(testset)


Computing the msd similarity matrix...
Done computing similarity matrix.


In [9]:
accuracy.rmse(predictions)


RMSE: 0.9482


0.9481597267316231

In [10]:
accuracy.mae(predictions)


MAE:  0.7265


0.7265135449161088

## Movie Recommendation


In [11]:
def get_top_n(user_id, predictions, movies_df, n=10):
    pred_df = pd.DataFrame(
        predictions, columns=["userId", "movieId", "r_ui", "est", "details"]
    )
    pred_df[["userId", "movieId"]] = pred_df[["userId", "movieId"]].astype(int)
    pred_df = pred_df[pred_df["userId"] == user_id]
    pred_df = pd.merge(pred_df, movies_df, on="movieId")
    top_n_df = pred_df.nlargest(n, "est").reset_index()

    return top_n_df[["movieId", "title", "genres"]]


In [12]:
# Predict ratings for all pairs (u, i) that are NOT in the training set.
anti_testset = trainset.build_anti_testset(fill=0)
predictions = algo.test(anti_testset)


In [13]:
top_n_df = get_top_n(123, predictions, movies_df, n=10)
top_n_df


Unnamed: 0,movieId,title,genres
0,3223,"Zed & Two Noughts, A (1985)",Drama
1,74226,"Dream of Light (a.k.a. Quince Tree Sun, The) (...",Documentary|Drama
2,4708,Marat/Sade (1966),Drama|Musical
3,124404,"Snowflake, the White Gorilla (2011)",Adventure|Animation|Children|Comedy
4,140265,George Carlin: Jammin' in New York (1992),Comedy
5,149566,Unicorn City (2012),Comedy|Romance
6,5890,Elling (2001),Comedy|Drama
7,59814,Ex Drummer (2007),Comedy|Crime|Drama|Horror
8,3771,The Golden Voyage of Sinbad (1973),Action|Adventure|Fantasy
9,6086,"I, the Jury (1982)",Crime|Drama|Thriller


In [14]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)

    return precision, recall


In [15]:
predictions = algo.test(testset)
precision, recall = precision_recall_at_k(predictions, k=10, threshold=4)
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-measure: {f1_score}")


Precision: 0.6774817850637523
Recall: 0.32808949972368945
F-measure: 0.4420863310162305
