In [None]:
import os
import sys

# Directory two levels above the current working directory; it is put on the
# import path so that modules living there can be imported by later cells.
MAIN_MODULE_PATH = os.path.join(os.getcwd(), '..', '..')
# Guard the append so that re-running this cell does not stack duplicate entries.
if MAIN_MODULE_PATH not in sys.path:
    sys.path.append(MAIN_MODULE_PATH)

In [None]:
from collections import Counter

import numpy as np
import scipy
import pandas as pd
import seaborn as sns
import lightfm.data
import lightfm.cross_validation
import lightfm.evaluation

from defaults import BOOK_RATINGS, BOOKS, USERS

In [None]:
# Seed passed to the stochastic steps of this notebook (the train/test split)
# so that repeated runs produce the same partition.
RANDOM_STATE = 1234

# Preparing data

In [None]:
def load_ratings(path=BOOK_RATINGS):
    """Load the book-ratings table from a ';'-separated CSV file.

    Parameters
    ----------
    path : str or path-like
        Location of the ratings CSV. Defaults to ``BOOK_RATINGS``.

    Returns
    -------
    pd.DataFrame
        The ratings table with the ``Book-Rating`` column stored as ``int8``.
    """
    # Honor the caller-supplied path rather than always reading the default file.
    ratings = pd.read_csv(path, sep=';')
    ratings['Book-Rating'] = ratings['Book-Rating'].astype('int8')
    return ratings

def load_books(path=BOOKS):
    """Load the books table from a ';'-separated CSV file, indexed by ``ISBN``.

    Malformed CSV lines are skipped (with a warning) instead of aborting the
    load. Rows whose ``Year-Of-Publication`` cannot be parsed as a number are
    dropped, and the remaining years are stored as ``int16``.

    Parameters
    ----------
    path : str or path-like
        Location of the books CSV. Defaults to ``BOOKS``.

    Returns
    -------
    pd.DataFrame
        Books indexed by ``ISBN`` with an integer ``Year-Of-Publication``.
    """
    read_kwargs = dict(sep=';', index_col='ISBN')
    try:
        # `on_bad_lines` (pandas >= 1.3) replaces `error_bad_lines`, which was
        # removed in pandas 2.0. 'warn' keeps the skip-and-warn behavior.
        books = pd.read_csv(path, on_bad_lines='warn', **read_kwargs)
    except TypeError:
        # Older pandas releases only understand the legacy flag.
        books = pd.read_csv(path, error_bad_lines=False, **read_kwargs)

    years = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
    has_year = years.notna()
    # A boolean mask (rather than a label lookup) stays correct even if ISBN
    # labels repeat; copy so the column assignment below targets a real frame.
    books = books.loc[has_year].copy()
    # int8 only spans -128..127 and cannot represent four-digit years; int16 can.
    # Assign positionally so index alignment cannot interfere.
    books['Year-Of-Publication'] = years.loc[has_year].astype('int16').to_numpy()
    return books

def load_users(path=USERS):
    """Read the ';'-separated users CSV at ``path`` into a frame indexed by ``User-ID``."""
    users = pd.read_csv(path, index_col='User-ID', sep=';')
    return users

In [None]:
# A book is kept only when it has strictly more than this many ratings.
MIN_BOOK_RATINGS = 20

def preprocess_ratings(ratings: pd.DataFrame, books: pd.DataFrame, users: pd.DataFrame,
                   min_book_ratings: int = MIN_BOOK_RATINGS):
    """Restrict the ratings to books that have been rated often enough.

    Parameters
    ----------
    ratings : pd.DataFrame
        Ratings table with at least the ``ISBN`` and ``Book-Rating`` columns.
    books : pd.DataFrame
        Books table indexed by ISBN. It is NOT modified by this function.
    users : pd.DataFrame
        Accepted for interface symmetry; currently unused.
    min_book_ratings : int
        Popularity threshold. Only books with strictly more than this many
        ratings are kept.

    Returns
    -------
    pd.DataFrame
        One row per rating of a popular book, carrying the book's columns, an
        ``n_ratings`` column with the book's rating count, and the columns of
        ``ratings``.
    """
    books_ratings_joined = pd.merge(ratings, books, left_on='ISBN', right_index=True, how='left')
    ratings_per_book = books_ratings_joined.groupby('ISBN')['Book-Rating'].size()

    # Attach the counts to a new frame instead of writing a column into the
    # caller's `books`, so the function has no side effects on its inputs.
    books_with_counts = books.assign(n_ratings=ratings_per_book)
    # Books without any rating get NaN here and fail the comparison.
    popular_books = books_with_counts[books_with_counts['n_ratings'] > min_book_ratings]

    return pd.merge(popular_books, ratings, left_index=True, right_on='ISBN', how='left')

In [None]:
# Load the three raw tables, each from its default path.
ratings, books, users = load_ratings(), load_books(), load_users()

In [None]:
# Preview a few rows instead of dumping the whole users table into the output.
users.head()

In [None]:
# Row counts of the ratings, books and users tables, in that order.
tuple(map(len, (ratings, books, users)))

In [None]:
# Keep only the ratings of popular books, narrowed to the (book, user, rating) columns.
ratings = preprocess_ratings(ratings, books, users).loc[:, ['ISBN', 'User-ID', 'Book-Rating']]
ratings.head()

# KNN with Surprise

In [None]:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user id.

    ``predictions`` is an iterable of 5-tuples
    ``(uid, <ignored>, true_rating, estimated_rating, <ignored>)``.
    A rating counts as relevant (true) or recommended (estimated) when it is
    at least ``threshold``.

    Returns a pair of dicts ``(precisions, recalls)`` keyed by user id. A
    metric whose denominator is zero is reported as 0.
    """
    # Group (estimate, truth) pairs under the user they belong to.
    pairs_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        pairs_by_user[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in pairs_by_user.items():
        # Highest estimates first; the sort is stable, so ties keep input order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)

        # Relevance is counted over all of the user's items, not just the top k.
        n_rel = sum((actual >= threshold) for (_, actual) in ranked)

        # (recommended?, relevant?) flags for the k best-ranked items.
        top_flags = [((guess >= threshold), (actual >= threshold))
                     for (guess, actual) in ranked[:k]]
        n_rec_k = sum(recommended for (recommended, _) in top_flags)
        n_rel_and_rec_k = sum((relevant and recommended)
                              for (recommended, relevant) in top_flags)

        # Precision@k: share of the recommended items that are relevant.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@k: share of the relevant items that were recommended.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls

In [None]:
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, KFold, train_test_split

# Surprise reads the three columns positionally as (user, item, rating), so the
# user id must come first. With the book id first, books are treated as users
# and the per-"user" precision/recall below is averaged per book instead.
# NOTE(review): if `Book-Rating` contains 0 values they fall outside the
# declared (1, 10) scale - confirm how those rows should be handled.
data = Dataset.load_from_df(ratings[['User-ID', 'ISBN', 'Book-Rating']], Reader(rating_scale=(1, 10)))

trainset, testset = train_test_split(data, test_size=.25, random_state=RANDOM_STATE)

# One summary row per similarity measure.
result_rows = []

for sim_measure_name in ['cosine', 'msd', 'pearson']:
    # Item-based neighbours: similarities are computed between books, not
    # between users, while the metrics are still grouped by the real user.
    algo = KNNBasic(sim_options={'name': sim_measure_name, 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Macro-average: every user contributes equally regardless of rating count.
    precision = sum(precisions.values()) / len(precisions)
    recall = sum(recalls.values()) / len(recalls)

    result_rows.append({'similarity_measure': sim_measure_name, 'precision': precision, 'recall': recall})

results = pd.DataFrame(result_rows)

In [None]:
# Show the summary transposed; the two-decimal float format applies only inside this block.
with pd.option_context('display.float_format', '{:,.2f}'.format):
    display(results.transpose())