In [None]:
import os
import sys

# Make the project root (two directories above the working directory)
# importable so that project-local modules can be resolved.
MAIN_MODULE_PATH = os.path.join(os.getcwd(), os.pardir, os.pardir)
sys.path.append(MAIN_MODULE_PATH)

In [None]:
from collections import Counter, defaultdict
import itertools
import multiprocessing

import numpy as np
import scipy
import pandas as pd
import seaborn as sns
import lightfm.data
import lightfm.cross_validation
import lightfm.evaluation
from tqdm.auto import tqdm, trange
from sklearn.metrics import ndcg_score

from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, KFold, train_test_split
from surprise.prediction_algorithms.co_clustering import CoClustering

from defaults import BOOK_RATINGS, BOOKS, USERS

In [None]:
# Seed handed to `train_test_split` below so the train/test split is reproducible.
RANDOM_STATE = 1234

In [None]:
def load_ratings(path=BOOK_RATINGS):
    """Load the book-ratings table from a ``;``-separated CSV file.

    Args:
        path: Location of the ratings CSV. Defaults to ``BOOK_RATINGS``.

    Returns:
        pd.DataFrame: the file's columns, with ``Book-Rating`` cast to ``int8``.
    """
    # Read from `path` (not the module-level constant) so that a
    # caller-supplied location is actually honoured.
    ratings = pd.read_csv(path, sep=';')
    ratings['Book-Rating'] = ratings['Book-Rating'].astype('int8')
    return ratings

def load_books(path=BOOKS):
    """Load the books table from a ``;``-separated CSV file, indexed by ``ISBN``.

    Malformed CSV lines are skipped (with a warning). Rows whose
    ``Year-Of-Publication`` cannot be parsed as a number are dropped, and the
    remaining years are stored as ``int16``.

    Args:
        path: Location of the books CSV. Defaults to ``BOOKS``.

    Returns:
        pd.DataFrame: books indexed by ``ISBN`` with a numeric publication year.
    """
    try:
        # `on_bad_lines='warn'` is the replacement for the removed
        # `error_bad_lines=False` (which also warned by default).
        books = pd.read_csv(path, sep=';', on_bad_lines='warn', index_col='ISBN')
    except TypeError:
        # Older pandas releases do not know `on_bad_lines`.
        books = pd.read_csv(path, sep=';', error_bad_lines=False, index_col='ISBN')

    years = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
    # Use a positional boolean mask: selecting by index labels would duplicate
    # rows whenever the ISBN index is not unique.
    valid = years.notna().to_numpy()
    books = books.loc[valid].copy()
    # int8 only covers -128..127 and cannot hold a four-digit year; int16 can.
    books['Year-Of-Publication'] = years[valid].astype('int16').to_numpy()
    return books

def load_users(path=USERS):
    """Read the users table from a ``;``-separated CSV, indexed by ``User-ID``."""
    users = pd.read_csv(path, index_col='User-ID', sep=';')
    return users

def _filter_ratings(ratings: pd.DataFrame) -> pd.DataFrame:
    """Filters out interaction of user and books having #interactions below the threshold."""
    book_interactions_cutoff = user_interaction_cutoff = 10
    book_mask = (ratings['ISBN'].map(ratings['ISBN'].value_counts())
                 >= book_interactions_cutoff)
    ratings = ratings[book_mask]

    user_mask = (ratings['User-ID'].map(ratings['User-ID'].value_counts())
                 >= user_interaction_cutoff)
    ratings = ratings[user_mask]

    # project ids to indices - make index-space compact
    ratings['ISBN'] = ratings['ISBN'].astype('category').cat.codes
    ratings['User-ID'] = ratings['User-ID'].astype('category').cat.codes

    return ratings

In [None]:
# Load the three raw tables from their default locations.
ratings = load_ratings()
books = load_books()
users = load_users()

In [None]:
# ratings = preprocess_ratings(ratings, books, users)[['ISBN', 'User-ID', 'Book-Rating']]
# Drop sparse books/users and re-encode the ids as compact integer codes.
# NOTE: `ratings` is rebound here, so re-running this cell applies the filter
# again to the already-filtered frame.
ratings = ratings.pipe(_filter_ratings)
ratings.head()

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    Each prediction is a 5-tuple ``(uid, iid, true_r, est, details)``; only
    ``uid``, ``true_r`` and ``est`` are used. An item is *relevant* when its
    true rating is at least ``threshold`` and *recommended* when its estimated
    rating is at least ``threshold`` and it ranks among the user's top ``k``
    items by estimate.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id. A metric whose
        denominator is zero is reported as 0.
    """
    # Group (estimate, truth) pairs by user.
    by_user = defaultdict(list)
    for uid, _iid, actual, estimated, _details in predictions:
        by_user[uid].append((estimated, actual))

    precisions, recalls = {}, {}
    for uid, pairs in by_user.items():
        # Highest estimates first; ties keep their original order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items over the user's whole list.
        relevant_total = sum(actual >= threshold for _, actual in ranked)
        # Items in the top k that clear the recommendation threshold.
        recommended = sum(estimated >= threshold for estimated, _ in top_k)
        # Items in the top k that are both relevant and recommended.
        hits = sum((actual >= threshold) and (estimated >= threshold)
                   for estimated, actual in top_k)

        # Precision@k is undefined without recommendations; report 0.
        if recommended != 0:
            precisions[uid] = hits / recommended
        else:
            precisions[uid] = 0

        # Recall@k is undefined without relevant items; report 0.
        if relevant_total != 0:
            recalls[uid] = hits / relevant_total
        else:
            recalls[uid] = 0

    return precisions, recalls

In [None]:
def ndcg_at_k(predictions, k=10, threshold=3.5):
    """Return the mean per-user NDCG@k over ``predictions``.

    Predictions are grouped by user. For each user the true ratings are used
    as relevance scores and the estimated ratings as ranking scores, and
    ``ndcg_score`` is evaluated with cut-off ``k``.

    Users with fewer than two predictions are skipped: a single-item ranking
    carries no ordering information, and recent scikit-learn releases raise a
    ``ValueError`` for it.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``;
            only ``uid``, ``true_r`` and ``est`` are used.
        k: Rank cut-off forwarded to ``ndcg_score``.
        threshold: Unused; kept so the signature mirrors
            ``precision_recall_at_k``.

    Returns:
        Mean NDCG@k across the scored users, or ``nan`` when no user has at
        least two predictions.
    """
    # First map the predictions to each user as parallel (true, est) lists.
    user_true_est = defaultdict(lambda: ([], []))
    for uid, _, true_r, est, _ in predictions:
        true_ratings, est_ratings = user_true_est[uid]
        true_ratings.append(float(true_r))
        est_ratings.append(float(est))

    scores = [
        ndcg_score([true_ratings], [est_ratings], k=k)
        for true_ratings, est_ratings in user_true_est.values()
        if len(true_ratings) > 1
    ]
    if not scores:
        # Avoid np.mean([]) and its RuntimeWarning; the result is nan either way.
        return float('nan')
    return np.mean(scores)

In [None]:
def evaluate(n_clusters_users, n_clusters_items, i):
    """Fit CoClustering on the global ``trainset`` and score it on ``testset``.

    Args:
        n_clusters_users: Number of user clusters.
        n_clusters_items: Number of item clusters.
        i: Repetition index. It is echoed in the result and also offsets the
            seed, so repeats are reproducible yet differ from each other.

    Returns:
        dict with the three arguments plus the mean per-user ``precision`` and
        ``recall`` (k=5, threshold=4) and ``ndcg_at_k`` (that function's
        default k).
    """
    # Seed explicitly: without a seed the runs are not reproducible, and pool
    # workers created by fork start from the same global NumPy RNG state, so
    # repeated runs are not guaranteed to be independent.
    algo = CoClustering(n_clusters_users, n_clusters_items,
                        random_state=RANDOM_STATE + i)
    algo.fit(trainset)
    predictions = algo.test(testset)

    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    # Guard the averages so an empty test set yields 0.0 instead of raising.
    precision = sum(precisions.values()) / len(precisions) if precisions else 0.0
    recall = sum(recalls.values()) / len(recalls) if recalls else 0.0

    # NOTE(review): NDCG uses the default k=10 while precision/recall use k=5;
    # confirm this is intended.
    return {
        'n_clusters_users': n_clusters_users,
        'n_clusters_items': n_clusters_items,
        'i': i,
        'precision': precision,
        'recall': recall,
        'ndcg_at_k': ndcg_at_k(predictions),
    }

In [None]:
# Surprise expects exactly three columns in (user, item, rating) order, so
# select them explicitly rather than relying on the frame's column order.
# NOTE(review): rating_scale=(1, 10) assumes no 0 ratings remain in `ratings` — confirm.
data = Dataset.load_from_df(
    ratings[['User-ID', 'ISBN', 'Book-Rating']],
    Reader(rating_scale=(1, 10)),
)

trainset, testset = train_test_split(data, test_size=.25, random_state=RANDOM_STATE)

# Fibonacci-spaced cluster counts. The leading 1 appears only once: repeating
# it re-runs identical configurations and produces duplicate
# (users, items) keys in the result table.
n_clusters_users = [1, 2, 3, 5, 8, 13, 21, 34, 55]
n_clusters_items = [1, 2, 3, 5, 8, 13, 21, 34, 55]
i = [0, 1]

param_grid = list(itertools.product(n_clusters_users, n_clusters_items, i))


def _evaluate_params(params):
    """Unpack one (n_clusters_users, n_clusters_items, i) tuple for Pool.imap."""
    return evaluate(*params)


# Do not start more workers than there are CPUs (capped at the original 15).
with multiprocessing.Pool(processes=min(15, os.cpu_count() or 1)) as pool:
    # imap yields results in order as they finish, so the bar tracks completed
    # fits. Wrapping the argument list of starmap only tracks how fast the
    # tasks are queued, which is almost instantaneous.
    results = list(tqdm(pool.imap(_evaluate_params, param_grid),
                        total=len(param_grid)))

results = pd.DataFrame(results)

In [None]:
# Heatmap of mean precision for the first repetition (i == 0), user clusters
# on the rows and item clusters on the columns.
to_plot = results[results.i == 0]
# pivot_table with keyword arguments: positional arguments to `pivot` are
# rejected by pandas >= 2.0, and `pivot` raises on repeated (users, items)
# pairs, whereas pivot_table averages them.
to_plot = to_plot.pivot_table(
    index="n_clusters_users",
    columns="n_clusters_items",
    values="precision",
    aggfunc="mean",
)
ax = sns.heatmap(to_plot)
ax.set_title("precision");

In [None]:
# Heatmap of mean recall for the first repetition (i == 0), user clusters on
# the rows and item clusters on the columns.
to_plot = results[results.i == 0]
# pivot_table with keyword arguments: positional arguments to `pivot` are
# rejected by pandas >= 2.0, and `pivot` raises on repeated (users, items)
# pairs, whereas pivot_table averages them.
to_plot = to_plot.pivot_table(
    index="n_clusters_users",
    columns="n_clusters_items",
    values="recall",
    aggfunc="mean",
)
ax = sns.heatmap(to_plot)
ax.set_title("recall");

In [None]:
# Heatmap of mean NDCG for the first repetition (i == 0), user clusters on the
# rows and item clusters on the columns.
to_plot = results[results.i == 0]
# pivot_table with keyword arguments: positional arguments to `pivot` are
# rejected by pandas >= 2.0, and `pivot` raises on repeated (users, items)
# pairs, whereas pivot_table averages them.
to_plot = to_plot.pivot_table(
    index="n_clusters_users",
    columns="n_clusters_items",
    values="ndcg_at_k",
    aggfunc="mean",
)
ax = sns.heatmap(to_plot)
ax.set_title("ndcg_at_k");

In [None]:
# Metric columns only, ordered from highest to lowest recall.
metric_columns = ['precision', 'recall', 'ndcg_at_k']
results.sort_values(by='recall', ascending=False)[metric_columns]

In [None]:
# All result columns, ordered from highest to lowest NDCG.
results.sort_values('ndcg_at_k', ascending=False)