In [None]:
import os
import sys

# Make the directory two levels above the current working directory importable
# (presumably where project modules such as `defaults` live — confirm).
working_dir = os.getcwd()
MAIN_MODULE_PATH = os.path.join(working_dir, '..', '..')
sys.path.append(MAIN_MODULE_PATH)

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import scipy
import pandas as pd
import seaborn as sns
import lightfm.data
import lightfm.cross_validation
import lightfm.evaluation
import pycountry

from defaults import BOOK_RATINGS, BOOKS, USERS

In [None]:
# Shared seed for randomized steps; used below for the LightFM train/test split.
RANDOM_STATE = 1234

# Preparing data
Loading and preprocessing the ratings, books and users tables.

In [None]:
def load_ratings(path=BOOK_RATINGS):
    """Load the book ratings table from a semicolon-separated CSV file.

    Args:
        path: Location of the ratings CSV. Defaults to ``BOOK_RATINGS``.

    Returns:
        A DataFrame with the file's columns, where ``Book-Rating`` is stored
        as ``int8``.
    """
    # Read from `path` (not the BOOK_RATINGS constant) so that a caller-supplied
    # location is actually honoured.
    ratings = pd.read_csv(path, sep=';')
    ratings['Book-Rating'] = ratings['Book-Rating'].astype('int8')
    return ratings

def load_books(path=BOOKS):
    """Load the books table, indexed by ISBN, keeping only rows with a numeric year.

    Args:
        path: Location of the semicolon-separated books CSV. Defaults to ``BOOKS``.

    Returns:
        A DataFrame indexed by ``ISBN`` whose ``Year-Of-Publication`` column is
        ``int16``. Rows whose year cannot be parsed as a number are dropped, and
        malformed CSV lines are skipped with a warning.
    """
    # `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which was
    # deprecated in pandas 1.3 and removed in pandas 2.0: bad lines are skipped.
    books = pd.read_csv(path, sep=';', on_bad_lines='warn', index_col='ISBN')

    # Filter with a boolean mask rather than by index labels, so that repeated
    # ISBNs in the index cannot multiply rows.
    years = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
    has_year = years.notna()
    books = books.loc[has_year].copy()

    # int16 rather than int8: four-digit years do not fit into int8's [-128, 127].
    books['Year-Of-Publication'] = years.loc[has_year].to_numpy().astype('int16')
    return books

def load_users(path=USERS):
    """Read the semicolon-separated users CSV at `path`, indexed by its `User-ID` column."""
    users = pd.read_csv(path, index_col='User-ID', sep=';')
    return users

In [None]:
MIN_BOOK_RATINGS = 20

def preprocess_ratings(ratings: pd.DataFrame, books: pd.DataFrame, users: pd.DataFrame,
                   min_book_ratings: int = MIN_BOOK_RATINGS):
    """Keep only the ratings of books rated more than `min_book_ratings` times.

    Args:
        ratings: Ratings table with at least the `ISBN` and `Book-Rating` columns.
        books: Books table indexed by ISBN. It is NOT modified by this function.
        users: Users table. Currently unused; kept for interface compatibility.
        min_book_ratings: A book is kept only when its number of ratings is
            strictly greater than this value.

    Returns:
        The book columns (plus an `n_ratings` count column) joined with the
        rating columns, one row per rating of a sufficiently rated book.
    """
    # Count ratings per ISBN directly on `ratings`; joining with `books` first adds
    # nothing to the counts and would inflate them if an ISBN repeats in `books`.
    ratings_per_book = ratings.groupby('ISBN')['Book-Rating'].size()

    # `assign` returns a new frame, so the caller's `books` does not silently gain
    # an `n_ratings` column. Books without any rating get NaN and are filtered out.
    books_with_counts = books.assign(n_ratings=ratings_per_book)
    popular_books = books_with_counts[books_with_counts['n_ratings'] > min_book_ratings]

    return pd.merge(popular_books, ratings, left_index=True, right_on='ISBN', how='left')

In [None]:
# Read the three raw tables.
ratings = load_ratings()
books = load_books()
users = load_users()

# Restrict to frequently rated books and keep only the columns the models consume.
rating_columns = ['ISBN', 'User-ID', 'Book-Rating']
ratings = preprocess_ratings(ratings, books, users)[rating_columns]
ratings.head()

In [None]:
# Row counts of the filtered ratings, the books and the users tables.
tuple(len(table) for table in (ratings, books, users))

In [None]:
# Number of ratings that are strictly greater than zero.
ratings['Book-Rating'].gt(0).sum()

In [None]:
# Preview the first rows only instead of rendering the whole table
# (consistent with the `ratings.head()` preview above).
books.head()

# SVD++ with Surprise

In [None]:
from collections import defaultdict
from statistics import mean
import pprint
import random

from surprise import SVD, SVDpp, NMF, NormalPredictor
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, RandomizedSearchCV, train_test_split
from surprise.accuracy import mae, rmse

In [None]:
# Surprise's `load_from_df` reads the columns positionally as (user, item, rating).
# `ratings` is stored as (ISBN, User-ID, Book-Rating), so reorder explicitly;
# otherwise books are treated as users and the later lookups
# `users.loc[int(user_id)]` / `books.loc[book_id]` receive the wrong kind of id.
# NOTE(review): the notebook counts ratings > 0 separately above, which suggests
# zero ratings are present; they fall outside rating_scale=(1, 10) — confirm
# whether they should be filtered out before training.
dataset = Dataset.load_from_df(ratings[['User-ID', 'ISBN', 'Book-Rating']], Reader(rating_scale=(1, 10)))
train, test = train_test_split(dataset, test_size=0.2, random_state=17)
model = SVDpp()
model.fit(train)

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    Args:
        predictions: Iterable of 5-element records unpacked as
            (user id, <ignored>, true rating, estimated rating, <ignored>).
        k: Only the k highest-estimated items of each user count as candidates
            for recommendation.
        threshold: A true rating at or above this value marks an item as
            relevant; an estimate at or above it marks an item as recommended.

    Returns:
        A pair of dicts ``(precisions, recalls)`` keyed by user id. Where a
        metric is undefined (nothing recommended / nothing relevant) it is 0.
    """
    # Collect (estimated, true) rating pairs per user.
    pairs_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        pairs_by_user[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, pairs in pairs_by_user.items():
        # Highest estimates first; ties keep their original relative order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items among everything the user rated.
        n_rel = sum((true_r >= threshold) for (_, true_r) in ranked)
        # Items in the top k that would actually be recommended.
        n_rec_k = sum((est >= threshold) for (est, _) in top_k)
        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in top_k)

        # Precision@k: share of recommended items that are relevant.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@k: share of relevant items that are recommended.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls

In [None]:
# Score the held-out ratings with the fitted Surprise estimator. It is bound to
# `model` above; the previously used name `fitted` was never defined (NameError).
predictions = model.test(test)
precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

In [None]:
# Average the per-user metrics into a single precision and recall figure.
mean_precision = mean(precisions.values())
mean_recall = mean(recalls.values())
mean_precision, mean_recall

In [None]:
# Rating-prediction error of the test predictions: RMSE first, then MAE.
rmse_score = rmse(predictions)
mae_score = mae(predictions)
rmse_score, mae_score

In [None]:
# users_prec_rec = pd.DataFrame.from_dict(precisions, orient='index', columns=['precision']).join(pd.DataFrame.from_dict(recalls, orient='index', columns=['recall']))
# users_prec_rec.index = users_prec_rec.index.astype(int)
# users_prec_rec = users_prec_rec.join(users)
# users_prec_rec.head()

In [None]:
def get_top_n(predictions, n=10):
    """Pick the n highest-estimated items for every user.

    Args:
        predictions: Iterable of 5-element records unpacked as
            (user id, item id, true rating, estimated rating, <ignored>).
        n (int): Maximum number of items to keep per user. Default is 10.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of at most n
        ``(item id, estimated rating)`` tuples, highest estimate first.
    """
    # Gather every (item, estimate) pair under its user.
    estimates_by_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        estimates_by_user[uid].append((iid, est))

    # Rank each user's items by estimate (descending) and truncate to n entries.
    for uid in list(estimates_by_user):
        ranked = sorted(estimates_by_user[uid], key=lambda pair: pair[1], reverse=True)
        estimates_by_user[uid] = ranked[:n]

    return estimates_by_user

In [None]:
random.seed(17)
# Keep only users that received more than three recommendations in their top five.
top_n_recommendations = {
    user: recommendations
    for user, recommendations in get_top_n(predictions, n=5).items()
    if len(recommendations) > 3
}
# Sample WITHOUT replacement so that no user is printed twice (`random.choices`
# draws with replacement). The sample size is capped for small populations.
candidates = list(top_n_recommendations.items())
random_n = random.sample(candidates, k=min(10, len(candidates)))
for user_id, user_recommendations in random_n:
    print(users.loc[int(user_id)].to_dict())
    # `rank` is the estimated rating attached to each recommended book.
    for i, (book_id, rank) in enumerate(user_recommendations):
        print(f"{i}. [Rank={rank}], Title:, {books.loc[book_id].to_dict()['Book-Title']}")
#         pprint.pprint(books.loc[book_id].to_dict(), indent=4)
    print('\n-'.rjust(30, '-'), end='\n')

# Hybrid model (LightFM)

In [None]:
# LightFM's `Dataset.fit` takes the user ids first and the item ids second, and
# `build_interactions` expects (user, item, weight) tuples. `ratings` is stored as
# (ISBN, User-ID, Book-Rating), so select the columns explicitly in user-first
# order; otherwise the interaction matrix is books x users and "per-user" metrics
# are in fact computed per book.
dataset = lightfm.data.Dataset()
dataset.fit(ratings['User-ID'].unique(), ratings['ISBN'].unique())
interactions, weights = dataset.build_interactions(
    ratings[['User-ID', 'ISBN', 'Book-Rating']].itertuples(index=False)
)

In [None]:
# Pass a RandomState instance rather than a bare int: every LightFM version accepts
# an instance, while some only call `.shuffle` on the given object and would fail
# on an int. Where ints are accepted, seeding this way gives the same generator.
train, test = lightfm.cross_validation.random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=np.random.RandomState(RANDOM_STATE),
)

In [None]:
# Seed the model so that its random initialisation, and hence the reported
# metrics, are reproducible across Restart & Run All.
model = lightfm.LightFM(no_components=10, random_state=RANDOM_STATE)
fitted_model = model.fit(train, epochs=64, verbose=False)

In [None]:
# Precision@5 on the held-out interactions, then averaged into a single score.
prec_per_user = lightfm.evaluation.precision_at_k(
    fitted_model,
    test_interactions=test,
    train_interactions=train,
    k=5,
)
np.mean(prec_per_user)