In [None]:
import os
import sys

# Put the directory two levels above the current working directory on the
# import path (presumably so the project-level `defaults` module resolves).
MAIN_MODULE_PATH = os.path.join(os.getcwd(), '..', '..')
# Guard the append so re-running this cell does not pile up duplicate entries.
if MAIN_MODULE_PATH not in sys.path:
    sys.path.append(MAIN_MODULE_PATH)

In [None]:
from collections import Counter

import numpy as np
import scipy
import pandas as pd
import seaborn as sns
import lightfm.data
import lightfm.cross_validation
import lightfm.evaluation

from defaults import BOOK_RATINGS, BOOKS, USERS

In [None]:
# Seed for the notebook's randomized steps (e.g. the train/test split).
RANDOM_STATE = 1234

# Preparing data

In [None]:
def load_ratings(path=BOOK_RATINGS):
    """Load the book-ratings table from a semicolon-separated CSV file.

    Args:
        path: Location of the ratings CSV; defaults to ``BOOK_RATINGS``.

    Returns:
        pd.DataFrame: The parsed ratings, with ``Book-Rating`` stored as ``int8``.
    """
    # Read from `path` rather than the module constant so the argument is honoured.
    ratings = pd.read_csv(path, sep=';')
    ratings['Book-Rating'] = ratings['Book-Rating'].astype('int8')
    return ratings

def load_books(path=BOOKS):
    """Load the book table, keeping only rows with a numeric publication year.

    Args:
        path: Location of the semicolon-separated books CSV; defaults to ``BOOKS``.

    Returns:
        pd.DataFrame: Books indexed by ``ISBN`` with ``Year-Of-Publication``
        stored as ``int16``. Malformed CSV lines are skipped.
    """
    try:
        # pandas >= 1.3 spelling for "skip malformed lines instead of raising".
        books = pd.read_csv(path, sep=';', on_bad_lines='skip', index_col='ISBN')
    except TypeError:
        # Older pandas only understands the (since removed) `error_bad_lines` flag.
        books = pd.read_csv(path, sep=';', error_bad_lines=False, index_col='ISBN')

    years = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
    has_year = years.notna().to_numpy()
    # Filter with a positional boolean mask: selecting by index label would
    # duplicate rows whenever the same ISBN appears more than once.
    books = books.loc[has_year].copy()
    # int8 tops out at 127 and cannot represent a four-digit year; int16 can.
    books['Year-Of-Publication'] = years[has_year].astype('int16').to_numpy()
    return books

def load_users(path=USERS):
    """Read the users table from a semicolon-separated CSV file.

    Args:
        path: Location of the users CSV; defaults to ``USERS``.

    Returns:
        pd.DataFrame: The parsed table, indexed by its ``User-ID`` column.
    """
    users = pd.read_csv(path, index_col='User-ID', sep=';')
    return users

In [None]:
MIN_BOOK_RATINGS = 20

def preprocess_ratings(ratings: pd.DataFrame, books: pd.DataFrame, users: pd.DataFrame,
                       min_book_ratings: int = MIN_BOOK_RATINGS) -> pd.DataFrame:
    """Keep only the ratings of books rated more than ``min_book_ratings`` times.

    Args:
        ratings: Rating rows; must contain ``ISBN`` and ``Book-Rating`` columns.
        books: Book table indexed by ISBN. Side effect: an ``n_ratings``
            column holding each book's rating count is added (or overwritten).
        users: Not used by this function; accepted so the call signature
            stays stable.
        min_book_ratings: Exclusive lower bound on a book's rating count.

    Returns:
        pd.DataFrame: The columns of the popular books joined with their
        rating rows (one row per rating).
    """
    # Count straight from `ratings`: joining the book table first adds
    # nothing to a per-ISBN row count and would inflate it if `books` ever
    # contained the same ISBN twice.
    rating_counts = ratings.groupby('ISBN')['Book-Rating'].size()
    books['n_ratings'] = rating_counts
    # Books with no ratings get NaN here, which compares False and drops out.
    popular_books = books[books['n_ratings'] > min_book_ratings]

    return pd.merge(popular_books, ratings, left_index=True, right_on='ISBN', how='left')

In [None]:
# Load the three raw tables, each from its loader's default path.
ratings = load_ratings()
books = load_books()
users = load_users()

In [None]:
# Row counts of the ratings, books and users tables, in that order.
tuple(len(table) for table in (ratings, books, users))

In [None]:
# Restrict to sufficiently-rated books, then keep just the three rating columns.
ratings = preprocess_ratings(ratings, books, users)
ratings = ratings.loc[:, ['ISBN', 'User-ID', 'Book-Rating']]
ratings.head()

In [None]:
# LightFM takes users first and items second, both in `Dataset.fit` and in the
# (user, item, weight) triples passed to `build_interactions`. Here `User-ID`
# values are the users and `ISBN` values are the items.
dataset = lightfm.data.Dataset()
dataset.fit(users=ratings['User-ID'].unique(), items=ratings['ISBN'].unique())
# Select the columns explicitly so the triple order does not depend on the
# column layout of `ratings`.
interactions, weights = dataset.build_interactions(
    ratings[['User-ID', 'ISBN', 'Book-Rating']].itertuples(index=False)
)

In [None]:
# Hold out 20% of the interactions for evaluation, seeded with RANDOM_STATE.
train, test = lightfm.cross_validation.random_train_test_split(
    interactions,
    random_state=RANDOM_STATE,
    test_percentage=0.2,
)

In [None]:
# Seed the model as well as the split; otherwise each run starts from
# different random embeddings and the reported metrics are not repeatable.
model = lightfm.LightFM(no_components=10, random_state=RANDOM_STATE)
# Train for 64 epochs on the training interactions only.
fitted_model = model.fit(train, epochs=64, verbose=False)

In [None]:
# Precision@5 on the held-out interactions, followed by its overall average.
prec_per_user = lightfm.evaluation.precision_at_k(
    fitted_model,
    test_interactions=test,
    train_interactions=train,
    k=5,
)
prec_per_user.mean()

# SVD++ with Surprise

In [None]:
from surprise import SVD, SVDpp, NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate


# Build the Surprise dataset from the ratings frame. `load_from_df` reads the
# three columns positionally as (user, item, rating), so order them explicitly.
# NOTE(review): any rating of 0 would fall outside the (1, 10) scale below;
# confirm whether such rows exist and should be dropped first.
data = Dataset.load_from_df(ratings[['User-ID', 'ISBN', 'Book-Rating']],
                            Reader(rating_scale=(1, 10)))

# Use the SVD++ algorithm with 40 latent factors.
algo = SVDpp(n_factors=40)

# Run 5-fold cross-validation and print results
cross_validate(algo, data, measures=['RMSE', 'MAE', 'fcp'], cv=5, n_jobs=10, verbose=True)