In [None]:
import os
import sys

# Put the directory two levels above the current working directory on the
# import path (presumably where the project-local `defaults` module lives --
# confirm against the repository layout).
MAIN_MODULE_PATH = os.path.join(os.getcwd(), os.pardir, os.pardir)
sys.path.append(MAIN_MODULE_PATH)

In [None]:
from collections import Counter
from typing import Optional

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from defaults import BOOK_RATINGS, BOOKS, USERS

## Read the data

In [None]:
# Load the three ';'-separated tables. Books are indexed by ISBN and users by
# User-ID; ratings keep the default positional index.
ratings = pd.read_csv(BOOK_RATINGS, sep=';')
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines='warn'` keeps the same behaviour: malformed rows are skipped
# and a warning is emitted for each of them.
books = pd.read_csv(BOOKS, sep=';', on_bad_lines='warn', index_col='ISBN')
users = pd.read_csv(USERS, sep=';', index_col='User-ID')

In [None]:
# Report how many rows each loaded table contains.
for label, frame in (("#ratings: ", ratings), ("#books: ", books), ("#users: ", users)):
    print(label, len(frame))

### Clean data

In [None]:
# Keep only books whose publication year parses as a number. A boolean mask is
# used instead of a label lookup so that duplicated ISBNs are not multiplied,
# and .copy() makes the column assignments below act on an independent frame.
publication_year = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
has_valid_year = publication_year.notna()
books = books.loc[has_valid_year].copy()
# Assign positionally (to_numpy) so no index alignment is involved.
books['Year-Of-Publication'] = publication_year[has_valid_year].astype(int).to_numpy()

# Attach per-ISBN rating statistics; assignment aligns on the ISBN index.
books_ratings_joined = pd.merge(ratings, books, left_on='ISBN', right_on='ISBN', how='left')
ratings_by_isbn = books_ratings_joined.groupby('ISBN')['Book-Rating']
books['n_ratings'] = ratings_by_isbn.size()
books['avg_rating'] = ratings_by_isbn.mean()

# Books without any rating get 0 for both statistics. The original second
# fillna targeted a non-existent 'books' column (KeyError); 'avg_rating' is
# presumably what was intended -- confirm. Plain assignment replaces the
# chained `fillna(..., inplace=True)`, which is unreliable on a column selection.
books['n_ratings'] = books['n_ratings'].fillna(0)
books['avg_rating'] = books['avg_rating'].fillna(0)

#### Calculate a weighted score based on the IMDB formula

In [None]:
# m: the 90th percentile of per-book rating counts, used as the vote-count
#    weight in the weighted-rating formula.
# C: the mean of the per-book average ratings, used as the prior rating.
m = books['n_ratings'].quantile(q=0.90)
C = books['avg_rating'].mean()

Function that computes the weighted rating of each book

In [None]:
def weighted_rating(x, m=m, C=C):
    """Blend an item's own average rating with the global prior.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is ``x['n_ratings']``
    and ``R`` is ``x['avg_rating']``.

    Args:
        x: Mapping-like object exposing 'n_ratings' and 'avg_rating'
            (for example a DataFrame row).
        m: Weight given to the prior; defaults to the notebook-level ``m``
            bound when this function is defined.
        C: Prior rating; defaults to the notebook-level ``C`` bound when this
            function is defined.

    Returns:
        The weighted rating.
    """
    votes, own_rating = x['n_ratings'], x['avg_rating']
    own_part = votes / (votes + m) * own_rating
    prior_part = m / (m + votes) * C
    return own_part + prior_part

In [None]:
# weighted_rating only uses element-wise arithmetic on x['n_ratings'] and
# x['avg_rating'], so it can be applied to whole columns at once. This yields
# the same values as the row-by-row `books.apply(..., axis=1)` without
# building a Series object for every row.
books['score'] = weighted_rating(books)

In [None]:
# Preview the first four and the last three columns of `books`.
summary_columns = [*books.columns[:4], *books.columns[-3:]]
books[summary_columns].head()

Books joined with their user ratings

In [None]:
# Take the first four and last three columns of `books`, move ISBN from the
# index back into a column, and left-join every individual rating onto it.
book_summary = books[[*books.columns[:4], *books.columns[-3:]]].reset_index()
books_ratings = book_summary.merge(ratings, on='ISBN', how='left')
books_ratings.head()

In [None]:
books_ratings.shape

## Peek data

In [None]:
books.head()

## Content based recommendation

A content-based recommender works with data that the user provides, either explicitly (rating) or implicitly (clicking on a link). Based on that data, a user profile is generated, which is then used to make suggestions to the user. As the user provides more inputs or takes actions on those recommendations, the engine becomes more and more accurate.

A recommender system has to decide between two methods for information delivery when providing the user with recommendations:
 - Exploitation. The system chooses documents similar to those for which the user has already expressed a preference.
 - Exploration. The system chooses documents where the user profile does not provide evidence to predict the user’s reaction.

In [None]:
# Working frame for the recommender: the first four and last three columns of
# `books`, with ISBN turned back into a regular column.
data = books[[*books.columns[:4], *books.columns[-3:]]].reset_index()
data.head()

In [None]:
def get_book_text_desc(book: pd.Series) -> Optional[str]:
    """Build the text description of a single book.

    The description is the title, the author with all whitespace removed, the
    publication year and the publisher with all whitespace removed, joined by
    single spaces.

    Args:
        book: Row providing 'Book-Title', 'Book-Author',
            'Year-Of-Publication' and 'Publisher'.

    Returns:
        The description, or None when a field is missing or is not a string
        (for example NaN). The offending row is printed in that case.
    """
    try:
        author = ''.join(book['Book-Author'].split())
        publisher = ''.join(book['Publisher'].split())
        return book['Book-Title'] + ' ' + author + ' ' + str(book['Year-Of-Publication']) + ' ' + publisher
    except (AttributeError, KeyError, TypeError):
        # Only data problems are handled here; a bare `except` would also
        # swallow KeyboardInterrupt and SystemExit.
        print(book)
        return None

In [None]:
# Build the descriptions from a frame whose missing values were replaced by
# empty strings, store them as 'Book-Desc', then drop rows for which no
# description was produced.
descriptions = data.fillna('').apply(get_book_text_desc, axis=1)
data.loc[:, 'Book-Desc'] = descriptions
data = data.dropna(subset=['Book-Desc'])

In [None]:
data.head()

In [None]:
class ContentBasedRecomennder:
    """Content-based recommender using TF-IDF vectors and cosine similarity.

    Args:
        data: DataFrame with at least the columns 'ISBN', 'Book-Title',
            'Book-Author', 'Year-Of-Publication', 'Publisher', 'n_ratings',
            'avg_rating' and the column named by ``fit_col``.
        fit_col: Name of the text column that is vectorized in ``fit``.
    """

    def __init__(self, data, fit_col):
        self._data = data
        self._fit_col = fit_col

    def fit(self, **cv_params):
        """Fit a TfidfVectorizer on the configured text column.

        Args:
            **cv_params: Keyword arguments forwarded to ``TfidfVectorizer``.
        """
        self._cv = TfidfVectorizer(**cv_params)
        # Vectorize the frame this instance was built with, not whatever the
        # global name `data` happens to refer to.
        self._cv_matrix = self._cv.fit_transform(self._data[self._fit_col])

    def recommend(self, book_title: str, top_n: int):
        """Print the ``top_n`` most similar books for every book with this title.

        The queried book itself is part of the ranking, as its similarity to
        itself is included in the comparison.

        Args:
            book_title: Exact value to look up in the 'Book-Title' column.
            top_n: Number of entries to print per matching book.

        Raises:
            KeyError: If no book has the given title.
        """
        # Rows of the TF-IDF matrix follow the row order of the frame, so the
        # lookup must use row positions. Index labels stop matching positions
        # once rows have been dropped from the frame.
        titles = self._data['Book-Title'].to_numpy()
        positions = np.flatnonzero(titles == book_title)
        if positions.size == 0:
            raise KeyError(book_title)

        for pos in positions:
            similarities = np.asarray(
                cosine_similarity(self._cv_matrix[pos], self._cv_matrix)
            ).ravel()
            # Descending order; slicing from the front keeps top_n=0 empty.
            top_positions = np.argsort(similarities)[::-1][:top_n]

            print(f"Most similar to {book_title} (ISBN: {self._data.iloc[pos]['ISBN']}) are following:\n")
            for rank, match_pos in enumerate(top_positions, start=1):
                book = self._data.iloc[match_pos]
                similarity = similarities[match_pos]
                print(f"{rank}. ({round(similarity, 2)}) - {book['Book-Title']}, {book['Book-Author']}, {str(book['Year-Of-Publication'])}, {book['Publisher']}, ISBN: {book['ISBN']}")
                # The number of raters comes from 'n_ratings', not 'avg_rating'.
                print(f"With average rating of {round(book['avg_rating'], 2)} rated by {book['n_ratings']:.0f} users.\n")
            print("====================================================")

In [None]:
# Build the recommender on the 'Book-Desc' text column and fit it with the
# vectorizer's default parameters.
recommender = ContentBasedRecomennder(data=data, fit_col='Book-Desc')
recommender.fit()

In [None]:
data[data['Book-Author'] == "Carlo D'Este"]

In [None]:
recommender.recommend('Decision in Normandy', 5)