In [0]:
%pip

!pip install tqdm

In [0]:
import pandas as pd
from tqdm import tqdm

# Read Data

In [0]:
books_df = pd.read_csv('../data/processed/books.csv')
display(books_df)

In [0]:
ratings_df = pd.read_csv('../data/processed/ratings.csv')
display(ratings_df)

In [0]:
books_df.shape[0]

This is a lot of books and would produce a huge matrix for the cosine distances, especially when considering multiple features and embeddings or vectorisations of each book's features.

# Popular books

Due to Google API limits (which I plan to use for further augmentation) I will take the top 1000 books from the dataset according to number of reviews in the reviews dataset.

In [0]:
ratings_count = ratings_df.groupby('ISBN').size().reset_index(name='num_ratings')
books_df = books_df.merge(ratings_count, on='ISBN', how='left')
top_books = books_df.sort_values(by='num_ratings', ascending=False).head(1000)
top_books.drop(columns=['num_ratings'], inplace=True)
top_books.reset_index(drop=True, inplace=True)

## Lets add some more features

For my content based recommender just using author, title, publisher and age is not going to be enough to get good predictions. I will use Google Books API to augment the data to get additional features such as genre, page count, description and so on.

In [0]:
import requests
import time

def get_google_books_data_no_key(title, author):
    query = f"intitle:{title}+inauthor:{author}"
    url = f"https://www.googleapis.com/books/v1/volumes?q={query}"

    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        if data['totalItems'] == 0:
            return None

        book_info = data['items'][0]['volumeInfo']
        return {
            'description': book_info.get('description'),
            'categories': book_info.get('categories'),
            'pageCount': book_info.get('pageCount'),
            'averageRating': book_info.get('averageRating'),
            'ratingsCount': book_info.get('ratingsCount'),
            'language': book_info.get('language')
        }

    except Exception as e:
        print(f"Error fetching data for '{title}' by {author}: {e}")
        return None

def get_google_books_data(title, author):
    api_key = "MY_KEY_PLACEHOLDER"
    query = f"intitle:{title}+inauthor:{author}"
    url = f"https://www.googleapis.com/books/v1/volumes?q={query}&key={api_key}"

    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()

        if data['totalItems'] == 0:
            return None

        book_info = data['items'][0]['volumeInfo']
        return {
            'description': book_info.get('description'),
            'categories': book_info.get('categories'),
            'pageCount': book_info.get('pageCount'),
            'averageRating': book_info.get('averageRating'),
            'ratingsCount': book_info.get('ratingsCount'),
            'language': book_info.get('language')
        }

    except Exception as e:
        print(f"Error fetching data for '{title}' by {author}: {e}")
        return None

### All commented out code from the API request

In [0]:
# augmented_books_data = []

# for index, row in tqdm(top_books.iterrows()):
#     book_info = get_google_books_data(row['bookTitle'], row['bookAuthor'])
#     if book_info:
#         book_info['bookTitle'] = row['bookTitle']
#         book_info['bookAuthor'] = row['bookAuthor']
#         augmented_books_data.append(book_info)
#     else:
#         print(f"No book data found for '{row['bookTitle']}' by {row['bookAuthor']}")

In [0]:
# top_books[top_books.bookTitle == 'the devil in the white city : murder, magic, and madness at the fair that changed america (illinois)'].index[0]

I ran out of requests at index 945 so I'll be cheeky and do the rest without an api key.

In [0]:
# ran out of requests at index 945 do the rest without an api key
# for index, row in tqdm(top_books.iloc[945:].iterrows()):
#     book_info = get_google_books_data_no_key(row['bookTitle'], row['bookAuthor'])
#     if book_info:
#         book_info['bookTitle'] = row['bookTitle']
#         book_info['bookAuthor'] = row['bookAuthor']
#         augmented_books_data.append(book_info)
#     else:
#         print(f"No book data found for '{row['bookTitle']}' by {row['bookAuthor']}")

I drop `averageRating` and `ratingsCount` high null percentage and can get similar from `ratings_df` anyways. `pageCount` has a lot of 0 values which are likely missing so I'll replace with null.

In [0]:
# augmented_df = pd.DataFrame(augmented_books_data)
# augmented_df = augmented_df.drop(['averageRating', 'ratingsCount', 'language'], axis=1)
# augmented_df['pageCount'] = augmented_df['pageCount'].replace(0, np.nan)

In [0]:
# # check for duplicates based on bookTitle and bookAuthor
# duplicates = augmented_df.duplicated(subset=['bookTitle', 'bookAuthor'])
# print(f"Number of duplicate entries: {duplicates.sum()}")
# # drop duplicates
# augmented_df = augmented_df.drop_duplicates(subset=['bookTitle', 'bookAuthor'])


In [0]:
# Save to CSV
# augmented_df.to_csv("../data/external/augmented_books_data.csv", index=False)

### Process the Google books data

In [0]:
augmented_df = pd.read_csv("../data/external/augmented_books_data.csv")

In [0]:
augmented_df.isnull().sum()

Unfortunately there is some missing data.

In [0]:
augmented_df[augmented_df.description.isnull()]

It appears that even some very popular books are missing - this is likely due to small differences in the `bookTitle` or `author` in the text or some illegal punctuation in the request URL due to the author or title. I also just realised that I can use the ISBN with google api - next time I would definitely go for that option. But for now I'm out of requests and there are not too many missing values. 

Close your eyes while I shamefully drop all rows with missing entries for simiplicity. Even if I could not get the information using the API with ISBN I could:
- Impute the `pageCount` using the median/mean potentially stratified by author and/or publisher
- Impute the `categories` (actually just one category) again using the publisher
- Leave the `description` empty


In [0]:
top_books = top_books.merge(augmented_df, on=['bookTitle', 'bookAuthor'], how='left')

In [0]:
top_books = top_books.dropna()
top_books = top_books.reset_index(drop=True)

In [0]:
# convert categories to list
import ast
top_books['categories'] = top_books['categories'].apply(ast.literal_eval)

In [0]:
top_books['categories'].apply(len).value_counts()

In [0]:
top_books['category'] = top_books['categories'].apply(lambda x: x[0] if x else None)
top_books.drop(columns=['categories'], inplace=True)

In [0]:
top_books['category'].nunique()

Rather disappointingly there is only one category per book, I saw in plenty of datasets online lists of 10s of categories for each book. This would provide a lot more information, I would try to get more genre information in future.

In [0]:
top_books['category'].value_counts()

However for now this makes my life a lot simpler since I'm not doing any sophisticated embeddings and simpler TF-IDF vectorisation will likely not help too much since the categories are very short and likely won't contain similar words for similar genres and in some cases could be confusing Fiction vs Non-Fiction. I will likely go for a very simple distance metric from one hot-encoding.

# First Content Based Recommender

I will start with a recommender that only uses the data provided in the dataset.

In [0]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

## Base Features

In [0]:
author_encoded = pd.get_dummies(top_books['bookAuthor'], prefix='author')
publisher_encoded = pd.get_dummies(top_books['publisher'], prefix='publisher')
age_normalised = MinMaxScaler().fit_transform(top_books[['bookAge']])
x_auth_pub_age = pd.concat([author_encoded.reset_index(drop=True), publisher_encoded.reset_index(drop=True), pd.DataFrame(age_normalised).reset_index(drop=True)], axis=1)
similarity_matrix = cosine_similarity(x_auth_pub_age)


In [0]:
import numpy as np

def recommend(book_index, similarity_matrix, df, top_n=5):
    sim_scores = list(enumerate(similarity_matrix[book_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in sim_scores[1:top_n+1]]
    recommendations = df.iloc[top_indices].drop(columns=['ISBN'])
    recommendations['Score'] = [score for _, score in sim_scores[1:top_n+1]]
    recommendations['Rank'] = range(1, len(recommendations) + 1)
    return recommendations

In [0]:
book_index = 1
print(f"Book : {top_books.iloc[book_index]}")
recommend(book_index, similarity_matrix, top_books, 10)

As a little qualitative test I can see that the author and the publisher ar ethe same and the book age similar, suggesting that the similarity matrix is working as expected.

## Encode book title

For now I will start with TF-IDF vectorisations as they're simple to implement and still relevant although not as powerful as semamtic embeddings from BERT or learned embeddings. Similar books can still often contain the some of the same words so this could still be valuable information.

Although often book titles are quite abstract and contain the same words and be very differnt. The same can be said for book titles with similar semantic meanings too such as 'Three comrades' and 'Conversations with friends'!

So clearly more than just the book title would be needed to recommend another book.

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_title = tfidf.fit_transform(top_books['bookTitle'])

In [0]:
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
X_sparse = csr_matrix(x_auth_pub_age)
combined_features = hstack([tfidf_matrix_title, X_sparse])

In [0]:
similarity_matrix_with_title = cosine_similarity(combined_features)
book_index = 2
print(f"Book : {top_books.iloc[book_index]}")
recommend(book_index, similarity_matrix_with_title, top_books, 10)

Here you can see the `bookTitle` is clearly being used as information to suggest similar books `a novel` is repeated in the top 4 suggestions. This is clearly not the most sophisticated recommendation of just suggesting other books with `a novel` however it is implicitly suggesting other fiction books just from the title without even providing the additional gender information! 

## How can I make these predictions interpretable?

A key goal and reason I went for the content-based recommender is because I wanted to add an interpetable aspect to the prediction, which I think for books is especially valuable and a feature I haven't seen before.

This also opens the door to incorporate feedback from the user on the suggestion on specific metrics.

The seemingly straightforward solution is to decompose the similarity matrix considering each factor separately and the combining them with weights. The next question is what should these weights be. The weights could be learned or since I am also keen on incorporating user feedback for this first step I can make these parameters which the user can directly adjust. In later stages the feature importance could be learned a more sophisticated user input mechanism introduced.

In [0]:
import numpy as np

def inverse_normalised_distance_similarity(vec):
    vec = np.array(vec).reshape(-1, 1)  # Ensure column vector
    dists = np.abs(vec - vec.T)
    max_dist = dists.max()

    if max_dist == 0:
        # All values are identical → full similarity
        return np.ones_like(dists)

    similarity = 1 - (dists / max_dist)
    return similarity

For scalars I use an inverse normalised manhatten distance as a first metric. I only have `bookAge` and later also `pageCount`.

In [0]:
author_array = np.array(top_books['bookAuthor'])
similarity_authors = np.equal.outer(author_array, author_array).astype(float)
publisher_array = np.array(top_books['publisher'])
similarity_publishers = np.equal.outer(publisher_array, publisher_array).astype(float)
tfidf_title_matrix = tfidf.fit_transform(top_books['bookTitle'])
similarity_title = cosine_similarity(tfidf_title_matrix)
age_array = np.array(top_books['bookAge'])
similarity_age = inverse_normalised_distance_similarity(age_array)

similarity_weights = {'title': 0.3, 'author': 0.3, 'publisher': 0.3, 'age': 0.1}
similarity_matrices = {
    'title': similarity_title,
    'author': similarity_authors,
    'publisher': similarity_publishers,
    'age': similarity_age
}

In [0]:
import pandas as pd
import numpy as np

def recommend_books_interpretable(book_index, similarity_matrices, weights, df, top_n=5):
    """
    Recommend books using interpretable similarity components.
    
    Parameters:
        book_index (int): Index of the reference book
        similarity_matrices (dict): Dict of feature name -> similarity matrix (NxN)
        weights (dict): Dict of feature name -> weight (must match keys in similarity_matrices)
        df (pd.DataFrame): DataFrame with book info
        top_n (int): Number of recommendations to return

    Returns:
        pd.DataFrame: Recommended books with similarity breakdown and total score
    """
    feature_scores = {}
    
    # Extract similarity for the target book for each feature
    for feature, sim_matrix in similarity_matrices.items():
        feature_scores[feature] = sim_matrix[book_index]

    # Compute final weighted similarity score
    final_similarity = np.zeros_like(next(iter(feature_scores.values())))
    for feature in feature_scores:
        final_similarity += weights[feature] * feature_scores[feature]

    # Enumerate and rank by final score
    sim_scores = list(enumerate(final_similarity))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in sim_scores if i != book_index][:top_n]

    # Build result dataframe
    recommendations = df.iloc[top_indices].drop(columns=['ISBN'])
    recommendations['Rank'] = range(1, len(recommendations) + 1)

    # Add feature-wise similarity scores
    for feature in feature_scores:
        recommendations[f'sim_{feature}'] = [feature_scores[feature][i] for i in top_indices]

    # Add final similarity score
    recommendations['Score'] = [final_similarity[i] for i in top_indices]

    return recommendations


In [0]:
similarity_matrix_with_title = cosine_similarity(combined_features)
book_index = 4
print(f"Book : {top_books.iloc[book_index]}")
recommend(book_index, similarity_matrix_with_title, top_books, 10)

In [0]:
book_index = 4
print(f"Book : {top_books.iloc[book_index]}")
similarity_weights = {'title': 0.4, 'author': 0.3, 'publisher': 0.2, 'age': 0.1}
recommend_books_interpretable(book_index, similarity_matrices, similarity_weights, top_books, 10)

The suggests are comparable with one difference in the first suggestion, which you can see is suggested due to similarity in title, however the suggestions are still overwhelminginly dominated by books from ballentine books publisher. It does not seem like this is enough information to really make such a good recommendation.

## Add in Google Books Data

On top of author, publisher, title and age. I will add description, category and the page count of the books as factors to be used in comparing the books. 

- For `description` I'll use TF-IDF vectorisation once again capturing semantics would be a lot more powerful here.
- For `category` since it is only one category I will just exactly compare categories rather than through an encoding. I would be interesting to see how differently an embedding would behave as I did notice categories such as `Fiction` and `Juvenile Fiction` which an embedding would be able to capture this similarity.
- For `pageCount` I used the scalar metric as before with age. 

In [0]:
tfidf_matrix_description = tfidf.fit_transform(top_books['description'])
similarity_matrix_description = cosine_similarity(tfidf_matrix_description)
category_array = np.array(top_books['category'])
similarity_matrix_category = np.equal.outer(category_array, category_array).astype(float)
pages_array = np.array(top_books['pageCount'])
similarity_pages = inverse_normalised_distance_similarity(pages_array)
# also for other features
author_array = np.array(top_books['bookAuthor'])
similarity_authors = np.equal.outer(author_array, author_array).astype(float)
publisher_array = np.array(top_books['publisher'])
similarity_publishers = np.equal.outer(publisher_array, publisher_array).astype(float)
tfidf_title_matrix = tfidf.fit_transform(top_books['bookTitle'])
similarity_title = cosine_similarity(tfidf_title_matrix)
age_array = np.array(top_books['bookAge'])
similarity_age = inverse_normalised_distance_similarity(age_array)

similarity_weights_extra = {'title': 0.1, 'author': 0.2, 'publisher': 0.1, 'age': 0.05, 'description': 0.2, 'category': 0.2, 'pages': 0.05}

similarity_matrices_extra = {
    'title': similarity_title,
    'author': similarity_authors,
    'publisher': similarity_publishers,
    'age': similarity_age,
    'description': similarity_matrix_description,
    'category': similarity_matrix_category,
    'pages': similarity_pages
}

In [0]:
book_index = 4
print(f"Book : {top_books.iloc[book_index]}")
similarity_weights_extra = {'title': 0.1, 'author': 0.2, 'publisher': 0.1, 'age': 0.05, 'description': 0.2, 'category': 0.2, 'pages': 0.05}
recommend_books_interpretable(book_index, similarity_matrices_extra, similarity_weights_extra, top_books, 10)

The recommendations are different to what I had before so the additional features and also slighlty different weights are playing a role. I can also see the description similarities are often very low likely because I'm not capturing any semantic meaning and the overlap in words is likely small. Its probably wise if this is given more weight than the other categories especially since people tend to pick books based on descriptions.

# Evaluation

I will use the users ratings to evaluate my model. Since my setting is a little uncoventional, I need a slightly different way to evaluate it. I flip regular LOOCV on its head.

I iterate over the set of books the user has read and enjoyed. I take one as the test book which I actually use as my input to the algorithm and compare the set of predictions to the rest of the books the user has read and enjoyed. I calculate average hit rate, average precision@N, average recall@N for each model.

I have been tinkering with these metrics a bit regarding the fairest ways to record them especially with recall. Open for discussion!?

In [0]:
ratings_df = pd.read_csv('../data/processed/ratings.csv')

In [0]:
# filter ratings to match top_books
ratings_df = ratings_df[ratings_df['ISBN'].isin(top_books['ISBN'])]
ratings_df.shape[0]

Even with the reduced amount of books there are still plenty of reviews.

In [0]:
min_ratings_per_user = 50
user_counts = ratings_df['userID'].value_counts()
active_users = user_counts[user_counts >= min_ratings_per_user].index
ratings_filtered = ratings_df[ratings_df['userID'].isin(active_users)]

I give a rather low threshold on the book ratings to preserve my sample size and use the threshold of ratings greater than 5 to be relevant/positively reviewed books. This does mean for now I have removed implicit reviews entirely.

In [0]:
ratings_filtered.shape[0]

After filtering to find users that have actually reviewed over 50 books I'm left with just over 30000 which is not so many books anymore but it is important for the evaluation that the users have read and reviewed at least a few books.

In [0]:
ratings_filtered[ratings_filtered['bookRating'] > 5].shape[0]

Note though that most of these are implicit and some negative (less than 5) reviews which won't be in the target set later.

In [0]:
ratings_positive = ratings_filtered[ratings_filtered['bookRating'] >= 5]

In [0]:
from matplotlib import pyplot as plt

# Count ratings
user_counts_all = ratings_filtered['userID'].value_counts()
user_counts_positive = ratings_positive['userID'].value_counts()

# Plot overlaid histograms
plt.figure(figsize=(10, 6))
plt.hist(user_counts_all, bins=50, alpha=0.5, label='All Ratings')
plt.hist(user_counts_positive, bins=50, alpha=0.5, label='Positive Ratings')

plt.xlabel('Number of Ratings per User')
plt.ylabel('Frequency')
plt.title('Distribution of Ratings per User')
plt.legend()
plt.show()


In [0]:
ratings_positive.groupby('userID').size().mean()

In [0]:
ratings_positive.ISBN.nunique()

In [0]:
ratings_filtered.ISBN.nunique()

The expectedly relatively low proportion of total books that the average reader has review positively means I should brace for rather low scores. The similarity matrices consider 844 books and the ratings only have 818 positively reviewed books and the average user has read and reviewd 20 of those popularly.

In [0]:
ratings_filtered.userID.nunique()

We are still evaluating on 300 users too. This is a large enough sample size to be useful.

In [0]:
import numpy as np

def recommend_indices(book_index, similarity_matrix, df, top_n=5):
    sim_scores = list(enumerate(similarity_matrix[book_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in sim_scores[1:top_n+1]]
    return top_indices

### Baseline

In [0]:
def recommend_popular_book_indices(books_df, ratings_df, top_n=5):
    """ recommend the book with simply the greatest number of reviews"""
    book_counts = ratings_df.groupby('ISBN')['bookRating'].count().reset_index()
    book_counts.columns = ['ISBN', 'count']
    top_books = books_df.merge(book_counts, on='ISBN')
    top_books = top_books.sort_values(by='count', ascending=False)
    top_books = top_books.head(top_n)
    # return indices
    return top_books.index.tolist()
    
recommend_popular_book_indices(top_books, ratings_filtered)
    

In [0]:
top_books.iloc[recommend_popular_book_indices(top_books, ratings_filtered)]

These seem to be popular books.

In [0]:
top_n = 5
hits = 0
total = 0
precision_total = 0
recall_total = 0
users_less_than_6_pos_ratings = 0

recommended_book_indices = recommend_popular_book_indices(top_books, ratings_filtered, top_n) 

for user, group in tqdm(ratings_filtered.groupby('userID')):
    user_books = group[group['bookRating'] >= 5]['ISBN'].tolist()
    if len(user_books) < 6:
        users_less_than_6_pos_ratings += 1
        continue

    for seed_book in user_books:
        other_liked_books = [b for b in user_books if b != seed_book]
        if not other_liked_books:
            continue

        # get ISBN of recommended books
        recommended_books = top_books.loc[recommended_book_indices, 'ISBN'].tolist()
        # True positives
        relevant_recs = set(recommended_books).intersection(set(other_liked_books))
        hits += int(len(relevant_recs) > 0)
        precision_total += len(relevant_recs) / top_n
        recall_total += len(relevant_recs) / len(other_liked_books)
        total += 1

# Metrics
print(f"Hit Rate@{top_n}: {hits / total:.4f}")
print(f"Precision@{top_n}: {precision_total / total:.4f}")
print(f"Recall@{top_n}: {recall_total / total:.4f}")

hit_rate_baseline = hits / total
precision_baseline = precision_total / total
recall_baseline = recall_total / total

After realising how I would calculate the metrics I noticed there could still be users without enough, 6 positive ratings there turned out to be 49 of the 338 I was using in the sample.

In [0]:
print(f"Users still in ratings with less than 6 positive ratigns: {users_less_than_6_pos_ratings}")

### First model

In [0]:
top_n = 5
hits = 0
total = 0
precision_total = 0
recall_total = 0

for user, group in tqdm(ratings_filtered.groupby('userID')):
    user_books = group[group['bookRating'] >= 5]['ISBN'].tolist()
    if len(user_books) < 6:
        continue

    for seed_book in user_books:
        other_liked_books = [b for b in user_books if b != seed_book]
        if not other_liked_books:
            continue
        
        book_index = top_books[top_books['ISBN'] == seed_book].index[0]
        recommended_books = recommend_indices(book_index, similarity_matrix, top_books, top_n)
        # get ISBN of recommended books
        recommended_books = top_books.loc[recommended_books, 'ISBN'].tolist()
        # True positives
        relevant_recs = set(recommended_books).intersection(set(other_liked_books))
        hits += int(len(relevant_recs) > 0)
        precision_total += len(relevant_recs) / top_n
        recall_total += len(relevant_recs) / len(other_liked_books)
        total += 1

# Metrics
print(f"Hit Rate@{top_n}: {hits / total:.4f}")
print(f"Precision@{top_n}: {precision_total / total:.4f}")
print(f"Recall@{top_n}: {recall_total / total:.4f}")

hit_rate_base_features = hits / total
precision_base_features = precision_total / total
recall_base_features = recall_total / total

Phew that's a relief my model is better than the baseline model just.

### Now with title

In [0]:
top_n = 5
hits = 0
total = 0
precision_total = 0
recall_total = 0

for user, group in tqdm(ratings_filtered.groupby('userID')):
    user_books = group[group['bookRating'] >= 5]['ISBN'].tolist()
    if len(user_books) < 6:
        continue

    for seed_book in user_books:
        other_liked_books = [b for b in user_books if b != seed_book]
        if not other_liked_books:
            continue
        
        book_index = top_books[top_books['ISBN'] == seed_book].index[0]
        recommended_books = recommend_indices(book_index, similarity_matrix_with_title, top_books, top_n)
        # get ISBN of recommended books
        recommended_books = top_books.loc[recommended_books, 'ISBN'].tolist()
        # True positives
        relevant_recs = set(recommended_books).intersection(set(other_liked_books))
        hits += int(len(relevant_recs) > 0)
        precision_total += len(relevant_recs) / top_n
        recall_total += len(relevant_recs) / len(other_liked_books)
        total += 1

# Metrics
print(f"Hit Rate@{top_n}: {hits / total:.4f}")
print(f"Precision@{top_n}: {precision_total / total:.4f}")
print(f"Recall@{top_n}: {recall_total / total:.4f}")

hit_rate_with_title = hits / total
precision_with_title = precision_total / total
recall_with_title = recall_total / total

Including the title does marginally improve the performance on both precision and recall.

### Genre, description ... model

In [0]:
def recommend_indices_intepretable(book_index, similarity_matrices, weights, df, top_n=5):
    feature_scores = {}
    
    # Extract similarity for the target book for each feature
    for feature, sim_matrix in similarity_matrices.items():
        feature_scores[feature] = sim_matrix[book_index]

    # Compute final weighted similarity score
    final_similarity = np.zeros_like(next(iter(feature_scores.values())))
    for feature in feature_scores:
        final_similarity += weights[feature] * feature_scores[feature]

    # Enumerate and rank by final score
    sim_scores = list(enumerate(final_similarity))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, _ in sim_scores if i != book_index][:top_n]
    return top_indices

In [0]:
top_n = 5
hits = 0
total = 0
precision_total = 0
recall_total = 0

for user, group in tqdm(ratings_filtered.groupby('userID')):
    user_books = group[group['bookRating'] >= 5]['ISBN'].tolist()
    
    if len(user_books) < 6:
        continue

    for seed_book in user_books:
        other_liked_books = [b for b in user_books if b != seed_book]
        if not other_liked_books:
            continue
        
        book_index = top_books[top_books['ISBN'] == seed_book].index[0]
        recommended_books = recommend_indices_intepretable(book_index, similarity_matrices_extra, similarity_weights_extra, top_books, 10)
        # get ISBN of recommended books
        recommended_books = top_books.loc[recommended_books, 'ISBN'].tolist()
        # True positives
        relevant_recs = set(recommended_books).intersection(set(other_liked_books))
        hits += int(len(relevant_recs) > 0)
        precision_total += len(relevant_recs) / top_n
        recall_total += len(relevant_recs) / len(other_liked_books)
        total += 1

# Metrics
print(f"Hit Rate@{top_n}: {hits / total:.4f}")
print(f"Precision@{top_n}: {precision_total / total:.4f}")
print(f"Recall@{top_n}: {recall_total / total:.4f}")

hit_rate_google_api = hits / total
precision_google_api = precision_total / total
recall_google_api = recall_total / total

This yields the largest increase in performance so far and has a hit rate of over 0.5

In [0]:
import matplotlib.pyplot as plt
import numpy as np

models = ['Popularity model', 'Base features', 'Base features + title', 'With Google API features']

hit_rate = [hit_rate_baseline, hit_rate_base_features, hit_rate_with_title, hit_rate_google_api]
precision = [precision_baseline, precision_base_features, precision_with_title, precision_google_api]
recall = [recall_baseline, recall_base_features, recall_with_title, recall_google_api]

fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)

axes[0].bar(models, hit_rate, color='skyblue')
axes[0].set_title('Hit Rate')
axes[0].set_ylim(0, 1)
axes[0].set_ylabel('Score')
axes[0].set_xticklabels(models, rotation=45, ha='right')

axes[1].bar(models, precision, color='lightgreen')
axes[1].set_title('Precision')
axes[1].set_ylim(0, 1)
axes[1].set_xticklabels(models, rotation=45, ha='right')

axes[2].bar(models, recall, color='salmon')
axes[2].set_title('Recall')
axes[2].set_ylim(0, 1)
axes[2].set_xticklabels(models, rotation=45, ha='right')

fig.suptitle('Model Performance Comparison', fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

Using the features from the google API including genre and description, I am able to marginally improve the result. I expect learning the weights of the different factors would improve the results. 

## Visualising the recommendation 

In [0]:
book_index = 4
print(f"Book : {top_books.iloc[book_index]}")
recommendations_example = recommend_books_interpretable(book_index, similarity_matrices_extra, similarity_weights_extra, top_books, 10)

In [0]:
import numpy as np
import matplotlib.pyplot as plt


similarity_columns = ['sim_title', 'sim_author', 'sim_publisher', 'sim_age', 'sim_description', 'sim_category', 'sim_pages']
data = recommendations_example.iloc[0][similarity_columns]
labels = similarity_columns
values = data.values.flatten().tolist()
values += values[:1]

num_vars = len(labels)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
ax.plot(angles, values, color='red', linewidth=2)
ax.fill(angles, values, color='red', alpha=0.25)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(labels, fontsize=10)
ax.set_ylim(0, 1)

plt.title('Similarity Scores Radar Chart', size=16, y=1.1)

plt.show()


In [0]:
import numpy as np
import matplotlib.pyplot as plt

weights = {
    'sim_title': 0.15,
    'sim_author': 0.05,
    'sim_publisher': 0,
    'sim_age': 0.1,
    'sim_description': 0.5,
    'sim_category': 0.1,
    'sim_pages': 0.1
}

similarity_columns = ['sim_title', 'sim_author', 'sim_publisher', 'sim_age', 'sim_description', 'sim_category', 'sim_pages']
columns = [col for col in similarity_columns if weights[col] > 0]
raw_scores = recommendations_example.iloc[0][columns]
weighted_components = raw_scores * np.array([weights[col] for col in columns])
max_val = max(weighted_components.max(), 1e-6)
normalized_components = weighted_components / max_val
values = normalized_components.tolist()
values += values[:1]
num_vars = len(columns)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]
book_title = recommendations_example.iloc[0]['bookTitle']

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
ax.plot(angles, values, color='purple', linewidth=2, label=book_title)
ax.fill(angles, values, color='purple', alpha=0.25)
ax.set_xticks(angles[:-1])
ax.set_xticklabels(columns, fontsize=10)
ax.set_ylim(0, 1)
ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.1))
plt.title('Weighted Similarity Components', size=16, y=1.1)
plt.show()


In [0]:
import numpy as np
import matplotlib.pyplot as plt

weights = {
    'sim_title': 0.15,
    'sim_author': 0.05,
    'sim_publisher': 0,
    'sim_age': 0.1,
    'sim_description': 0.5,
    'sim_category': 0.1,
    'sim_pages': 0.1
}

similarity_columns = ['sim_title', 'sim_author', 'sim_publisher', 'sim_age', 'sim_description', 'sim_category', 'sim_pages']
columns = [col for col in similarity_columns if weights[col] > 0]


num_vars = len(columns)
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
angles += angles[:1]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))

for idx in range(5):
    raw_scores = recommendations_example.iloc[idx][columns]
    weighted_components = raw_scores * np.array([weights[col] for col in columns])
    max_val = max(weighted_components.max(), 1e-6)
    normalized_components = weighted_components / max_val
    
    values = normalized_components.tolist()
    values += values[:1]

    book_title = recommendations_example.iloc[idx]['bookTitle']
    
    ax.plot(angles, values, linewidth=2, label=book_title)
    ax.fill(angles, values, alpha=0.1)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(columns, fontsize=10)
ax.set_ylim(0, 1)

ax.legend(loc='center left', bbox_to_anchor=(1.2, 1.05), fontsize=9)

plt.title('Top 5 Weighted Similarity Components', size=16, y=1.1)
plt.show()


It would also be nice to have some variety measure maybe. So that all the recommendations aren't the same in the same way?

### Now all in one with the recommendations and visualisation

In [0]:
import numpy as np
import matplotlib.pyplot as plt

def recommend_and_plot_radar(book_index, similarity_matrices_extra, similarity_weights_extra, top_books_extra_no_null, top_n=5):
    recommendations_example = recommend_books_interpretable(
        book_index,
        similarity_matrices_extra,
        similarity_weights_extra,
        top_books_extra_no_null,
        top_n
    )

    print(recommendations_example[['bookTitle', 'Score']])
    
    similarity_columns = list(similarity_weights_extra.keys())
    columns = [col for col in similarity_columns if similarity_weights_extra[col] > 0]
    columns = ['sim_' + col for col in columns]
    
    num_vars = len(columns)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]
    
    fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
    
    for idx in range(min(top_n, len(recommendations_example))):
        raw_scores = recommendations_example.iloc[idx][columns]

        weighted_components = raw_scores * np.array([similarity_weights_extra[col.split('_')[-1]] for col in columns])

        max_val = max(weighted_components.max(), 1e-6)
        normalized_components = weighted_components / max_val
        
        values = normalized_components.tolist()
        values += values[:1]
        
        book_title = recommendations_example.iloc[idx]['bookTitle']
        
        ax.plot(angles, values, linewidth=2, label=book_title)
        ax.fill(angles, values, alpha=0.1)
    
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(columns, fontsize=10)
    ax.set_ylim(0, 1)

    ax.legend(loc='center left', bbox_to_anchor=(1.2, 1.05), fontsize=9)
    
    plt.title(f"Top {top_n} Books' Weighted Similarity Components", size=16, y=1.1)
    plt.show()
    
    return recommendations_example


In [0]:
similarity_weights_extra

In [0]:
top_books.iloc[4]

In [0]:
recommended_books = recommend_and_plot_radar(
    4,
    similarity_matrices_extra,
    similarity_weights_extra,
    top_books,
    top_n=5
)

I recommend books using the default similarity measures I initially defined. I decide I care most about the description and less about author, publisher and age and get new recommendations.

In [0]:
weights = {
    'title': 0.15,
    'author': 0.05,
    'publisher': 0.05,
    'age': 0.05,
    'description': 0.5,
    'category': 0.1,
    'pages': 0.1
}

recommended_books = recommend_and_plot_radar(
    4,
    similarity_matrices_extra,
    weights,
    top_books,
    top_n=5
)


## Lets give a Collaborative Filtering approach a go

I will go for an item-based over user-based collaborative filtering approach  This is best since I am assuming no user history only that they read this one book and would like some similar recommendations and it would be difficult to find similar users based on just one book. There will likely be many.

In [0]:
# I will filter down for just explicit ratings for now
ratings_filtered_explicit = ratings_df[ratings_df['bookRating'] > 0]


In [0]:
import pandas as pd
import numpy as np

data = {
    'user_id': [1, 1, 2, 2, 3, 3],
    'item_id': ['A', 'B', 'A', 'C', 'B', 'C'],
    'rating': [5, 3, 4, 2, 2, 5]
}

user_item_matrix = ratings_filtered_explicit.pivot_table(index='userID', columns='ISBN', values='bookRating')


In [0]:
# lets go for cosine similarity again
from sklearn.metrics.pairwise import cosine_similarity
user_item_matrix = user_item_matrix.T.fillna(0)
similarity_matrix_item_cf = cosine_similarity(user_item_matrix)

In [0]:
cf_item_similarity_df = pd.DataFrame(similarity_matrix_item_cf, 
                                   index=user_item_matrix.index, 
                                   columns=user_item_matrix.index)

In [0]:
def recommend_similar_books(liked_book, similarity_matrix, books_df, top_n=5):
    if liked_book not in similarity_matrix.index:
        raise ValueError(f"Book '{liked_book}' not found in similarity matrix.")
    similar_books = similarity_matrix.loc[liked_book].drop(labels=[liked_book])
    top_recommendations = similar_books.sort_values(ascending=False).head(top_n)
    top_recommendations = books_df[books_df.ISBN.isin(top_recommendations.index)]
    return top_recommendations

def recommend_similar_books_isbn(liked_book, similarity_matrix, books_df, top_n=5):
    if liked_book not in similarity_matrix.index:
        raise ValueError(f"Book '{liked_book}' not found in similarity matrix.")
    similar_books = similarity_matrix.loc[liked_book].drop(labels=[liked_book])
    top_recommendations = similar_books.sort_values(ascending=False).head(top_n)
    return top_recommendations.index


In [0]:
book_index = 4
book_isbn = top_books.iloc[book_index]['ISBN']
print(f"Recommendations for: {top_books.iloc[book_index]}")
recommend_similar_books(book_isbn, cf_item_similarity_df, top_books, top_n=5)

In [0]:
book_index = 5
book_isbn = top_books.iloc[book_index]['ISBN']
print(f"Recommendations for: {top_books.iloc[book_index]}")
recommend_similar_books(book_isbn, cf_item_similarity_df, top_books, top_n=5)

## Quick Evaluation of user-based CF on its own

Not I will use the same method of evaluation I designed and used for the content based recommender - however this is likely at risk of some data leakage as the ratings was used to construct the similarity matrix. And these results might be inflated slightly!

In [0]:
top_n = 5
hits = 0
total = 0
precision_total = 0
recall_total = 0

for user, group in tqdm(ratings_filtered.groupby('userID')):
    user_books = group[group['bookRating'] >= 5]['ISBN'].tolist()
    
    if len(user_books) < 6:
        continue

    for seed_book in user_books:
        other_liked_books = [b for b in user_books if b != seed_book]
        if not other_liked_books:
            continue
        
        book_index = top_books[top_books['ISBN'] == seed_book].index[0]
        recommended_books = recommend_similar_books_isbn(seed_book, cf_item_similarity_df, top_books, top_n=top_n)
        # True positives
        relevant_recs = set(recommended_books).intersection(set(other_liked_books))
        hits += int(len(relevant_recs) > 0)
        precision_total += len(relevant_recs) / top_n
        recall_total += len(relevant_recs) / len(other_liked_books)
        total += 1

# Metrics
print(f"Hit Rate@{top_n}: {hits / total:.4f}")
print(f"Precision@{top_n}: {precision_total / total:.4f}")
print(f"Recall@{top_n}: {recall_total / total:.4f}")

hit_rate_google_api = hits / total
precision_google_api = precision_total / total
recall_google_api = recall_total / total

Well it does seem like this model is... well there's no possibility to beat around the bush here, much better than the content based one I built - I could claim that this evaluation unfairly benefits the CF model. I would like to further investigate if adding some semantic ability to my content based one would increase the performance a bit but maybe what I can do is try and combine them. I would also try and find a fairer evaluation method to compare both - maybe for now I could investigate calculating the similarity matrix in each iteration without the given users data as for rarer books this could be quite influential.

Another caveat is that I have sculpted the dataset to be more in the domain of a Collaborative Filtering model and that the content based model is more useful for these item cold start cases where we aren't dealing with popular books with lots of user data! So this evaluation is very rather harsh and a qualitative evaluation and user testing could prove that the content-based model does perfom okay. 

Diversity of recommendations has also not been considered as an option too! Sometimes a book reader wouldn't mind a bit of a wild card suggestion or at least variety of options.

# Combining collaborative filtering and content-based models

Since the very simple collaborative filtering outperformed my content-based model. I will combine them so that collaborative filtering gives the recommendations say 100 and the content-based will rerank them and be used to explain in a way why they are relevant with the caveat that they were recommended because other readers that liked this book liked these other books too.

In [0]:
def recommend_indices_interpretable_subset(
    book_index, 
    candidate_indices, 
    similarity_matrices, 
    weights, 
    top_n=5
):
    """
    Re-ranks a list of candidate book indices using weighted content-based similarity
    to a given book_index.

    Args:
        book_index (int): Index of the seed (liked) book.
        candidate_indices (list[int]): List of book indices to re-rank.
        similarity_matrices (dict): Dictionary of similarity matrices per feature.
        weights (dict): Corresponding weights for each similarity matrix.
        top_n (int): Number of top recommendations to return.

    Returns:
        list[int]: Top-n indices from candidate_indices, re-ranked by weighted similarity.
    """
    feature_scores = {
        feature: sim_matrix[book_index] for feature, sim_matrix in similarity_matrices.items()
    }

    final_scores = {}
    for idx in candidate_indices:
        score = sum(weights[feature] * feature_scores[feature][idx] for feature in similarity_matrices)
        final_scores[idx] = score

    # Sort by score and return top_n indices
    sorted_scores = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
    top_indices = [idx for idx, _ in sorted_scores[:top_n]]
    return top_indices


In [0]:
cf_top_n = 50
top_n = 5
hits = 0
total = 0
precision_total = 0
recall_total = 0


# use these weights for content based - same as for previous recommendation
content_based_weights = {'title': 0.1, 'author': 0.2, 'publisher': 0.1, 'age': 0.05, 'description': 0.2, 'category': 0.2, 'pages': 0.05}

# original similarity matrix too
content_based_similarity_matrices = similarity_matrices_extra

for user, group in tqdm(ratings_filtered.groupby('userID')):
    user_books = group[group['bookRating'] >= 5]['ISBN'].tolist()
    if len(user_books) < 2:
        continue

    for seed_book in user_books:
        other_liked_books = [b for b in user_books if b != seed_book]
        if not other_liked_books:
            continue
        
        book_index = top_books[top_books['ISBN'] == seed_book].index[0]
        recommended_books_cf = recommend_similar_books_isbn(seed_book, cf_item_similarity_df, top_books, top_n=cf_top_n)
        cf_candidate_indices = [top_books[top_books['ISBN'] == isbn].index[0] for isbn in recommended_books_cf if isbn in top_books['ISBN'].values]
        re_ranked_content_based_indices = recommend_indices_interpretable_subset(
            book_index, 
            cf_candidate_indices, 
            content_based_similarity_matrices, 
            content_based_weights, 
            top_n=top_n
        )
        recommended_books = top_books.iloc[re_ranked_content_based_indices]['ISBN'].tolist()
        # True positives
        relevant_recs = set(recommended_books).intersection(set(other_liked_books))
        hits += int(len(relevant_recs) > 0)
        precision_total += len(relevant_recs) / top_n
        recall_total += len(relevant_recs) / len(other_liked_books)
        total += 1

# Metrics
print(f"Hit Rate@{top_n}: {hits / total:.4f}")
print(f"Precision@{top_n}: {precision_total / total:.4f}")
print(f"Recall@{top_n}: {recall_total / total:.4f}")

hit_rate_google_api = hits / total
precision_google_api = precision_total / total
recall_google_api = recall_total / total

Combining the collaborative filtering and content based models in this way does not improve the metrics but it does improve upon just using the content based one on its own. I still think the content based model can be valuable for the recommender system as it can give the user some explanations and details about why it is relevant and allow them to feedback into it. It may also be better in the cold start situation where the book is not so popular.