# Book Recommendation System

### Import all necessary libraries here

In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader

### Load data and merge. Preprocess the null values

In [2]:
# Read data
books_data = pd.read_csv('archive/Books.csv')
ratings_data = pd.read_csv('archive/Ratings.csv')
users_data = pd.read_csv('archive/Users.csv')

# Ratings data anchors the left side. We join the books after that. Then join the users data.
ratings_books = pd.merge(ratings_data, books_data, on='ISBN', how='left')
full_data = pd.merge(ratings_books, users_data, on='User-ID', how='left')

# Create the mapping of book ID to the book title
book_id_to_title = dict(zip(books_data["ISBN"], books_data["Book-Title"]))

  books_data = pd.read_csv('archive/Books.csv')


In [3]:
# Upon inspection, there are a lot of null values in the book column which we need to clean out. These represent users that have made a rating. 
# Drop the values where the book information is null. That is critical information. Age has some null values but we can deal with that later.
full_data = full_data.dropna(subset = ['Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

### Set up the data in surprise format and set up SVD model instance

In [4]:
# Define the rating scale (0 to 10)
reader = Reader(rating_scale=(0, 10))

# Load data into Surprise format
surprise_data = Dataset.load_from_df(full_data[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [5]:
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Split data into training and testing sets
trainset, testset = train_test_split(surprise_data, test_size=0.2)

# Train the SVD model
model = SVD()
model.fit(trainset)

# Make predictions on test data
predictions = model.test(testset)

# Evaluate model performance
print("RMSE:", accuracy.rmse(predictions))

RMSE: 3.5068
RMSE: 3.5067800052312053


### Make Personalized Book Recommendations

In [6]:
from collections import defaultdict

def get_top_n_recommendations(predictions, n=10):
    """
    Get top N recommended books for each user based on predictions.

    Parameters:
        predictions (list): List of predictions from a Surprise model.
        n (int): Number of top recommendations to return.

    Returns:
        dict: Dictionary mapping user IDs to a list of (book title, predicted rating).
    """
    top_n = defaultdict(list)

    # Group predictions by user and store (item_id, estimated rating)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Sort each user's predictions and keep top N
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = sorted(user_ratings, key=lambda x: x[1], reverse=True)[:n]

    # Convert book IDs to book titles
    for uid in top_n:
        top_n[uid] = [(book_id_to_title.get(iid, "Unknown Book"), rating) for iid, rating in top_n[uid]]

    return top_n

top_recommendations = get_top_n_recommendations(predictions, n=5)

### Implement Content-Based Filtering with Nearest Neighbors with book metadata.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer # TfIdf instance
from sklearn.metrics.pairwise import cosine_similarity # using content metadata to teach us about ratings

# Deal with any null values in the the "Author" and "Publisher" fields
books_data['Book-Author'] = books_data['Book-Author'].fillna('Unknown')
books_data['Publisher'] = books_data['Publisher'].fillna('Unknown')

# Append book metadata (authors + publishers) into one column
books_data['metadata'] = books_data['Book-Author'] + " " + books_data['Publisher']

# Convert the metadata into the TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(books_data["metadata"])

# Here I would normally use cosine similarity to calculate tfidf_matrix x tfidf_matrix but the operation is too 
# expensive. I've chosen to use a K-Nearest-Neighbors route to limit the amount of operations the kernel has to do.
from sklearn.neighbors import NearestNeighbors

# Use NearestNeighbors to find the most similar books
knn = NearestNeighbors(n_neighbors=11, metric='cosine', n_jobs=-1)  # n_jobs=-1 uses all CPUs
knn.fit(tfidf_matrix)

# Find the nearest neighbors for each book (book_id=0, book_id=1, etc.)
distances, indices = knn.kneighbors(tfidf_matrix)

# indices now contains the most similar books for each book in the dataset

# Create a DataFrame with the indices and distances
neighbors_df = pd.DataFrame(indices, columns=[f'Neighbor_{i+1}' for i in range(indices.shape[1])])
# Add distances as additional columns in the DataFrame
distances_df = pd.DataFrame(distances, columns=[f'Distance_{i+1}' for i in range(distances.shape[1])])

# Concatenate both DataFrames
knn_df = pd.concat([neighbors_df, distances_df], axis=1)

# If you want to add the book titles (assuming you have a 'books_data' DataFrame with book titles)
knn_df['Book ID'] = books_data['ISBN'].values  # Assuming you have a 'book_id' column
knn_df['Book Title'] = books_data['Book-Title'].values

# Show the DataFrame with book IDs, book titles, and their nearest neighbors
knn_df.to_csv('knn_df.csv')

   Neighbor_1  Neighbor_2  Neighbor_3  Neighbor_4  Neighbor_5  Neighbor_6  \
0           0      111977      193923       95231      107299      159200   
1           1       69226      130669      187357       48704       73234   
2           2       61417      117824       78248      120167      187435   
3           3      117371      184279       75775       45833       70324   
4           4       54649      149697      214660      266528      214662   

   Neighbor_7  Neighbor_8  Neighbor_9  Neighbor_10  ...  Distance_4  \
0      190381       57665      118542       113385  ...    0.347922   
1       72119       69955      140651       268831  ...    0.373861   
2      236521      192655      115309       143565  ...    0.545645   
3      187644      216926      173735        70014  ...    0.393995   
4      266527       49069      126385       118655  ...    0.426277   

   Distance_5  Distance_6  Distance_7  Distance_8  Distance_9  Distance_10  \
0    0.446818    0.460511    0.4

In [21]:
def hybrid_recommendations(user_id, top_n=5):
    """
    Generate hybrid recommendations for a user using SVD and content-based filtering.
    
    1. Get top-N predictions from SVD.
    2. If fewer than N books exist, use content-based filtering to fill the gap.
    """
    svd_recommendations = top_recommendations.get(user_id, []) # Get SVD-based recommendations
    
    if len(svd_recommendations) >= top_n:
        return svd_recommendations[:top_n] # Use SVD only if we have enough data (top_n)
    
    # Otherwise, use content-based recommendations to fill the gap (SVD + TF-IDF)
    recommended_books = set(book_id for book_id, _ in svd_recommendations) # Avoid duplicates
    additional_recommendations = []
    
    for book_title, _ in svd_recommendations:
        book_id = books_data[books_data['Book-Title'] == book_title]["ISBN"].values[0]
        similar_books = knn_df['Book ID'].index.tolist()
        
        # Add similar books that weren't already recommended
        for similar_book in similar_books:
            if similar_book not in recommended_books and len(additional_recommendations) < (top_n - len(svd_recommendations)):
                #additional_recommendations.append((book_id_to_title[similar_book], 0)) # content-based scores aren't numerical
                additional_recommendations.append((book_title,0))
        final_recommendations = svd_recommendations + additional_recommendations
        
        return final_recommendations[:top_n]

### Print the recommendations for a specific user

In [29]:
# Input the user ID and how many books you want recommended.
hybrid_recommendations(198711, 5)

[('Love You Forever', 2.5157304450421107),
 ('Good in Bed', 2.0630823296815324),
 ("Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series (Paper))",
  1.8851332733481787),
 ("The Berenstain Bears' New Baby (Pictureback Series)", 1.66732975848313),
 ('The Phantom Tollbooth', 1.5545600679413258)]