In [1]:
import pandas as pd
# ================== Preprocessing ====================
# Set the name of the file
rating_filename = 'Ratings.csv'
# user_filename = 'Users.csv'
#book_filename = 'Books.csv'

ratings_df = pd.read_csv(rating_filename)

print(ratings_df.head())

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [2]:
print("Number of unique users: ", len(ratings_df["User-ID"].unique()))
print("Number of unique books: ", len(ratings_df["User-ID"].unique()))
print("Shape of dataframe: ",ratings_df.shape)

Number of unique users:  105283
Number of unique books:  105283
Shape of dataframe:  (1149780, 3)


In [3]:
# Drop book ratings with 0 because it is already implied and not every book has a rating so it is unnecessary
ratings_df = ratings_df[ratings_df["Book-Rating"] != 0]
ratings_df.shape

(433671, 3)

In [4]:
### Using RMSE to see insights
# Find the rater with the highest number of ratings
ratings_df['User-ID'].value_counts()

User-ID
11676     8524
98391     5802
153662    1969
189835    1906
23902     1395
          ... 
114079       1
114081       1
114096       1
114115       1
276723       1
Name: count, Length: 77805, dtype: int64

In [5]:
# For testing purposes, we will drop half of the highest raters ratings
# We cannot simply cut the data in half since the data is ordered so if we do that then all the users in testing set will be new
# which will be difficult to measure using our recommendation system

# Highest number of ratings person is User-ID 11676
# We will drop half of his ratings for testing

test_user = 11676

test_df = ratings_df[ratings_df['User-ID'] == test_user]

# Create testing set
test_set = test_df[len(test_df)//2:]
print("Number of ratings from test rater before dropping: ",len(ratings_df[ratings_df['User-ID'] == test_user]))

# This is our training set
ratings_df = pd.merge(ratings_df,test_set, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)

Number of ratings from test rater before dropping:  8524


In [6]:
print("Number of ratings from test rater after dropping: ",len(ratings_df[ratings_df['User-ID'] == test_user]))

Number of ratings from test rater after dropping:  4262


In [7]:
# Take the aggregate ratings (worst case scenario if we are recommending for new users)
avg_ratings = ratings_df.groupby('ISBN').agg(avg_rating = ('Book-Rating', 'mean'),
                                                number_of_ratings = ('Book-Rating', 'count')).reset_index()

# Keep the books with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]

# Sort average ratings in descending order
sorted_avg_ratings = avg_ratings100.sort_values(by='avg_rating', ascending=False)

In [8]:
# Check popular books sorted by average rating
sorted_avg_ratings.head()

Unnamed: 0,ISBN,avg_rating,number_of_ratings
52761,0439139597,9.262774,137
52741,0439136369,9.082707,133
52740,0439136350,9.035461,141
53142,043935806X,9.033981,206
77940,0590353403,8.983193,119


In [9]:
# Store all the average ratings with an average rating above a threshold as popular
popular_sorted_average_ratings = sorted_avg_ratings[sorted_avg_ratings['avg_rating']>8]

In [10]:
# Generates the cosine similarity matrix
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Assuming unique_books and unique_users are precomputed
user_ids = ratings_df['User-ID'].unique()
book_ids = ratings_df['ISBN'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}

# Create a COO sparse matrix
data = []
row_indices = []
col_indices = []

for index, row in ratings_df.iterrows():
    user_index = user_id_to_index.get(row['User-ID'])
    book_index = book_id_to_index.get(row['ISBN'])

    if user_index is not None and book_index is not None:
        data.append(row['Book-Rating'])
        row_indices.append(user_index)
        col_indices.append(book_index)

user_book_coo = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(book_ids)))

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(user_book_coo, dense_output=False)

In [11]:
# Map user-id to cosine similarity matrix index
unique_users = ratings_df["User-ID"].unique()
user_map = {k: v for v, k in enumerate(unique_users)}

# Generate unique books
unique_books = set(ratings_df["ISBN"].unique())

In [12]:
# Given a list of books and a user, generate a dictionary of predicted ratings for unrated books
def generate_predicted_ratings(index, books):
    # Threshold
    threshold = 0.05
    
    # Average Nearest-Neighbors calculation
    # Dictionary to store ratings
    predicted_ratings = dict()
    for i in range(cosine_sim_matrix.shape[0]):
        # If cosine similarity is above a certain threshold, add it to weighted average
        if (cosine_sim_matrix[(index, i)] > threshold):
            # Gets the User-ID from the index
            user2 = unique_users[i]
            user2_df = ratings_df[ratings_df['User-ID'] == user2]
            for book in books:
                if book in user2_df["ISBN"].values:
                    tmp = predicted_ratings.get(book,[0,0])
                    tmp[0] += user2_df[user2_df["ISBN"] == book]["Book-Rating"].values[0]
                    tmp[1] += 1
                    predicted_ratings[book] = tmp
    # Calculate the average ratings of neighbors by dividing total by total count
    for key in predicted_ratings:
        predicted_ratings[key] = predicted_ratings.get(key)[0] // predicted_ratings.get(key)[1]

    return predicted_ratings

In [13]:
# Returns top 10 book recommendations
def recommend(user):
    # Convert user-id to matrix index
    index = user_map.get(user)

    # Find the unrated books by specific user
    rated_books = ratings_df[ratings_df['User-ID'] == user]["ISBN"]

    # If user does not have enough ratings, do not use this algorithm (use most popular)
    if len(rated_books) < 5:
        return []
    
    unrated_books = unique_books - set(rated_books)

    # Calculate predicted ratings using average nearest neighbors
    predicted_ratings = generate_predicted_ratings(index,unrated_books)

    # Get the top 10 recommended books
    top_10 = sorted(predicted_ratings.items(), key=lambda x:-x[1])[:10]

    # Store the result in a 
    result = []
    for x in top_10:
        result.append(x[0])
    return result


In [14]:
# Formts top 10 recommended for user
def format_top10(top_10_ISBN):
    top_10 = avg_ratings[avg_ratings["ISBN"].isin(top_10_ISBN)]
    print("Recommended books for user with user-id " + str(USER) + ": ")
    print(top_10)

In [15]:
# When users are new, display popular items until user has made enough ratings
def display_popular():
    # Store all the average ratings with an average rating above a threshold as popular
    random_top_ten = popular_sorted_average_ratings.sample(n = 10)
    sorted_top_ten = random_top_ten.sort_values(by = "avg_rating", ascending = False)
    print(sorted_top_ten)

In [16]:
# Calculate RMSE using previously made testing set
import math
import numpy as np

test_books = set(test_set["ISBN"])
test_index = user_map.get(test_user)
test_ratings = dict(zip(test_set["ISBN"], test_set["Book-Rating"]))
train_ratings = generate_predicted_ratings(test_index,test_books)

# Find common keys
common_keys = set(test_ratings.keys()) & set(train_ratings.keys())

# Create arrays for values of common keys in each dictionary
test_arr = [test_ratings[key] for key in common_keys] # Actual
train_arr = [train_ratings[key] for key in common_keys] # Predicted

# Calculate RMSE
MSE = np.square(np.subtract(test_arr,train_arr)).mean()
RMSE = math.sqrt(MSE)
print("Root Mean Square Error: ",RMSE)

Root Mean Square Error:  2.496462143178469


In [17]:
#### MAIN PROGRAM - CHANGE THE USER-ID HERE
# Select a user to recommend books - change this variable
# User with more than 5 books rated
USER = 276704
# User with less than 5 books rated
# USER = 276723

# Note that this algorithm may take a while to run
top_10_ISBN = recommend(USER)

# Displays the top 10 recommended books for the user 
if len(top_10_ISBN) == 0:
    display_popular()
else:
    format_top10(top_10_ISBN)

Recommended books for user with user-id 276704: 
                ISBN  avg_rating  number_of_ratings
53676     0440159016    9.333333                  3
58695     0446611778    7.333333                 39
60713     0449242773    8.250000                  8
66956     0515087491    8.500000                  6
67072     0515101486    9.600000                  5
67101     0515104175    8.400000                  5
81318     0670030031    7.923077                 13
144060    1593080549    9.000000                  2
144063    1593081017   10.000000                  1
167558  780451524201   10.000000                  1
