In [27]:
import pandas as pd
# ================== Preprocessing ====================
# Set the name of the file
rating_filename = 'Ratings.csv'
# user_filename = 'Users.csv'
#book_filename = 'Books.csv'

ratings_df = pd.read_csv(rating_filename)

print(ratings_df.head())

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [28]:
print("Number of unique users: ", len(ratings_df["User-ID"].unique()))
print("Number of unique books: ", len(ratings_df["User-ID"].unique()))
print("Shape of dataframe: ",ratings_df.shape)

Number of unique users:  105283
Number of unique books:  105283
Shape of dataframe:  (1149780, 3)


In [29]:
# Drop book ratings with 0 because it is already implied and not every book has a rating so it is unnecessary
ratings_df = ratings_df[ratings_df["Book-Rating"] != 0]
ratings_df.shape

(433671, 3)

In [30]:
# print(ratings_df.mean())
# print(ratings_df.max())

In [31]:
# Take the aggregate ratings (worst case scenario if we are recommending for new users)
avg_ratings = ratings_df.groupby('ISBN').agg(avg_rating = ('Book-Rating', 'mean'),
                                                number_of_ratings = ('Book-Rating', 'count')).reset_index()

# Keep the books with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]

# Sort average ratings in descending order
sorted_avg_ratings = avg_ratings100.sort_values(by='avg_rating', ascending=False)

In [32]:
# Check popular books
sorted_avg_ratings.head()

Unnamed: 0,ISBN,avg_rating,number_of_ratings
52761,0439139597,9.262774,137
52741,0439136369,9.082707,133
52740,0439136350,9.035461,141
53142,043935806X,9.033981,206
77940,0590353403,8.983193,119


In [33]:
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Assuming unique_books and unique_users are precomputed
user_ids = ratings_df['User-ID'].unique()
book_ids = ratings_df['ISBN'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}

# Create a COO sparse matrix
data = []
row_indices = []
col_indices = []

for index, row in ratings_df.iterrows():
    user_index = user_id_to_index.get(row['User-ID'])
    book_index = book_id_to_index.get(row['ISBN'])

    if user_index is not None and book_index is not None:
        data.append(row['Book-Rating'])
        row_indices.append(user_index)
        col_indices.append(book_index)

user_book_coo = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(book_ids)))

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(user_book_coo, dense_output=False)

In [34]:
# Map user-id to cosine similarity matrix index
unique_users = ratings_df["User-ID"].unique()
user_index = {k: v for v, k in enumerate(unique_users)}

In [35]:
unique_books = set(ratings_df["ISBN"].unique())
len(unique_books)

185973

In [36]:
# Store all the average ratings with an average rating above a threshold as popular
popular_sorted_average_ratings = sorted_avg_ratings[sorted_avg_ratings['avg_rating']>8]
random_top_ten = popular_sorted_average_ratings.sample(n = 10)
sorted_top_ten = random_top_ten.sort_values(by = "avg_rating", ascending = False)
print(sorted_top_ten)

              ISBN  avg_rating  number_of_ratings
57050   0446310786    8.943925                214
52612   0439064864    8.920635                126
52613   0439064872    8.783069                189
59594   0449005615    8.483333                120
82347   0671001795    8.258929                112
104319  0767902521    8.207547                106
109650  0804106304    8.195876                194
25764   0316096199    8.195489                133
26378   0316666343    8.185290                707
17136   0156027321    8.030973                226


In [37]:
# # Uses the same algorithm as recommendation system but predicts for specific books, used for insights
# def predict_book_ratings(user, books):
#     # Dictionary to store ratings
#     predicted_ratings = dict()
#     for book in books:
        

In [38]:
# Returns top 10 book recommendations
def recommend(user):
    # Convert user-id to matrix index
    index = user_index.get(user)

    # Find the unrated books by specific user
    rated_books = ratings_df[ratings_df['User-ID'] == user]["ISBN"]
    unrated_books = unique_books - set(rated_books)

    # Average Nearest-Neighbors calculation
    # Dictionary to store ratings
    predicted_ratings = dict()
    for i in range(cosine_sim_matrix.shape[0]):
        # If cosine similarity is above a certain threshold, add it to weighted average
        if (cosine_sim_matrix[(index, i)] > 0.3):
            # Gets the User-ID from the index
            user2 = unique_users[i]
            user2_df = ratings_df[ratings_df['User-ID'] == user2]
            for book in unrated_books:
                if book in user2_df["ISBN"].values:
                    tmp = predicted_ratings.get(book,[0,0])
                    tmp[0] += user2_df[user2_df["ISBN"] == book]["Book-Rating"].values[0]
                    tmp[1] += 1
                    predicted_ratings[book] = tmp

    # Calculate the average ratings of neighbors by dividing total by total count
    for key in predicted_ratings:
        predicted_ratings[key] = predicted_ratings.get(key)[0] // predicted_ratings.get(key)[1]

    # Get the top 10 recommended books
    top_10 = sorted(predicted_ratings.items(), key=lambda x:-x[1])[:10]

    # Store the result in a 
    result = []
    for x in top_10:
        result.append(x[0])
    return result


In [39]:
#### MAIN PROGRAM - CHANGE THE USER-ID HERE
# Select a user to recommend books - change this variable
USER = 276704

top_10_ISBN = recommend(USER)  # Assuming you have a function called 'recommend' that returns the top 10 ISBNs
top_10 = avg_ratings[avg_ratings["ISBN"].isin(top_10_ISBN)]
print("Recommended books for user with user-id " + str(USER) + ": ")
print(top_10)

Recommended books for user with user-id 276704: 
                ISBN  avg_rating  number_of_ratings
48998     039914370X    7.857143                  7
54590     0440226430    7.556075                214
97559     0743211375    7.288136                 59
97870  0743235150030    5.000000                  1
