In [1]:
import pandas as pd
# ================== Preprocessing ====================
# Set the name of the file
rating_filename = 'Ratings.csv'
# user_filename = 'Users.csv'
#book_filename = 'Books.csv'

ratings_df = pd.read_csv(rating_filename)

print(ratings_df.head())

# Rating - User-ID, ISBN, Book-Rating
# Connect via ISBN

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [2]:
print(len(ratings_df["User-ID"].unique()))
print("Number of unique books: ", len(ratings_df["User-ID"].unique()))
print(ratings_df.shape)

105283
Number of unique books:  105283
(1149780, 3)


In [3]:
# Drop book ratings with 0 because it is already implied and not every book has a rating so it is unnecessary
ratings_df = ratings_df[ratings_df["Book-Rating"] != 0]
ratings_df.shape

(433671, 3)

In [None]:
print(ratings_df.mean())
print(ratings_df.max())

In [None]:
# Take the aggregate ratings (worst case scenario if user is new and age is unknown)
avg_ratings = ratings_df.groupby('ISBN').agg(avg_rating = ('Book-Rating', 'mean'),
                                                number_of_ratings = ('Book-Rating', 'count')).reset_index()

# Keep the books with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]

In [None]:
# Check popular books
avg_ratings100.sort_values(by='avg_rating', ascending=False).head()

In [None]:
# Split the data into training and testing data set
test_set, train_set = ratings_df[:len(ratings_df)//2], ratings_df[len(ratings_df)//2:]

In [None]:
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Assuming unique_books and unique_users are precomputed
user_ids = train_set['User-ID'].unique()
book_ids = train_set['ISBN'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}

# Create a COO sparse matrix
data = []
row_indices = []
col_indices = []

for index, row in ratings_df.iterrows():
    user_index = user_id_to_index.get(row['User-ID'])
    book_index = book_id_to_index.get(row['ISBN'])

    if user_index is not None and book_index is not None:
        data.append(row['Book-Rating'])
        row_indices.append(user_index)
        col_indices.append(book_index)

user_book_coo = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(book_ids)))

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(user_book_coo, dense_output=False)

In [None]:
train_set

In [None]:
# Map user-id to cosine similarity matrix index
unique_users = train_set["User-ID"].unique()
user_index = {k: v for v, k in enumerate(unique_users)}

In [None]:
unique_books = set(train_set["ISBN"].unique())
len(unique_books)

In [None]:
#### MAIN PROGRAM - CHANGE THE USER-ID HERE
# Select a user to recommend books
user = 276704

In [None]:
# Convert user-id to matrix index
index = user_index.get(user)
print(index)

In [None]:
# Find the unrated books by specific user
rated_books = train_set[train_set['User-ID'] == user]["ISBN"]
unrated_books = unique_books - set(rated_books)
len(unrated_books)

In [None]:
# Average Nearest-Neighbors calculation (WIP)
count = 0
avg = 0
for i in range(cosine_sim_matrix.shape[0]):
    # If cosine similarity is above a certain threshold, add it to weighted average
    if (cosine_sim_matrix[(index, i)] > 0.1):
        avg += cosine_sim_matrix[(index, i)]
        count += 1
print(count)