In [59]:
import pandas as pd
# ================== Preprocessing ====================
# Set the name of the file
rating_filename = 'Ratings.csv'
# user_filename = 'Users.csv'
#book_filename = 'Books.csv'

ratings_df = pd.read_csv(rating_filename)

print(ratings_df.head())

# Rating - User-ID, ISBN, Book-Rating
# Connect via ISBN

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [60]:
print(len(ratings_df["User-ID"].unique()))
print("Number of unique books: ", len(ratings_df["User-ID"].unique()))
print(ratings_df.shape)

105283
Number of unique books:  105283
(1149780, 3)


In [61]:
# Drop book ratings with 0 because it is already implied and not every book has a rating so it is unnecessary
ratings_df = ratings_df[ratings_df["Book-Rating"] != 0]
ratings_df.shape

(433671, 3)

In [62]:
# print(ratings_df.mean())
# print(ratings_df.max())

In [63]:
# Take the aggregate ratings (worst case scenario if user is new and age is unknown)
avg_ratings = ratings_df.groupby('ISBN').agg(avg_rating = ('Book-Rating', 'mean'),
                                                number_of_ratings = ('Book-Rating', 'count')).reset_index()

# Keep the books with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]

In [64]:
# Check popular books
avg_ratings100.sort_values(by='avg_rating', ascending=False).head()

Unnamed: 0,ISBN,avg_rating,number_of_ratings
52761,0439139597,9.262774,137
52741,0439136369,9.082707,133
52740,0439136350,9.035461,141
53142,043935806X,9.033981,206
77940,0590353403,8.983193,119


In [65]:
# Split the data into training and testing data set
test_set, train_set = ratings_df[:len(ratings_df)//2], ratings_df[len(ratings_df)//2:]

In [66]:
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Assuming unique_books and unique_users are precomputed
user_ids = train_set['User-ID'].unique()
book_ids = train_set['ISBN'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}

# Create a COO sparse matrix
data = []
row_indices = []
col_indices = []

for index, row in ratings_df.iterrows():
    user_index = user_id_to_index.get(row['User-ID'])
    book_index = book_id_to_index.get(row['ISBN'])

    if user_index is not None and book_index is not None:
        data.append(row['Book-Rating'])
        row_indices.append(user_index)
        col_indices.append(book_index)

user_book_coo = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(book_ids)))

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(user_book_coo, dense_output=False)

In [67]:
# Map user-id to cosine similarity matrix index
unique_users = train_set["User-ID"].unique()
user_index = {k: v for v, k in enumerate(unique_users)}

In [68]:
unique_books = set(train_set["ISBN"].unique())
len(unique_books)

111419

In [69]:
#### MAIN PROGRAM - CHANGE THE USER-ID HERE
# Select a user to recommend books
user = 276704

In [70]:
# Convert user-id to matrix index
index = user_index.get(user)
print(index)

40348


In [71]:
# Find the unrated books by specific user
rated_books = train_set[train_set['User-ID'] == user]["ISBN"]
unrated_books = unique_books - set(rated_books)
len(unrated_books)

111414

In [88]:
# Average Nearest-Neighbors calculation
# Dictionary to store ratings
predicted_ratings = dict()
for i in range(cosine_sim_matrix.shape[0]):
    # If cosine similarity is above a certain threshold, add it to weighted average
    if (cosine_sim_matrix[(index, i)] > 0.1):
        # Gets the User-ID from the index
        user2 = unique_users[i]
        user2_df = train_set[train_set['User-ID'] == user2]
        for book in unrated_books:
            if book in user2_df["ISBN"].values:
                tmp = predicted_ratings.get(book,[0,0])
                tmp[0] += user2_df[user2_df["ISBN"] == book]["Book-Rating"].values[0]
                tmp[1] += 1
                predicted_ratings[book] = tmp
        

In [90]:
for key in predicted_ratings:
    predicted_ratings[key] = predicted_ratings.get(key)[0] // predicted_ratings.get(key)[1]

In [91]:
predicted_ratings

{'0375411550': 8,
 '1582342113': 7,
 '0156002027': 10,
 '0786868716': 6,
 '0312026021': 10,
 '0380726246': 6,
 '0446602620': 6,
 '0380802872': 6,
 '0553572202': 7,
 '0399146253': 5,
 '0316969680': 7,
 '0553568701': 7,
 '0312974485': 4,
 '0440226430': 7,
 '0399145885': 9,
 '0316602051': 8,
 '0345458915': 8,
 '0385510438': 8,
 '0385508042': 9,
 '0688156134': 8,
 '0425153975': 7,
 '0886773784': 9,
 '0425107469': 7,
 '0886773776': 9,
 '0886774632': 8,
 '0380794292': 6,
 '0886775639': 8,
 '0553265520': 6,
 '0316284955': 10,
 '0449005615': 7,
 '0440234743': 8,
 '0373280254': 5,
 '051511264X': 6,
 '0399152288': 8,
 '0684871521': 9,
 '0399149155': 7,
 '0060514949': 6,
 '0440206154': 9,
 '0812532007': 7,
 '0425097609': 10,
 '0671041789': 10,
 '0451124340': 9,
 '0425134350': 9,
 '0345370775': 10,
 '0425109720': 10,
 '0425100650': 9,
 '0553563718': 7,
 '0060929871': 10,
 '0345396065': 6,
 '0942802799': 7,
 '0760700362': 8,
 '0345379802': 9,
 '1586635603': 8,
 '0553568728': 6,
 '0441328008': 8,
 '

In [92]:
# Get the top 10 recommended books
top_10 = sorted(predicted_ratings.items(), key=lambda x:-x[1])[:10]

In [93]:
# Print the top 10 books for the specific user
print(top_10)

[('0156002027', 10),
 ('0312026021', 10),
 ('0316284955', 10),
 ('0425097609', 10),
 ('0671041789', 10),
 ('0345370775', 10),
 ('0425109720', 10),
 ('0060929871', 10),
 ('0670855030', 10),
 ('0684867621', 10)]