In [8]:
import pandas as pd
# ================== Preprocessing ====================
# Set the name of the file
rating_filename = 'Ratings.csv'
# user_filename = 'Users.csv'
book_filename = 'Books.csv'

ratings_df = pd.read_csv(rating_filename)

print(ratings_df.head())

# Rating - User-ID, ISBN, Book-Rating
# Connect via ISBN

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [9]:
print(len(ratings_df["User-ID"].unique()))
print(len(ratings_df["ISBN"].unique()))
print(ratings_df.shape)

105283
340556
(1149780, 3)


In [10]:
# Take the aggregate ratings (worst case scenario if user is new and age is unknown)
avg_ratings = ratings_df.groupby('ISBN').agg(avg_rating = ('Book-Rating', 'mean'),
                                                number_of_ratings = ('Book-Rating', 'count')).reset_index()

# Keep the books with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]

In [11]:
# Check popular books
avg_ratings100.sort_values(by='avg_rating', ascending=False).head()

Unnamed: 0,ISBN,avg_rating,number_of_ratings
97171,0439064864,6.611765,170
97511,0439139597,6.541237,194
97470,0439136350,6.467005,197
144129,0590353403,6.363095,168
98386,043935806X,5.571856,334


In [12]:
# Drop book ratings with 0 because it is already implied and not every book has a rating so it is unnecessary
ratings_df = ratings_df[ratings_df["Book-Rating"] != 0]
ratings_df.shape

(433671, 3)

In [25]:
test_set, train_set = ratings_df[:len(ratings_df)//2], ratings_df[len(ratings_df)//2:]

In [27]:
len(train_set)

216836

In [28]:
len(test_set)

216835

In [29]:
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Assuming unique_books and unique_users are precomputed
user_ids = train_set['User-ID'].unique()
book_ids = train_set['ISBN'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}

# Create a COO sparse matrix
data = []
row_indices = []
col_indices = []

for index, row in ratings_df.iterrows():
    user_index = user_id_to_index.get(row['User-ID'])
    book_index = book_id_to_index.get(row['ISBN'])

    if user_index is not None and book_index is not None:
        data.append(row['Book-Rating'])
        row_indices.append(user_index)
        col_indices.append(book_index)

user_book_coo = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(book_ids)))

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(user_book_coo, dense_output=False)

In [30]:
print(cosine_sim_matrix)

  (0, 0)	1.0
  (1, 37124)	0.7071067811865476
  (1, 1)	1.0000000000000002
  (2, 30529)	0.3394112549695428
  (2, 14691)	0.062469504755442426
  (2, 1140)	0.565685424949238
  (2, 40094)	0.7071067811865475
  (2, 39890)	0.37300192329612547
  (2, 39816)	0.12226071776788357
  (2, 39503)	0.4634275570907938
  (2, 39295)	0.2201927530252721
  (2, 39248)	0.11135748693238869
  (2, 38968)	0.07681860194089953
  (2, 38752)	0.026175499305028983
  (2, 38725)	0.11298172894234777
  (2, 37961)	0.22555354977384034
  (2, 37812)	0.7071067811865475
  (2, 37776)	0.5
  (2, 37633)	0.7071067811865475
  (2, 36990)	0.06694094263632394
  (2, 36960)	0.4845015831115092
  (2, 36858)	0.06914898447778771
  (2, 36301)	0.7071067811865475
  (2, 35808)	0.7071067811865475
  (2, 35712)	0.07905694150420947
  :	:
  (40348, 14869)	0.016982209467797507
  (40348, 14696)	0.033590241436966206
  (40348, 14481)	0.3367876570272817
  (40348, 13140)	0.054348874453119894
  (40348, 12762)	0.05410287117201963
  (40348, 10772)	0.039598315929494

In [None]:
user = 2

In [15]:
import numpy as np
#ratings = np.array(ratings_df["Book-Rating"])
unique_users = np.array(ratings_df["User-ID"].sort_values().unique())
# unique_books = np.array(ratings_df["ISBN"].sort_values().unique())

In [16]:
# Number of ratings not equal for all, should remove 0's
user_counts = ratings_df.groupby('User-ID').agg(number_of_ratings = ('ISBN', 'count')).reset_index()
user_counts.head()

Unnamed: 0,User-ID,number_of_ratings
0,8,7
1,9,1
2,10,1
3,12,1
4,14,3


In [17]:
print(user_counts.mean())
print(user_counts.max())

User-ID              139243.292822
number_of_ratings         5.573819
dtype: float64
User-ID              278854
number_of_ratings      8524
dtype: int64


In [18]:
user_book = dict()

#book_index = {k: v for v, k in enumerate(unique_books)}
user_index = {k: v for v, k in enumerate(unique_users)}

In [19]:
arr_users = [{} for _ in range(len(unique_users))]
i = 0
for user_id, group in ratings_df.groupby("User-ID"):
    for index, row in group.iterrows():
        arr_users[i][row['ISBN']] = row['Book-Rating']
    i += 1

In [20]:
# arr_users

In [21]:
# neighbors = dict()

In [22]:
# vals = list(set(arr_users[1].keys()).symmetric_difference(set(arr_users[0].keys())))
# vals

In [23]:
# for keys in arr_users[1].keys():
#     print(keys)

In [24]:
# from itertools import combinations


# for i in range(len(arr_users)):
#     for j in range(len(arr_users[i+1:])):
#         vals = list(set(arr_users[i].keys()).symmetric_difference(set(arr_users[j].keys())))
#         arr1 = [0] * len(vals)
#         arr2 = [0] * len(vals)
#         print(i,j)
#         for k in range(len(vals)):
#             arr1[k] = arr_users[i].get(vals[k],0)
#             arr2[k] = arr_users[j].get(vals[k],0)
