In [3]:
import pandas as pd
# ================== Preprocessing ====================
# Set the name of the file
rating_filename = 'Ratings.csv'
# user_filename = 'Users.csv'
# book_filename = 'Books.csv'

ratings_df = pd.read_csv(rating_filename)

print(ratings_df.head())

# Rating - User-ID, ISBN, Book-Rating
# Connect via ISBN

   User-ID        ISBN  Book-Rating
0   276725  034545104X            0
1   276726  0155061224            5
2   276727  0446520802            0
3   276729  052165615X            3
4   276729  0521795028            6


In [4]:
print(len(ratings_df["User-ID"].unique()))
print(len(ratings_df["ISBN"].unique()))
print(ratings_df.shape)

105283
340556
(1149780, 3)


In [5]:
# Take the aggregate ratings (worst case scenario if user is new and age is unknown)
avg_ratings = ratings_df.groupby('ISBN').agg(avg_rating = ('Book-Rating', 'mean'),
                                                number_of_ratings = ('Book-Rating', 'count')).reset_index()

# Keep the books with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]

In [6]:
# Check popular books
avg_ratings100.sort_values(by='avg_rating', ascending=False).head()

Unnamed: 0,ISBN,avg_rating,number_of_ratings
97171,0439064864,6.611765,170
97511,0439139597,6.541237,194
97470,0439136350,6.467005,197
144129,0590353403,6.363095,168
98386,043935806X,5.571856,334


In [7]:
# Drop book ratings with 0 because it is already implied and not every book has a rating so it is unnecessary
ratings_df = ratings_df[ratings_df["Book-Rating"] != 0]
ratings_df.shape

(433671, 3)

In [8]:
import numpy as np
ratings = np.array(ratings_df["Book-Rating"])
unique_users = np.array(ratings_df["User-ID"].sort_values().unique())
unique_books = np.array(ratings_df["ISBN"].sort_values().unique())

In [9]:
# Number of ratings not equal for all, should remove 0's
user_counts = ratings_df.groupby('User-ID').agg(number_of_ratings = ('ISBN', 'count')).reset_index()
user_counts.head()

Unnamed: 0,User-ID,number_of_ratings
0,8,7
1,9,1
2,10,1
3,12,1
4,14,3


In [10]:
print(user_counts.mean())
print(user_counts.max())

User-ID              139243.292822
number_of_ratings         5.573819
dtype: float64
User-ID              278854
number_of_ratings      8524
dtype: int64


In [11]:
user_book = dict()

book_index = {k: v for v, k in enumerate(unique_books)}
user_index = {k: v for v, k in enumerate(unique_users)}

In [12]:
arr_users = [{} for _ in range(len(unique_users))]
i = 0
for user_id, group in ratings_df.groupby("User-ID"):
    for index, row in group.iterrows():
        arr_users[i][row['ISBN']] = row['Book-Rating']
    i += 1

In [13]:
arr_users

[{'0002005018': 5,
  '074322678X': 5,
  '0887841740': 5,
  '1552041778': 5,
  '1567407781': 6,
  '1575663937': 6,
  '1881320189': 7},
 {'0452264464': 6},
 {'8477024456': 6},
 {'1879384493': 10},
 {'0061076031': 5, '0439095026': 5, '0689821166': 6},
 {'0345402871': 9},
 {'0425099148': 7, '0553264990': 5, '0891075275': 6, '0891076182': 3},
 {'0375759778': 7},
 {'3404921038': 7},
 {'0446310786': 10, '0449005615': 9},
 {'0060168013': 8},
 {'0553582909': 8, '0671888587': 7},
 {'0553582747': 7},
 {'0440223571': 8},
 {'0440225701': 9},
 {'0060914068': 3, '0156047624': 10, '0245542957': 6, '0380715899': 9},
 {'0671623249': 7, '0679865691': 9},
 {'2070423204': 7},
 {'0394743741': 7},
 {'0617683993': 6, '0676973655': 3, '1853260053': 8},
 {'1414035004': 10},
 {'0060938412': 5},
 {'1558531025': 5},
 {'0394895894': 8},
 {'0375410538': 5},
 {'0966986105': 10},
 {'087113375X': 7},
 {'0340767936': 5},
 {'0060930365': 4},
 {'0071416331': 8, '0375509038': 8},
 {'0553062042': 7},
 {'0316769487': 9},
 {'

In [14]:
neighbors = dict()

In [16]:
vals = list(set(arr_users[1].keys()).symmetric_difference(set(arr_users[0].keys())))
vals

['0002005018',
 '1575663937',
 '0452264464',
 '1881320189',
 '074322678X',
 '1567407781',
 '1552041778',
 '0887841740']

In [17]:
for keys in arr_users[1].keys():
    print(keys)

0452264464


In [27]:
# for i in range(len(arr_users)):
#     for j in range(len(arr_users[i+1:])):
#         vals = list(set(arr_users[i].keys()).symmetric_difference(set(arr_users[j].keys())))
#         arr1 = [0] * len(vals)
#         arr2 = [0] * len(vals)
#         for k in range(len(vals)):
#             arr1[k] = arr_users[i].get(vals.)

In [19]:
# Iterate through the grouped DataFrame
# for user_id, group in ratings_df.groupby("User-ID"):
#     user_ratings = [0] * len(unique_books)
#     for index, row in group.iterrows():
#         # Access individual rows using 'row'
#         user_ratings[book_index.get(row['ISBN'])] = row['Book-Rating']
#         #print(f"User-ID: {user_id}, ISBN: {row['ISBN']}, Rating: {row['Book-Rating']}")
#     user_book[user_id] = user_ratings

from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Assuming unique_books and unique_users are precomputed
user_ids = ratings_df['User-ID'].unique()
book_ids = ratings_df['ISBN'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
book_id_to_index = {book_id: index for index, book_id in enumerate(book_ids)}

# Create a COO sparse matrix
data = []
row_indices = []
col_indices = []

for index, row in ratings_df.iterrows():
    user_index = user_id_to_index.get(row['User-ID'])
    book_index = book_id_to_index.get(row['ISBN'])

    if user_index is not None and book_index is not None:
        data.append(row['Book-Rating'])
        row_indices.append(user_index)
        col_indices.append(book_index)

user_book_coo = coo_matrix((data, (row_indices, col_indices)), shape=(len(user_ids), len(book_ids)))

# Compute cosine similarity
cosine_sim_matrix = cosine_similarity(user_book_coo, dense_output=False)


In [23]:
print(cosine_sim_matrix)

  (0, 0)	1.0
  (1, 1)	0.9999999999999999
  (2, 56039)	1.0
  (2, 34171)	0.40032038451271784
  (2, 24920)	0.20991342856239587
  (2, 2)	1.0
  (3, 3)	1.0
  (4, 74622)	0.31330417999518295
  (4, 73374)	1.0
  (4, 73165)	0.12777531299998798
  (4, 73060)	0.1909283775779191
  (4, 72800)	0.4016096644512494
  (4, 71608)	0.2952692976787463
  (4, 71536)	0.4845015831115092
  (4, 71259)	0.09627964063946631
  (4, 70767)	0.17078251276599332
  (4, 69242)	0.6581451817144176
  (4, 67743)	0.22971445240057015
  (4, 66151)	0.24595948397164089
  (4, 61584)	0.19384707146759894
  (4, 60854)	0.19806834933486603
  (4, 60128)	0.6085806194501846
  (4, 59894)	0.2988400950769137
  (4, 59354)	0.06586795017517848
  (4, 58180)	1.0
  :	:
  (77801, 8493)	0.02814174105955321
  (77801, 3785)	0.011305416563471619
  (77801, 3576)	0.2886751345948129
  (77801, 373)	0.14433756729740643
  (77802, 77802)	1.0
  (77802, 70299)	0.13022202248430043
  (77802, 68119)	0.06202571747248687
  (77802, 63778)	0.08287893668111398
  (77802, 5144

In [69]:
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype

rcLabel, vLabel = ('User-ID', 'ISBN'), 'Book-Rating'
rcCat = [CategoricalDtype(sorted(df[col].unique()), ordered=True) for col in rcLabel]
rc = [df[column].astype(aType).cat.codes for column, aType in zip(rcLabel, rcCat)]
mat = csr_matrix((df[vLabel], rc), shape=tuple(cat.categories.size for cat in rcCat))
dfOut = ( pd.DataFrame.sparse.from_spmatrix(
    mat, index=rcCat[0].categories, columns=rcCat[1].categories) )