In [3]:
import pandas as pd

In [4]:
my_books = pd.read_csv("liked_books.csv", index_col=0)

In [5]:
my_books

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"
6,-1,356824,5,India After Gandhi: The History of the World's...
7,-1,12125412,5,The Lady or the Tiger?: and Other Logic Puzzles
8,-1,139069,5,Endurance: Shackleton's Incredible Voyage
10,-1,76680,5,"Foundation (Foundation, #1)"
11,-1,1898,5,Into Thin Air: A Personal Account of the Mount...


In [6]:
my_books["book_id"] = my_books['book_id'].astype(str)

In [7]:
csv_book_mapping = {}

with open("book_id_map.csv", "r") as f:
    while True:
        line = f.readline()
        if not line:
            break
        csv_id, book_id = line.strip().split(",")
        csv_book_mapping[csv_id] = book_id

In [8]:
book_set = set(my_books["book_id"])

In [9]:
overlap_users = {}

with open("goodreads_interactions.csv.crdownload") as f:
    while True:
        line = f.readline()
        if not line:
            break
        try:
            user_id, csv_id, _, rating, _ = line.strip().split(",")

            book_id = csv_book_mapping.get(csv_id)
            if book_id in book_set:
                if user_id not in overlap_users:
                    overlap_users[user_id] = 1
                else:
                    overlap_users[user_id] += 1
        except ValueError:
            continue

In [10]:
len(overlap_users)

192601

In [11]:
filtered_overlap_users = set([k for k in overlap_users if overlap_users[k] > my_books.shape[0]/5])

In [12]:
interactions_list = []

with open("goodreads_interactions.csv.crdownload") as f:
    while True:
        line = f.readline()
        if not line:
            break

        try:
            user_id, csv_id, _, rating, _ = line.strip().split(",")
            if user_id in filtered_overlap_users:
                books_id = csv_book_mapping[csv_id]
                interactions_list.append([user_id, book_id, rating])
        except ValueError:
            continue

In [13]:
len(interactions_list)

4578250

In [14]:
interactions_list[0]

['282', '6936382', '4']

In [15]:
interactions = pd.DataFrame(interactions_list, columns=["user_id", "book_id", "rating"])

In [16]:
interactions = pd.concat([my_books[["user_id", "book_id", "rating"]], interactions])

In [17]:
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
4578245,361921,6936382,0
4578246,361921,6936382,0
4578247,361921,6936382,0
4578248,361921,6936382,0


In [18]:
interactions['book_id'] = interactions['book_id'].astype(str)
interactions['user_id'] = interactions['user_id'].astype(str)
interactions['rating'] = pd.to_numeric(interactions['rating'])

In [19]:
interactions['user_id'].unique()

array(['-1', '282', '874', ..., '360934', '361563', '361921'],
      dtype=object)

In [20]:
interactions['user_index'] = interactions['user_id'].astype('category').cat.codes

In [21]:
interactions.iloc[0]

user_id            -1
book_id       2517439
rating              5
user_index          0
Name: 0, dtype: object

In [22]:
interactions['book_index'] = interactions['book_id'].astype('category').cat.codes

In [23]:
len(interactions['book_index'].unique())

28

In [24]:
from scipy.sparse import coo_matrix

In [25]:
ratings_mat_coo = coo_matrix((interactions['rating'], (interactions['user_index'], interactions['book_index'])))

In [26]:
ratings_mat_coo

<1019x28 sparse matrix of type '<class 'numpy.int64'>'
	with 4578277 stored elements in COOrdinate format>

In [27]:
ratings_mat = ratings_mat_coo.tocsr()

In [28]:
interactions[interactions['user_id'] == '-1']

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,11
1,-1,113576,5,0,0
2,-1,35100,5,0,15
3,-1,228221,5,0,9
5,-1,17662739,5,0,6
6,-1,356824,5,0,16
7,-1,12125412,5,0,1
8,-1,139069,5,0,4
10,-1,76680,5,0,23
11,-1,1898,5,0,8


In [29]:
my_index = 0

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

In [31]:
similarity = cosine_similarity(ratings_mat[my_index,:], ratings_mat).flatten()

In [32]:
import numpy as np

In [33]:
indices = np.argpartition(similarity, -15)[-15:]

In [34]:
indices

array([334, 345, 347, 343, 346, 341, 344, 348, 342, 336, 340, 339, 338,
       337,   0])

In [35]:
similar_users = interactions[interactions['user_index'].isin(indices)].copy()

In [36]:
similar_users = similar_users[similar_users['user_id'] != '-1']

In [37]:
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
24642,2111,6936382,4,345,22
24643,2111,6936382,4,345,22
24644,2111,6936382,4,345,22
24645,2111,6936382,5,345,22
24646,2111,6936382,4,345,22
...,...,...,...,...,...
2800847,212566,6936382,0,348,22
2800848,212566,6936382,0,348,22
2800849,212566,6936382,0,348,22
2800850,212566,6936382,0,348,22


In [38]:
book_recs = similar_users.groupby("book_id").rating.agg(['count', 'mean'])

In [39]:
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6936382,37611,0.737869


In [40]:
books_titles = pd.read_json('books_titles.json')
books_titles['book_id'] = books_titles['book_id'].astype(str)

In [41]:
book_recs = book_recs.merge(books_titles, how='inner', on='book_id')

In [42]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,6936382,37611,0.737869,Anna and the French Kiss (Anna and the French ...,267364,https://www.goodreads.com/book/show/6936382-an...,https://images.gr-assets.com/books/1358271931m...,anna and the french kiss anna and the french k...


In [43]:
book_recs['adjusted_count'] = book_recs['count'] * (book_recs['count'] / book_recs['ratings'])

In [44]:
book_recs['score'] = book_recs['mean'] * book_recs['adjusted_count']

In [45]:
book_recs = book_recs[~book_recs['book_id'].isin(my_books['book_id'])]

In [46]:
book_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
0,6936382,37611,0.737869,Anna and the French Kiss (Anna and the French ...,267364,https://www.goodreads.com/book/show/6936382-an...,https://images.gr-assets.com/books/1358271931m...,anna and the french kiss anna and the french k...,5290.866837,3903.967894


In [47]:
my_books['mod_title'] = my_books['title'].str.replace("[^a-zA-Z0-9 ]", "", regex=True).str.lower()

In [49]:
my_books['mod_title'] = my_books['mod_title'].str.replace("\s+", " ", regex=True)

In [51]:
book_recs = book_recs[~book_recs['mod_title'].isin(my_books['mod_title'])]

In [53]:
book_recs = book_recs[book_recs['count']>2]

In [54]:
top_recs = book_recs.sort_values('score', ascending=False)

In [55]:
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
0,6936382,37611,0.737869,Anna and the French Kiss (Anna and the French ...,267364,https://www.goodreads.com/book/show/6936382-an...,https://images.gr-assets.com/books/1358271931m...,anna and the french kiss anna and the french k...,5290.866837,3903.967894


In [56]:
def make_clickable(val):
    return '<a target="_blank" href="{}">Goodreads</a>'.format(val)

def show_image(val):
    return '<img src="{}" width=50></img>'.format(val)

In [57]:
top_recs.style.format({'url': make_clickable, 'cover_image': show_image})

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
0,6936382,37611,0.737869,"Anna and the French Kiss (Anna and the French Kiss, #1)",267364,Goodreads,,anna and the french kiss anna and the french kiss 1,5290.866837,3903.967894
