In [1]:
import pandas as pd

# Load my liked books, using the file's first column as the row index.
my_books = pd.read_csv(
    'liked_books.csv',
    index_col=0,
)

In [3]:
# Preview the first rows of the liked-books table.
my_books.head()

Unnamed: 0,user_id,book_id,rating,title
0,-1,2517439,5,"The Forever War (The Forever War, #1)"
1,-1,113576,5,The Smartest Guys in the Room: The Amazing Ris...
2,-1,35100,5,Battle Cry of Freedom
3,-1,228221,5,The Mask of Command
5,-1,17662739,5,"2001: A Space Odyssey (Space Odyssey, #1)"


In [12]:
# (rows, columns) of the liked-books table.
my_books.shape

(27, 4)

In [4]:
# Keep book ids as strings so they match the ids parsed from the text files.
my_books = my_books.assign(book_id=my_books['book_id'].astype(str))

In [5]:
# Peek at the first lines of the mapping file (columns: book_id_csv, book_id).
!head book_id_map.csv

book_id_csv,book_id
0,34684622
1,34536488
2,34017076
3,71730
4,30422361
5,33503613
6,33517540
7,34467031
8,6383669


In [6]:
# Map the interaction file's internal book number (book_id_csv) to the
# book_id used everywhere else. Both sides are kept as strings.
csv_book_mapping = {}

with open('book_id_map.csv', 'r') as f:
    # The first line is the "book_id_csv,book_id" header, not data.
    next(f, None)
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing on the two-value unpack below.
            continue
        csv_id, book_id = line.split(',')
        csv_book_mapping[csv_id] = book_id

In [7]:
# Set of my liked book ids, used for membership checks while scanning interactions.
book_set = {liked_id for liked_id in my_books['book_id']}

In [8]:
# Peek at the first lines of the raw interactions file.
!head goodreads_interactions.csv

user_id,book_id,is_read,rating,is_reviewed
0,948,1,5,0
0,947,1,5,1
0,946,1,5,0
0,945,1,5,0
0,944,1,5,0
0,943,1,5,0
0,942,1,5,0
0,941,1,5,0
0,940,1,5,0


In [9]:
# Count the lines in the interactions file to gauge its size.
!wc -l goodreads_interactions.csv

 228648343 goodreads_interactions.csv


In [10]:
# For every user, count the interaction rows that refer to one of my liked books.
overlap_users = {}

with open('goodreads_interactions.csv') as f:
    # Iterate the file lazily, one line at a time.
    for line in f:
        user_id, csv_id, _, rating, _ = line.strip().split(',')
        book_id = csv_book_mapping.get(csv_id)
        if book_id not in book_set:
            continue
        overlap_users[user_id] = overlap_users.get(user_id, 0) + 1

In [11]:
# Number of users with at least one interaction on a book I liked.
len(overlap_users)

316341

In [13]:
# Keep only users whose overlap count exceeds one fifth of my liked-book count.
filtered_overlap_users = {
    uid
    for uid, shared_count in overlap_users.items()
    if shared_count > my_books.shape[0] / 5
}

In [14]:
# Collect [user_id, book_id, rating] for every row that belongs to a shortlisted user.
interaction_list = []

with open('goodreads_interactions.csv') as f:
    for line in f:
        user_id, csv_id, _, rating, _ = line.strip().split(',')
        if user_id not in filtered_overlap_users:
            continue
        # Translate the file's internal book number via the mapping built earlier.
        book_id = csv_book_mapping[csv_id]
        interaction_list.append([user_id, book_id, rating])

In [15]:
# Number of interaction rows collected for the shortlisted users.
len(interaction_list)

5638701

In [16]:
# Inspect one collected row: [user_id, book_id, rating], all still strings.
interaction_list[0]

['282', '627206', '4']

In [17]:
# Turn the collected rows into a table with named columns.
interactions = pd.DataFrame(
    data=interaction_list,
    columns=['user_id', 'book_id', 'rating'],
)

In [18]:
# Put my own ratings in front of everyone else's so they share one table.
# NOTE(review): re-running this cell prepends my rows again - run it once per kernel.
interactions = pd.concat(
    [my_books.loc[:, ['user_id', 'book_id', 'rating']], interactions]
)

In [19]:
# Display the combined interactions table (my rows first, then the other users').
interactions

Unnamed: 0,user_id,book_id,rating
0,-1,2517439,5
1,-1,113576,5
2,-1,35100,5
3,-1,228221,5
5,-1,17662739,5
...,...,...,...
5638696,804100,475178,0
5638697,804100,186074,0
5638698,804100,153008,0
5638699,804100,45107,0


In [20]:
# Normalise dtypes after the concat: ids become strings, ratings become numbers.
interactions = interactions.astype({'user_id': str, 'book_id': str})
interactions = interactions.assign(rating=pd.to_numeric(interactions['rating']))

In [21]:
# Distinct user ids present in the table (mine is '-1').
interactions['user_id'].unique()

array(['-1', '282', '874', ..., '442043', '712588', '804100'],
      dtype=object)

In [22]:
# Encode each user_id as an integer category code, to be used as the matrix row position.
interactions = interactions.assign(
    user_index=interactions['user_id'].astype('category').cat.codes
)

In [25]:
# Distinct row codes assigned to users.
interactions['user_index'].unique()

array([   0,  555, 1216, ..., 1054, 1143, 1183], dtype=int16)

In [26]:
# Encode each book_id as an integer category code, to be used as the matrix column position.
interactions = interactions.assign(
    book_index=interactions['book_id'].astype('category').cat.codes
)

In [27]:
# Preview the table with the new user_index / book_index columns.
interactions.head()

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285


In [28]:
# Number of distinct users, i.e. rows of the rating matrix.
interactions['user_index'].nunique()

1259

In [29]:
# Number of distinct books, i.e. columns of the rating matrix.
interactions['book_index'].nunique()

802870

In [30]:
# Cells a dense users x books matrix would need. Computed from the data rather
# than copied from earlier outputs, so it stays correct when the data changes.
interactions['user_index'].nunique() * interactions['book_index'].nunique()  # cells

1010813330

In [31]:
# Sparse matrix: only the observed (user, book) ratings are stored.
from scipy.sparse import coo_matrix

# Constructor form: coo_matrix((values, (row_positions, column_positions)))
rating_coo_matrix = coo_matrix(
    (
        interactions['rating'],
        (interactions['user_index'], interactions['book_index']),
    )
)

In [32]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
rating_coo_matrix

<1259x802870 sparse matrix of type '<class 'numpy.int64'>'
	with 5638728 stored elements in COOrdinate format>

In [33]:
# Convert to CSR format; the row slicing done later is not available on COO.
ratings_matrix = rating_coo_matrix.tocsr(copy=False)

In [34]:
# Show my own rows (user_id '-1') to read off which user_index I was assigned.
interactions[interactions['user_id'] == '-1']

Unnamed: 0,user_id,book_id,rating,user_index,book_index
0,-1,2517439,5,0,414880
1,-1,113576,5,0,38971
2,-1,35100,5,0,575858
3,-1,228221,5,0,356004
5,-1,17662739,5,0,214285
6,-1,356824,5,0,581743
7,-1,12125412,5,0,59763
8,-1,139069,5,0,124430
10,-1,76680,5,0,722098
11,-1,1898,5,0,276178


In [35]:
# Row position of my own ratings in the matrix. Looked up from the data instead
# of hard-coding 0, which only holds while '-1' sorts first among the user ids.
my_index = int(
    interactions.loc[interactions['user_id'] == '-1', 'user_index'].iloc[0]
)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between my rating row and every user's row, flattened to 1-D.
similarity = cosine_similarity(
    ratings_matrix[my_index, :],
    ratings_matrix,
).ravel()

In [40]:
# Sanity check: entry 0 is my own row compared with itself, so it should be ~1.
similarity[0]

0.9999999999999999

In [41]:
import numpy as np

# Positions of the 15 highest similarity scores; argpartition does not sort them.
indices = np.argpartition(similarity, kth=-15)[-15:]

In [42]:
# Row codes of the most similar users (my own row code is among them).
indices

array([1188,  942,  218,  129,  496,  435, 1208,  795, 1213, 1210, 1143,
        321,  294,  862,    0])

In [46]:
# Independent copy of all rows that belong to the most similar users.
similar_users = interactions.loc[interactions['user_index'].isin(indices)].copy()

In [47]:
# Drop my own rows (user_id '-1') so that only other users drive the recommendations.
# Fixed: the original referenced an undefined name `similar_user` (NameError).
similar_users = similar_users[similar_users['user_id'] != '-1']

In [48]:
# Display the rows of the similar users (my own rows excluded).
similar_users

Unnamed: 0,user_id,book_id,rating,user_index,book_index
45312,4133,5359,3,942,632143
45313,4133,10464963,4,942,13492
45314,4133,3858,3,942,593622
45315,4133,11827808,4,942,51904
45316,4133,7913305,4,942,732465
...,...,...,...,...,...
5638521,712588,32388712,3,1143,543119
5638522,712588,16322,5,1143,183365
5638523,712588,860543,0,1143,759827
5638524,712588,853510,5,1143,756768


In [49]:
# Per book: how many similar users have a row for it, and their mean rating.
# NOTE(review): rows with rating 0 are included in both figures - confirm whether 0 means "unrated".
book_recs = (
    similar_users
    .groupby('book_id')['rating']
    .agg(['count', 'mean'])
)

In [50]:
# Display the per-book count / mean table (indexed by book_id).
book_recs

Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6,3.833333
100322,1,0.000000
100365,1,0.000000
10046142,1,0.000000
1005,3,0.000000
...,...,...
99561,2,2.500000
99610,1,3.000000
99664,1,4.000000
9969571,3,2.333333


In [51]:
# Load book metadata and store its book_id as a string, like the other tables.
book_titles = pd.read_json('book_titles.json')
book_titles = book_titles.astype({'book_id': str})

In [52]:
# Attach title metadata; the inner join drops books that have no metadata row.
book_recs = pd.merge(book_recs, book_titles, on='book_id', how='inner')

In [53]:
# Preview the recommendations table after joining the title metadata.
book_recs.head()

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title
0,1,6,3.833333,Harry Potter and the Half-Blood Prince (Harry ...,1713866,https://www.goodreads.com/book/show/1.Harry_Po...,https://images.gr-assets.com/books/1361039191m...,harry potter and the halfblood prince harry po...
1,100322,1,0.0,Assata: An Autobiography,11057,https://www.goodreads.com/book/show/100322.Assata,https://images.gr-assets.com/books/1328857268m...,assata an autobiography
2,100365,1,0.0,The Mote in God's Eye,48736,https://www.goodreads.com/book/show/100365.The...,https://images.gr-assets.com/books/1399490037m...,the mote in gods eye
3,10046142,1,0.0,Dancing in the Glory of Monsters: The Collapse...,2391,https://www.goodreads.com/book/show/10046142-d...,https://images.gr-assets.com/books/1328757755m...,dancing in the glory of monsters the collapse ...
4,1005,3,0.0,Think and Grow Rich,87634,https://www.goodreads.com/book/show/1005.Think...,https://s.gr-assets.com/assets/nophoto/book/11...,think and grow rich


In [54]:
# Weight the similar-user count by the share of the book's total ratings that
# those users represent, which penalises books that are popular with everyone.
book_recs = book_recs.assign(
    adjusted_count=book_recs['count'] * (book_recs['count'] / book_recs['ratings'])
)

In [55]:
# Final ranking score: mean rating scaled by the adjusted count.
book_recs = book_recs.assign(
    score=book_recs['mean'].mul(book_recs['adjusted_count'])
)

In [56]:
# Exclude books whose id is already on my liked list.
book_recs = book_recs.loc[~book_recs['book_id'].isin(my_books['book_id'])]

In [59]:
# Normalised title: strip everything except letters, digits and spaces, then lowercase.
my_books['mod_title'] = (
    my_books['title']
    .str.replace('[^a-zA-Z0-9 ]', '', regex=True)
    .str.lower()
)

In [60]:
# Collapse each run of whitespace into a single space.
# Raw string: '\s' in a normal string literal is an invalid escape sequence
# (a SyntaxWarning on recent Python versions).
my_books['mod_title'] = my_books['mod_title'].str.replace(r'\s+', ' ', regex=True)

In [61]:
# Exclude books whose normalised title matches one I already liked (other editions).
book_recs = book_recs.loc[~book_recs['mod_title'].isin(my_books['mod_title'])]

In [62]:
# Require more than two similar users per book.
book_recs = book_recs.loc[book_recs['count'].gt(2)]

In [63]:
# Require a mean rating above 4 among the similar users.
book_recs = book_recs.loc[book_recs['mean'].gt(4)]

In [64]:
# Rank the remaining books, best score first.
top_recs = book_recs.sort_values(by='score', ascending=False)

In [65]:
# Display the ranked recommendations.
top_recs

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
2558,78983,4,4.25,"Kane and Abel (Kane and Abel, #1)",75215,https://www.goodreads.com/book/show/78983.Kane...,https://s.gr-assets.com/assets/nophoto/book/11...,kane and abel kane and abel 1,0.000213,0.000904
1441,2767793,4,4.25,"The Hero of Ages (Mistborn, #3)",149260,https://www.goodreads.com/book/show/2767793-th...,https://images.gr-assets.com/books/1480717763m...,the hero of ages mistborn 3,0.000107,0.000456
2260,62291,5,4.8,"A Storm of Swords (A Song of Ice and Fire, #3)",477834,https://www.goodreads.com/book/show/62291.A_St...,https://images.gr-assets.com/books/1497931121m...,a storm of swords a song of ice and fire 3,5.2e-05,0.000251
1173,2318271,3,4.333333,The Last Lecture,245804,https://www.goodreads.com/book/show/2318271.Th...,https://images.gr-assets.com/books/1388075896m...,the last lecture,3.7e-05,0.000159
1100,22034,3,4.333333,The Godfather,259150,https://www.goodreads.com/book/show/22034.The_...,https://images.gr-assets.com/books/1394988109m...,the godfather,3.5e-05,0.00015
243,119322,4,4.25,"The Golden Compass (His Dark Materials, #1)",973154,https://www.goodreads.com/book/show/119322.The...,https://images.gr-assets.com/books/1505766203m...,the golden compass his dark materials 1,1.6e-05,7e-05
1906,4381,3,4.333333,Fahrenheit 451,591506,https://www.goodreads.com/book/show/4381.Fahre...,https://images.gr-assets.com/books/1351643740m...,fahrenheit 451,1.5e-05,6.6e-05
600,157993,3,4.333333,The Little Prince,763309,https://www.goodreads.com/book/show/157993.The...,https://images.gr-assets.com/books/1367545443m...,the little prince,1.2e-05,5.1e-05


In [66]:
def make_clickable(link):
    """Return an HTML anchor labelled "Goodreads" that opens ``link`` in a new tab.

    Args:
        link: URL to link to. Non-string values are converted with ``str``.

    Returns:
        str: The ``<a>`` element markup.
    """
    from html import escape

    # Escape the URL so characters such as '&' or '"' cannot break the attribute.
    # The original template also had a stray comma between the attributes.
    return '<a target="_blank" href="{}">Goodreads</a>'.format(escape(str(link), quote=True))

def show_image(link):
    """Return an HTML ``<img>`` element, 50 pixels wide, that displays ``link``.

    Args:
        link: URL of the image. Non-string values are converted with ``str``.

    Returns:
        str: The ``<img>`` element markup.
    """
    from html import escape

    # <img> is a void element, so it takes no closing tag. The original template
    # also had a stray comma between the attributes. The URL is escaped so that
    # '&' or '"' cannot break the attribute.
    return '<img src="{}" width="50">'.format(escape(str(link), quote=True))

In [67]:
# Render the table with the url column as a link and cover_image as a thumbnail.
top_recs.style.format(
    formatter={'url': make_clickable, 'cover_image': show_image}
)

Unnamed: 0,book_id,count,mean,title,ratings,url,cover_image,mod_title,adjusted_count,score
2558,78983,4,4.25,"Kane and Abel (Kane and Abel, #1)",75215,Goodreads,,kane and abel kane and abel 1,0.000213,0.000904
1441,2767793,4,4.25,"The Hero of Ages (Mistborn, #3)",149260,Goodreads,,the hero of ages mistborn 3,0.000107,0.000456
2260,62291,5,4.8,"A Storm of Swords (A Song of Ice and Fire, #3)",477834,Goodreads,,a storm of swords a song of ice and fire 3,5.2e-05,0.000251
1173,2318271,3,4.333333,The Last Lecture,245804,Goodreads,,the last lecture,3.7e-05,0.000159
1100,22034,3,4.333333,The Godfather,259150,Goodreads,,the godfather,3.5e-05,0.00015
243,119322,4,4.25,"The Golden Compass (His Dark Materials, #1)",973154,Goodreads,,the golden compass his dark materials 1,1.6e-05,7e-05
1906,4381,3,4.333333,Fahrenheit 451,591506,Goodreads,,fahrenheit 451,1.5e-05,6.6e-05
600,157993,3,4.333333,The Little Prince,763309,Goodreads,,the little prince,1.2e-05,5.1e-05
