In [1]:
import numpy as np
import pandas as pd

from data_loader import BooksDataLoader
from book_recommender import BookRecommender
from book_recommender import partition
from book_recommender import hit_rate

In [2]:
books_data_loader = BooksDataLoader()

readers_data = books_data_loader.get_readers_data()
books_data = books_data_loader.get_books_data()
books_meta_data = books_data_loader.get_books_meta_data()

In [3]:
book_recommender = BookRecommender()

In [4]:
readers_data.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
#books_meta_data = books_meta_data.set_index('book_id').join(books_data.set_index('book_id'))

In [6]:
books_available = set(map(int, book_recommender.books_available()))

In [34]:
selected_user_list = readers_data[readers_data.book_id.isin(books_available)].user_id.unique().tolist()[:100]

In [35]:
%%time 

books_reviewed_by_users = []

rating_threshold = 3

for user in selected_user_list:

    reviewed_all =  set(readers_data[readers_data.user_id == user].book_id.values)
    reviewed = reviewed_all.intersection(books_available)

    # filter out 
    reviewed = set(book_id for book_id in reviewed
                    if (readers_data[(readers_data.user_id == user) 
                                     & (readers_data.book_id == book_id)].rating >= rating_threshold).any())

    if len(reviewed) > 1:
        books_reviewed_by_users.append((user, reviewed))

CPU times: user 1min 54s, sys: 4.53 s, total: 1min 58s
Wall time: 14.9 s


In [37]:
books_reviewed_by_users_df = pd.DataFrame(books_reviewed_by_users, columns=['User', 'Books_Reviewed'])

In [38]:
books_reviewed_by_users_df.head()

Unnamed: 0,User,Books_Reviewed
0,4,"{7, 264, 492, 401, 54, 344, 1210, 575}"
1,1,"{1796, 6665, 3638, 54, 1176}"
2,9,"{3, 1923, 264, 238, 52, 344, 56, 795}"
3,15,"{3, 1381, 2661, 1993, 1645, 1042, 52, 310, 344..."
4,18,"{1153, 1923, 743, 7, 3883, 2923, 4979, 534, 32..."


In [39]:
books_reviewed_by_users_df.User.unique()

array([  4,   1,   9,  15,  18,  22,  32,  34,  40,  31,  55,  65,  70,
        72,  73,  75,  76,  61,  54,  78,  89,  93,  26,  29, 103, 105,
       108, 112, 113, 115, 116, 125, 126, 137, 135, 124, 142, 149, 156,
         8, 130, 158, 167, 169, 177, 175, 178, 171, 185, 184, 183, 168,
       106, 195, 143, 199, 179,  10, 203, 204, 207, 214, 206, 220, 212,
       228, 229, 232, 233, 237, 239, 240, 245, 246, 242, 250, 248, 257,
       247, 258, 162, 263, 264, 265, 274, 278, 283, 276, 286, 287])

In [40]:
selected_user_index = 7
reviewed_books = books_reviewed_by_users_df.iloc[selected_user_index, 1]
reviewed_books

{232, 1343, 1657, 2407, 3314}

In [41]:
selected_user_id = books_reviewed_by_users_df.iloc[selected_user_index, 0]
selected_user_id

34

In [42]:
books_reviewed_by_users_df.shape

(90, 2)

In [43]:
reviewed_books_sub = books_data.loc[books_data.book_id.isin(reviewed_books), 
                                    ['book_id','original_title', 'authors', 'average_rating']]
reviewed_books_sub = reviewed_books_sub.set_index('book_id')

In [44]:
reviewed_books_sub

Unnamed: 0_level_0,original_title,authors,average_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
232,The Gunslinger,Stephen King,3.99
1343,The Light Fantastic,Terry Pratchett,3.94
1657,Infinite Jest,David Foster Wallace,4.31
2407,Princeps' Fury,Jim Butcher,4.35
3314,Quicksilver,Neal Stephenson,3.92


In [45]:
pd.set_option('max_colwidth', 200)

In [46]:
display(books_meta_data.loc[books_meta_data.book_id.isin(reviewed_books), 
                            ['book_id', 'genre']].set_index('book_id').join(reviewed_books_sub))

Unnamed: 0_level_0,genre,original_title,authors,average_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
232,Fiction / Fantasy / Dark Fantasy Fiction / Thrillers / Supernatural,The Gunslinger,Stephen King,3.99
2407,Fiction / Fantasy / Action & Adventure Fiction / Fantasy / Epic Fiction / Fantasy / Military,Princeps' Fury,Jim Butcher,4.35
1343,Fiction / Fantasy / Action & Adventure Fiction / Fantasy / Contemporary Fiction / Fantasy / Humorous,The Light Fantastic,Terry Pratchett,3.94
1657,Fiction / Classics Fiction / Literary Fiction / Sports,Infinite Jest,David Foster Wallace,4.31
3314,Fiction / Fantasy / Historical Fiction / Historical / General Fiction / Science Fiction / General,Quicksilver,Neal Stephenson,3.92


**#TODO Consolidate genres where applicable**

Notice that book '7379' has 'Humorous' listed as one of it's genre yet, similarity score is 0.0 with the book '534' even though one of it's genre is 'Humor'. That's for user 28

# Partition The Data

In [47]:
books_reviewed_by_users_df.head()

Unnamed: 0,User,Books_Reviewed
0,4,"{7, 264, 492, 401, 54, 344, 1210, 575}"
1,1,"{1796, 6665, 3638, 54, 1176}"
2,9,"{3, 1923, 264, 238, 52, 344, 56, 795}"
3,15,"{3, 1381, 2661, 1993, 1645, 1042, 52, 310, 344, 795, 7260}"
4,18,"{1153, 1923, 743, 7, 3883, 2923, 4979, 534, 3254, 344, 1176}"


In [48]:
part_list, left_out_list = partition(books_reviewed_by_users_df.Books_Reviewed.values)

In [49]:
books_reviewed_by_users_df.loc[:, 'Books_Reviewed'] = part_list
books_reviewed_by_users_df.loc[:, 'Left_Out'] = left_out_list

In [50]:
books_reviewed_by_users_df.head(n=5)

Unnamed: 0,User,Books_Reviewed,Left_Out
0,4,"{7, 264, 492, 54, 344, 1210, 575}",401
1,1,"{1796, 6665, 3638, 1176}",54
2,9,"{3, 264, 238, 52, 344, 56, 795}",1923
3,15,"{3, 1381, 1993, 1645, 1042, 52, 310, 344, 795, 7260}",2661
4,18,"{1153, 1923, 743, 7, 3883, 2923, 4979, 534, 344, 1176}",3254


# Make Recommendations

In [51]:
%%time
recommended_books = book_recommender.recommend(books_reviewed_by_users_df.User.values,
                                               books_reviewed_by_users_df.Books_Reviewed.values)

CPU times: user 1min 2s, sys: 581 ms, total: 1min 2s
Wall time: 1min 2s


In [52]:
recommended_books = list(map(lambda x: list(map(int, x)), recommended_books))

In [53]:
hrate = hit_rate(recommended_books, books_reviewed_by_users_df.Left_Out.values)
hrate

0.5555555555555556

In [54]:
books_reviewed_by_users_df.loc[:, 'Recommended_Books'] = recommended_books

In [55]:
def topn(user_id, n=10, df=books_reviewed_by_users_df):

    book_list = list(map(int, df.loc[df.User == user_id, 'Recommended_Books'].values[0]))
    books_with_rating = books_data.loc[books_data.book_id.isin(book_list), ['book_id', 'average_rating']]
    return books_with_rating.nlargest(n, 'average_rating').book_id.tolist()

In [56]:
books_reviewed_by_users_df.Left_Out.values

array([ 401,   54, 1923, 2661, 3254, 1360,    7, 1657,  344, 2047, 1856,
        232,  773,  211, 9083, 1210, 1332, 3638,   56,  161, 1796,   54,
       1714, 1106, 2860, 1394, 1100,  238, 5295, 6575, 1593, 1025, 1154,
          7, 3013,   54,  344, 1809, 4631,   54, 1025, 2155, 2508, 1210,
        238,  344,    7,  401,    7,  264,  238, 1381, 3126,  455,   52,
       1149,  238, 7002, 1811, 2508, 1549,  264, 1657,  951,  155, 1332,
       4431, 7379, 2508,   52,    7, 9170,  308,   54, 1746,  308,   56,
       1923,  238, 6579, 1061, 2841,  155, 1532,  211,   56,   56, 2047,
        703, 1923])

In [57]:
top_list = [topn(user_id, 10) for user_id in books_reviewed_by_users_df.User]

In [58]:
hit_rate(top_list, books_reviewed_by_users_df.Left_Out.values)

0.06666666666666667

In [59]:
def top_recommendation_by_user(user_id):
    
    top_10 = topn(user_id)
    top_10_md = books_data.loc[books_data.book_id.isin(top_10), ['book_id', 'title', 'authors', 'average_rating']]

    return top_10_md

In [62]:
books_reviewed_by_users_df.User.unique()

array([  4,   1,   9,  15,  18,  22,  32,  34,  40,  31,  55,  65,  70,
        72,  73,  75,  76,  61,  54,  78,  89,  93,  26,  29, 103, 105,
       108, 112, 113, 115, 116, 125, 126, 137, 135, 124, 142, 149, 156,
         8, 130, 158, 167, 169, 177, 175, 178, 171, 185, 184, 183, 168,
       106, 195, 143, 199, 179,  10, 203, 204, 207, 214, 206, 220, 212,
       228, 229, 232, 233, 237, 239, 240, 245, 246, 242, 250, 248, 257,
       247, 258, 162, 263, 264, 265, 274, 278, 283, 276, 286, 287])

In [77]:
sel_user = 75

In [78]:
top_recommendation_by_user(sel_user)

Unnamed: 0,book_id,title,authors,average_rating
1656,1657,Infinite Jest,David Foster Wallace,4.31
3125,3126,The Complete Short Stories,Ernest Hemingway,4.28
4797,4798,"Genghis: Birth of an Empire (Conqueror, #1)",Conn Iggulden,4.34
5016,5017,Dreams of a Dark Warrior (Immortals After Dark #11),Kresley Cole,4.39
5141,5142,"Her Mother's Hope (Marta's Legacy, #1)",Francine Rivers,4.28
6209,6210,"Ransom (Highlands' Lairds, #2)",Julie Garwood,4.33
8916,8917,"Her Daughter's Dream (Marta's Legacy, #2)",Francine Rivers,4.34
9300,9301,"Ever After (Lost Love, #2)",Karen Kingsbury,4.31
9413,9414,"Deeper Than Midnight (Midnight Breed, #9)",Lara Adrian,4.29
9616,9617,"Fame (Firstborn, #1)",Karen Kingsbury,4.34


In [79]:
select_user_books = readers_data[(readers_data.user_id == sel_user) & (readers_data.book_id.isin(books_available)) & (readers_data.rating >= 3)].book_id.values

In [80]:
books_data.loc[books_data.book_id.isin(select_user_books), ['book_id', 'title', 'authors', 'average_rating']]

Unnamed: 0,book_id,title,authors,average_rating
237,238,Running with Scissors,Augusten Burroughs,3.7
343,344,Naked,David Sedaris,4.08
742,743,"Lamb: The Gospel According to Biff, Christ's Childhood Pal",Christopher Moore,4.26
1209,1210,"Anne of Avonlea (Anne of Green Gables, #2)",L.M. Montgomery,4.21
1713,1714,The Kitchen God's Wife,Amy Tan,3.98
7378,7379,The Pleasure of My Company,Steve Martin,3.78
