# Imports

In [1]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
parent_path = Path(os.getcwd()).resolve().parent
sys.path.append(str(parent_path))

from data_loader import BooksDataLoader
from book_recommender import BookRecommender
from book_recommender import partition
from book_recommender import hit_rate

# Reading Data In Memory

In [3]:
books_data_loader = BooksDataLoader(base_dir='../')

readers_data = books_data_loader.get_readers_data()
books_data = books_data_loader.get_books_data()
books_meta_data = books_data_loader.get_books_meta_data()

In [4]:
readers_data.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [5]:
book_recommender = BookRecommender(base_dir='../')

In [6]:
books_available = set(map(int, book_recommender.get_books_available()))

# User Selection

A user subset is selected and each user's reviewed books rated by the user equal to and above the threshold are extracted.

In [7]:
selected_user_list = readers_data[readers_data.book_id.isin(books_available)].user_id.unique().tolist()[:100]

In [8]:
%%time 

books_reviewed_by_users = []

rating_threshold = 3

for user in selected_user_list:

    reviewed_all =  set(readers_data[readers_data.user_id == user].book_id.values)
    reviewed = reviewed_all.intersection(books_available)

    #filter out low rating books
    reviewed = set(book_id for book_id in reviewed
                    if (readers_data[(readers_data.user_id == user) 
                                     & (readers_data.book_id == book_id)].rating >= rating_threshold).any())

    if len(reviewed) > 1:
        books_reviewed_by_users.append((user, reviewed))

CPU times: user 11.8 s, sys: 2.8 s, total: 14.6 s
Wall time: 14.7 s


In [9]:
books_reviewed_by_users_df = pd.DataFrame(books_reviewed_by_users, columns=['User', 'Books_Reviewed'])

Below show a sample of the users and their books reviewed.

In [10]:
books_reviewed_by_users_df.head()

Unnamed: 0,User,Books_Reviewed
0,4,"{7, 264, 492, 401, 54, 344, 1210, 575}"
1,1,"{1796, 6665, 3638, 54, 1176}"
2,9,"{3, 1923, 264, 238, 52, 344, 56, 795}"
3,15,"{3, 1381, 2661, 1993, 1645, 1042, 52, 310, 344..."
4,18,"{1153, 1923, 743, 7, 3883, 2923, 4979, 534, 32..."


These are the unique users in our test sample.

In [11]:
books_reviewed_by_users_df.User.unique()

array([  4,   1,   9,  15,  18,  22,  32,  34,  40,  31,  55,  65,  70,
        72,  73,  75,  76,  61,  54,  78,  89,  93,  26,  29, 103, 105,
       108, 112, 113, 115, 116, 125, 126, 137, 135, 124, 142, 149, 156,
         8, 130, 158, 167, 169, 177, 175, 178, 171, 185, 184, 183, 168,
       106, 195, 143, 199, 179,  10, 203, 204, 207, 214, 206, 220, 212,
       228, 229, 232, 233, 237, 239, 240, 245, 246, 242, 250, 248, 257,
       247, 258, 162, 263, 264, 265, 274, 278, 283, 276, 286, 287])

Just to explore a little more, selecting a specific user. User at index 7 is user_id 34.

In [12]:
selected_user_index = 7
reviewed_books = books_reviewed_by_users_df.iloc[selected_user_index, 1]
reviewed_books

{232, 1343, 1657, 2407, 3314}

In [13]:
selected_user_id = books_reviewed_by_users_df.iloc[selected_user_index, 0]
selected_user_id

34

In [14]:
books_reviewed_by_users_df.shape

(90, 2)

In [15]:
reviewed_books_sub = books_data.loc[books_data.book_id.isin(reviewed_books), 
                                    ['book_id','original_title', 'authors', 'average_rating']]
reviewed_books_sub = reviewed_books_sub.set_index('book_id')

These are the books reviewed by user 34.

In [16]:
reviewed_books_sub

Unnamed: 0_level_0,original_title,authors,average_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
232,The Gunslinger,Stephen King,3.99
1343,The Light Fantastic,Terry Pratchett,3.94
1657,Infinite Jest,David Foster Wallace,4.31
2407,Princeps' Fury,Jim Butcher,4.35
3314,Quicksilver,Neal Stephenson,3.92


In [17]:
pd.set_option('max_colwidth', 200)

Table below shows the books by user 34 alongside their genre information.

In [18]:
display(books_meta_data.loc[books_meta_data.book_id.isin(reviewed_books), 
                            ['book_id', 'genre']].set_index('book_id').join(reviewed_books_sub))

Unnamed: 0_level_0,genre,original_title,authors,average_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
232,Fiction / Fantasy / Dark Fantasy Fiction / Thrillers / Supernatural,The Gunslinger,Stephen King,3.99
2407,Fiction / Fantasy / Action & Adventure Fiction / Fantasy / Epic Fiction / Fantasy / Military,Princeps' Fury,Jim Butcher,4.35
1343,Fiction / Fantasy / Action & Adventure Fiction / Fantasy / Contemporary Fiction / Fantasy / Humorous,The Light Fantastic,Terry Pratchett,3.94
1657,Fiction / Classics Fiction / Literary Fiction / Sports,Infinite Jest,David Foster Wallace,4.31
3314,Fiction / Fantasy / Historical Fiction / Historical / General Fiction / Science Fiction / General,Quicksilver,Neal Stephenson,3.92


# Partition The Data

Here, we are going to randomly take out one book from the reader's reviewed list of books. 

In [19]:
books_reviewed_by_users_df.head()

Unnamed: 0,User,Books_Reviewed
0,4,"{7, 264, 492, 401, 54, 344, 1210, 575}"
1,1,"{1796, 6665, 3638, 54, 1176}"
2,9,"{3, 1923, 264, 238, 52, 344, 56, 795}"
3,15,"{3, 1381, 2661, 1993, 1645, 1042, 52, 310, 344, 795, 7260}"
4,18,"{1153, 1923, 743, 7, 3883, 2923, 4979, 534, 3254, 344, 1176}"


In [20]:
part_list, left_out_list = partition(books_reviewed_by_users_df.Books_Reviewed.values)

In [21]:
books_reviewed_by_users_df.loc[:, 'Books_Reviewed'] = part_list
books_reviewed_by_users_df.loc[:, 'Left_Out'] = left_out_list

We are also going to extract the reader's rating to the book they've reviewed.

In [22]:
def book_ratings_for_user(user_id):
    ''' Returns a dictionary mapping their rating to the book they reviewed.
    '''
    reviewed = books_reviewed_by_users_df[books_reviewed_by_users_df.User == user_id].Books_Reviewed.values[0]
    selected = readers_data.loc[(readers_data.user_id==user_id) & readers_data.book_id.isin(reviewed),['book_id','rating']]
    return selected.set_index('book_id').to_dict()['rating']

In [23]:
books_reviewed_by_users_df.loc[:, 'Books_Rating'] = books_reviewed_by_users_df.User.map(book_ratings_for_user)

In [24]:
books_reviewed_by_users_df.head(n=5)

Unnamed: 0,User,Books_Reviewed,Left_Out,Books_Rating
0,4,"{7, 264, 492, 54, 344, 1210, 575}",401,"{264: 3, 575: 3, 492: 4, 1210: 5, 344: 3, 54: 4, 7: 4}"
1,1,"{1796, 3638, 54, 1176}",6665,"{3638: 3, 1796: 5, 1176: 4, 54: 3}"
2,9,"{1923, 264, 238, 52, 344, 56, 795}",3,"{344: 4, 1923: 4, 238: 3, 795: 4, 264: 4, 52: 4, 56: 5}"
3,15,"{3, 2661, 1993, 1645, 1042, 52, 310, 344, 795, 7260}",1381,"{310: 5, 795: 5, 1993: 5, 7260: 4, 344: 4, 1042: 4, 3: 5, 52: 5, 2661: 4, 1645: 4}"
4,18,"{1153, 1923, 7, 3883, 2923, 4979, 534, 3254, 344, 1176}",743,"{344: 4, 1923: 4, 1176: 3, 7: 5, 1153: 4, 2923: 4, 3883: 3, 4979: 3, 3254: 3, 534: 4}"


# Make Recommendations

Now, we're going to use the reader's book list, and their rating of the books they reviewed to get recommendations.

In [25]:
%%time
recommended_books_with_scores = book_recommender.recommend(books_reviewed_by_users_df.User.values,
                                                           books_reviewed_by_users_df.Books_Reviewed.values,
                                                           books_reviewed_by_users_df.Books_Rating.values)

CPU times: user 2min 40s, sys: 202 ms, total: 2min 40s
Wall time: 2min 40s


The list returned contains sorted ordered dictionary in descending order of each book's score for each reader. Below, we are also going to extract just the book id's.

In [26]:
recommended_books = [list(recommended.keys()) for recommended in recommended_books_with_scores]

# Hit Rate Evaluation

Here, we are evaluating the hit rate, where a hit is counted as 1 if the left out book from the reader's original reviewed list is present in the recommended list.

In [27]:
hrate = hit_rate(recommended_books, books_reviewed_by_users_df.Left_Out.values)
hrate

0.7555555555555555

# Recommended Books Detail

The method `top_recommendation_by_user` below extracts the Top-N recommendation for users. The `get_top_recommendation` further manipulates the scores to discourage author repetition.

In [28]:
books_reviewed_by_users_df.loc[:, 'Recommended_Books'] = recommended_books_with_scores

In [29]:
def top_recommendation_by_user(user_id, k=10):
    '''Top-K recommendation for the user.
    
    Args:
    ----
    user_id: user id of the reader.
    k: limit on the number of the recommended books.
    
    Returns:
    -------
    
    top_md: The Top-K recommendations which discourages repeated authors.
    top_md_original: The Top-K recommendations from the originally returned recommendations.
    
    '''

    books_with_scores =  books_reviewed_by_users_df.loc[books_reviewed_by_users_df.User == user_id, 
                                            'Recommended_Books'].values[0]

    top = book_recommender.get_top_recommendations(books_with_scores, k)

    top_md = pd.DataFrame(columns=['book_id', 'title', 'authors', 'average_rating'])
    top_md_original = pd.DataFrame(columns=['book_id', 'title', 'authors', 'average_rating'])

    for book in top:
        top_md = top_md.append(books_data.loc[books_data.book_id == book, 
                                              ['book_id', 'title', 'authors', 'average_rating']])
        
    for book in list(books_with_scores.keys())[:k]:
        top_md_original = top_md_original.append(books_data.loc[books_data.book_id == book, 
                                                                ['book_id', 'title', 'authors', 'average_rating']])

    return top_md, top_md_original

Displaying the test readers again for easy user selection

In [30]:
books_reviewed_by_users_df.User.unique()

array([  4,   1,   9,  15,  18,  22,  32,  34,  40,  31,  55,  65,  70,
        72,  73,  75,  76,  61,  54,  78,  89,  93,  26,  29, 103, 105,
       108, 112, 113, 115, 116, 125, 126, 137, 135, 124, 142, 149, 156,
         8, 130, 158, 167, 169, 177, 175, 178, 171, 185, 184, 183, 168,
       106, 195, 143, 199, 179,  10, 203, 204, 207, 214, 206, 220, 212,
       228, 229, 232, 233, 237, 239, 240, 245, 246, 242, 250, 248, 257,
       247, 258, 162, 263, 264, 265, 274, 278, 283, 276, 286, 287])

Table below shows the original list by the selected user.

In [31]:
sel_user = 220

In [32]:
select_user_books = readers_data[(readers_data.user_id == sel_user) 
                                 & (readers_data.book_id.isin(books_available)) 
                                ].book_id.values

In [33]:
books_data.loc[books_data.book_id.isin(select_user_books), ['book_id', 'title', 'authors', 'average_rating']]

Unnamed: 0,book_id,title,authors,average_rating
6,7,The Hobbit,J.R.R. Tolkien,4.25
192,193,Outliers: The Story of Success,Malcolm Gladwell,4.11
210,211,Blink: The Power of Thinking Without Thinking,Malcolm Gladwell,3.89
343,344,Naked,David Sedaris,4.08
491,492,"Speaker for the Dead (Ender's Saga, #2)",Orson Scott Card,4.04
574,575,Timeline,Michael Crichton,3.83
950,951,Survivor,Chuck Palahniuk,3.93
1391,1392,"Sex, Drugs, and Cocoa Puffs: A Low Culture Manifesto",Chuck Klosterman,3.75
1644,1645,Rant,Chuck Palahniuk,3.82
1808,1809,"Mossflower (Redwall, #2)",Brian Jacques,4.09


The two tables below shows the Top-N books for the selected user. The first table discourage author while the second displays the N books from the recommendations ordered by score as it is.

In [36]:
top_md, top_md_original = top_recommendation_by_user(sel_user, 200)
display(top_md.head(n=15))
display(top_md_original.head(n=15))

Unnamed: 0,book_id,title,authors,average_rating
533,534,When You Are Engulfed in Flames,David Sedaris,4.04
2899,2900,"Ender in Exile (Ender's Saga, #1.2)",Orson Scott Card,3.89
1645,1646,"Shadow of the Hegemon (Ender's Shadow, #2)",Orson Scott Card,3.93
2375,2376,"Lord Brocktree (Redwall, #13)",Brian Jacques,3.95
2010,2011,Snuff,Chuck Palahniuk,3.18
2197,2198,"Queen of Sorcery (The Belgariad, #2)",David Eddings,4.13
106,107,A Walk to Remember,Nicholas Sparks,4.15
1922,1923,Barrel Fever: Stories and Essays,David Sedaris,3.78
7382,7383,"Outcast of Redwall (Redwall, #8)",Brian Jacques,3.9
210,211,Blink: The Power of Thinking Without Thinking,Malcolm Gladwell,3.89


Unnamed: 0,book_id,title,authors,average_rating
533,534,When You Are Engulfed in Flames,David Sedaris,4.04
2899,2900,"Ender in Exile (Ender's Saga, #1.2)",Orson Scott Card,3.89
1645,1646,"Shadow of the Hegemon (Ender's Shadow, #2)",Orson Scott Card,3.93
1922,1923,Barrel Fever: Stories and Essays,David Sedaris,3.78
2010,2011,Snuff,Chuck Palahniuk,3.18
5287,5288,"The Lost Gate (Mither Mages, #1)",Orson Scott Card,3.82
9642,9643,"The Gate Thief (Mither Mages, #2)",Orson Scott Card,3.77
2375,2376,"Lord Brocktree (Redwall, #13)",Brian Jacques,3.95
7382,7383,"Outcast of Redwall (Redwall, #8)",Brian Jacques,3.9
210,211,Blink: The Power of Thinking Without Thinking,Malcolm Gladwell,3.89


The first table is a little more diverse with respect to authors compared to the second table.