In [29]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import HashingVectorizer


# Data Analysis & Pre-processing

In [30]:
df = pd.read_csv("books.csv")

In [31]:
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [32]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11127 non-null  int64  
 1   title               11127 non-null  object 
 2   authors             11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  int64  
 6   language_code       11127 non-null  object 
 7     num_pages         11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [34]:
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759888000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442896400000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780586000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780873000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


Only keep the English language books.

In [35]:
uniq_lc = df["language_code"].unique()
eng_lc = []
for lc in uniq_lc:
    if 'en' in lc:
        eng_lc.append(lc)

print(eng_lc)

['eng', 'en-US', 'en-GB', 'enm', 'en-CA']


In [36]:
books_eng = df[(df["language_code"] == 'eng')|
                (df["language_code"] == 'en-US')|
                (df["language_code"] == 'en-GB')|
                (df["language_code"] == 'enm')|
                (df["language_code"] == 'en-CA')]

In [37]:
books_eng

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic
...,...,...,...,...,...,...,...,...,...,...,...,...
11121,45630,Whores for Gloria,William T. Vollmann,3.69,0140231579,9780140231571,en-US,160,932,111,2/1/1994,Penguin Books
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books


# Feature Engineering

In [38]:
feature = books_eng["title"] # take book title as the feature information for KNN
vectorizer = HashingVectorizer(n_features=2**12) # encode text into numeric values with hashing, no vocabulary dictionary needed, using a large feature number to avoid mapping coliisons
feature_vec = vectorizer.transform(feature)

In [39]:
user_input = input("What is your favourite book:") # same process to transform user input
user_vec = vectorizer.transform([user_input])
print(user_input)

Kafka on the shore - haruki murakami


# Recommendation Model 
Choosing K-NearestNeighbor as the recommendation model to start with. It is a common baseline for recommender systems, and it's easy to implement.

In [40]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=111) # cosine similarity as item to item similarity metrics - prioritising orientation instead of magnitude, pair-wise comparison of all samples calculated (brute), k = sqrt(N_training_samples) for noise and bias balance, odd number to avoid ties
model_knn.fit(feature_vec)
distances, indices = model_knn.kneighbors(user_vec, 10, return_distance=True) # limit the recommendation up to 10 for user
indices = indices.tolist()

In [41]:
books_eng.iloc[indices[0]] # view for sense check

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
11046,45314,Kafka on the Shore,Haruki Murakami/Philip Gabriel,4.14,1400043662,9781400043668,en-US,436,1989,315,1/26/2005,Knopf Publishing Group
1405,4929,Kafka on the Shore,Haruki Murakami/Philip Gabriel,4.14,1400079276,9781400079278,eng,467,225397,12452,1/3/2006,Vintage International
783,2551,On the Road,Jack Kerouac,3.63,143036386,9780143036388,eng,320,488,39,9/6/2005,Penguin Books
11100,45536,On the Road,Jack Kerouac/Ann Charters,3.63,142437255,9780142437254,en-US,307,3271,342,1/3/2006,Penguin Classics
784,2552,On the Road,Jack Kerouac/Ann Charters,3.63,141182679,9780141182674,eng,281,5575,502,2/24/2000,Penguin Books
9585,38180,On the Beach,Nevil Shute,3.94,1842322761,9781842322765,eng,296,28414,1744,10/31/2002,House of Stratus
5600,20564,The Mill on the Floss,George Eliot/A.S. Byatt,3.79,141439629,9780141439624,eng,579,41816,1337,2/27/2003,Penguin Classics
10808,44145,The Bar on the Seine,Georges Simenon/David Watson,3.69,143038311,9780143038313,en-US,160,380,54,12/26/2006,Penguin Books
9385,37205,The Mill on the Floss,George Eliot,3.79,140620273,9780140620276,en-GB,536,198,32,2/24/1994,Penguin Books
3061,11296,Haruki Murakami and the Music of Words,Jay Rubin,3.83,99455447,9780099455448,eng,462,1444,68,1/6/2005,Vintage


In [42]:
# output for user below
print("Based on your favourite book",'"', user_input, '"',", here are some books you might like:")
count = 0
suggestion_set = set()
for i in range(len(indices[0])): # filter out the same book as the user input, the repeating book title output, and only suggest the rest
    suggest = books_eng["title"].iloc[indices[0][i]]
    if user_input.strip().lower() != suggest.strip().lower():
        suggestion_set.add(suggest)

for item in suggestion_set:
    count += 1
    print(count,": ",item)

Based on your favourite book " Kafka on the shore - haruki murakami " , here are some books you might like:
1 :  Kafka on the Shore
2 :  The Mill on the Floss
3 :  Haruki Murakami and the Music of Words
4 :  The Bar on the Seine
5 :  On the Beach
6 :  On the Road


# Evaluation
1. Manually checking if the list of recommendation is reasonable
2. Sending the result to user for feedback

## Some of the results

-----------------------
Result 1:    
>Based on your favourite book " 8 rules of love " , here are some books you might like:     
>1 :  The Rules of Attraction           
>2 :  Love            
>3 :  Of Love and Other Demons           
>4 :  The Progress of Love          
>5 :  The History of Love         
>6 :  Of Love and Shadows        
>7 :  A General Theory of Love          
>8 :  EULIS! The History of Love           

Analysis:   
The keywords "love" is picked up from the title, and the books recommended are relevant functional books on ruls of love.

------------------------
Result 2:   
>Based on your favourite book " To kill a mockingbird " , here are some books you might like:        
>1 :  The Emotional Intelligence Quick Book: Everything You Need to Know to Put Your EQ to Work      
>2 :  Harper Lee's To Kill a Mockingbird (Bloom's Guides)      
>3 :  How to Go to College Almost for Free  Updated       
>4 :  A Passage to India: A Reader's Guide to Essential Criticism        
>5 :  To Have and to Hold       
>6 :  Can't Wait to Get to Heaven           
>7 :  Writings 1878–1899: Psychology: Briefer Course / The Will to Believe / Talks to Teachers and to Students / Essays

Analysis:         
We can see in the pattern of the title that wiords similar to "kill" have been picked up, and the phrase structure "to do" has been picked up. However, in terms of book genre and content, most recommended books are a different type from the input.

------------------------
Result 3:
>Based on your favourite book " the time traveler's wife " , here are some books you might like:        
>1 :  The Illuminati Papers           
>2 :  The Pilot's Wife               
>3 :  The Part-Time Wife (The Secret Lives of Society Wives  #6)                
>4 :  The Time Machine             
>5 :  The Perfect Wife            
>6 :  The Illuminati    
              
Analysis:        
The word "the" and title with such structure has been picked up, keyword "wife" and "time" has also been picked up in different ways. Most of the recommendations are somehow relevant. User reported that they have not heard most of the recommended books.

------------------------

# Comments and Future Work

It seems that using only book title gives recommendation in books with similar or same words. While this might be useful to some extent, context is also important. 
             
To give more context-aware recommendation, having more information such as book summary and book genre in the datasetet might help with the recommendation in the future.  