In [564]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import HashingVectorizer


# Data Analysis & Pre-processing

In [565]:
df = pd.read_csv("books.csv")

In [566]:
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [567]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [568]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11127 non-null  int64  
 1   title               11127 non-null  object 
 2   authors             11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  int64  
 6   language_code       11127 non-null  object 
 7     num_pages         11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [569]:
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759888000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442896400000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780586000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780873000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


Only keep the English language books.

In [570]:
uniq_lc = df["language_code"].unique()
eng_lc = []
for lc in uniq_lc:
    if 'en' in lc:
        eng_lc.append(lc)

print(eng_lc)

['eng', 'en-US', 'en-GB', 'enm', 'en-CA']


In [571]:
books_eng = df[(df["language_code"] == 'eng')|
                (df["language_code"] == 'en-US')|
                (df["language_code"] == 'en-GB')|
                (df["language_code"] == 'enm')|
                (df["language_code"] == 'en-CA')]

In [572]:
books_eng

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic
...,...,...,...,...,...,...,...,...,...,...,...,...
11121,45630,Whores for Gloria,William T. Vollmann,3.69,0140231579,9780140231571,en-US,160,932,111,2/1/1994,Penguin Books
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books


# Feature Engineering

In [573]:
feature = books_eng["title"] # take book title as the feature information for KNN
vectorizer = HashingVectorizer(n_features=2**12) # encode text into numeric values with hashing, no vocabulary dictionary needed, using a large feature number to avoid mapping coliisons
feature_vec = vectorizer.transform(feature)

In [574]:
user_input = input("What is your favourite book:") # same process to transform user input
user_vec = vectorizer.transform([user_input])
print(user_input)

8 rules of love


# Recommendation Model 

In [575]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=111) # cosine similarity as item to item similarity metrics, pair-wise comparison of all samples calculated (brute), k = sqrt(N_training_samples) for noise and bias balance, odd number to avoid ties
model_knn.fit(feature_vec)
distances, indices = model_knn.kneighbors(user_vec, 10, return_distance=True) # limit the recommendation up to 10 for user
indices = indices.tolist()
print(indices)

[[1621, 2577, 3889, 2933, 4339, 7807, 1099, 8676, 5991, 6818]]


In [576]:
type(indices)

list

In [577]:
books_eng.iloc[indices[0]]

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
1706,5937,Love,Pablo Neruda,4.46,0786881488,9780786881482,eng,43,1333,68,6/16/1995,Miramax Books
2694,9912,The Rules of Attraction,Bret Easton Ellis,3.68,067978148X,9780679781486,eng,283,35311,974,6/30/1998,Vintage Contemporaries
4110,14679,Love,Stendhal/Gilbert Sale/Suzanne Sale/B.C.J.G. Kn...,3.71,014044307X,9780140443073,eng,336,1083,51,8/28/1975,Penguin Classics
3073,11326,Love,Toni Morrison,3.75,1400078474,9781400078479,eng,224,7580,452,1/4/2005,Vintage
4593,16532,Of Love and Shadows,Isabel Allende/Margaret Sayers Peden,3.97,0553383833,9780553383836,eng,304,17808,403,8/30/2005,Dial Press Trade Paperback
8239,31659,The Progress of Love,Alice Munro,4.14,0375724702,9780375724701,eng,320,2128,115,12/12/2000,Vintage
1159,3867,The History of Love,Nicole Krauss,3.92,0393328627,9780393328622,eng,255,110082,9777,5/17/2006,Norton
9151,35711,A General Theory of Love,Thomas Lewis/Fari Amini/Richard Lannon,4.11,0375709223,9780375709227,eng,288,3675,422,1/9/2001,Vintage
6338,23876,Of Love and Other Demons,Gabriel García Márquez,3.98,0517405091,9780517405093,eng,160,35045,1116,5/2/1995,Penguin Group (USA)
7211,27627,EULIS! The History of Love,Paschal Beverly Randolph,3.85,0766184153,9780766184152,eng,144,9,0,3/5/2004,Kessinger Publishing


# User Interface

In [578]:
print("Based on your favourite book",'"', user_input, '"',", here are some books you might like:")
count = 0
suggestion_set = set()
for i in range(len(indices[0])):
    suggest = books_eng["title"].iloc[indices[0][i]]
    if user_input.strip().lower() != suggest.strip().lower():
        suggestion_set.add(suggest)

for item in suggestion_set:
    count += 1
    print(count,": ",item)

Based on your favourite book " 8 rules of love " , here are some books you might like:
1 :  The Rules of Attraction
2 :  Love
3 :  Of Love and Other Demons
4 :  The Progress of Love
5 :  The History of Love
6 :  Of Love and Shadows
7 :  A General Theory of Love
8 :  EULIS! The History of Love


# Comments and Future Work

Using only book title gives recommendation in books with similar or same words regardless of the contex, not exactly "accurate". Having more information such as book summary and book genre in the datasetet might help with the recommendation.  