In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import HashingVectorizer


# Data Analysis & Pre-processing

In [3]:
df = pd.read_csv("books.csv")

In [4]:
df.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [5]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11127 entries, 0 to 11126
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11127 non-null  int64  
 1   title               11127 non-null  object 
 2   authors             11127 non-null  object 
 3   average_rating      11127 non-null  float64
 4   isbn                11127 non-null  object 
 5   isbn13              11127 non-null  int64  
 6   language_code       11127 non-null  object 
 7     num_pages         11127 non-null  int64  
 8   ratings_count       11127 non-null  int64  
 9   text_reviews_count  11127 non-null  int64  
 10  publication_date    11127 non-null  object 
 11  publisher           11127 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


In [7]:
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759888000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442896400000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780345000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780586000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780873000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790008000000.0,6576.0,4597666.0,94265.0


In [8]:
print(df["language_code"].unique())
print(len(df["language_code"].unique()))

['eng' 'en-US' 'fre' 'spa' 'en-GB' 'mul' 'grc' 'enm' 'en-CA' 'ger' 'jpn'
 'ara' 'nl' 'zho' 'lat' 'por' 'srp' 'ita' 'rus' 'msa' 'glg' 'wel' 'swe'
 'nor' 'tur' 'gla' 'ale']
27


In [9]:
uniq_lc = df["language_code"].unique()
eng_lc = []
for lc in uniq_lc:
    if 'en' in lc:
        eng_lc.append(lc)

print(eng_lc)

['eng', 'en-US', 'en-GB', 'enm', 'en-CA']


In [10]:
books_eng = df[(df["language_code"] == 'eng')|
                (df["language_code"] == 'en-US')|
                (df["language_code"] == 'en-GB')|
                (df["language_code"] == 'enm')|
                (df["language_code"] == 'en-CA')]

In [11]:
books_eng

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic
...,...,...,...,...,...,...,...,...,...,...,...,...
11121,45630,Whores for Gloria,William T. Vollmann,3.69,0140231579,9780140231571,en-US,160,932,111,2/1/1994,Penguin Books
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books


# Feature Engineering

In [15]:
feature = df["title"]
# test_sample = df["title"][2]
# test_sample = "Harry Potter"
test_sample = input("What is your favourite book:")
vectorizer = HashingVectorizer(n_features=20)
feature_vec = vectorizer.transform(feature)
test_vec = vectorizer.transform([test_sample])

# Recommendation Model 

In [16]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=7)
model_knn.fit(feature_vec)
distances, indices = model_knn.kneighbors(test_vec, 7, return_distance=True)
print(indices)
df.iloc[indices[0]]

[[ 7363  8533     6   988  9742 10349   615]]


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
7363,28371,Witch Grass,Raymond Queneau/Barbara Wright,3.93,1590170318,9781590170311,eng,328,335,38,1/31/2003,NYRB Classics
8533,32773,Mountain Madness (Wilderness #24),David Robbins/David Thompson,4.45,843943998,9780843943993,eng,170,22,2,6/1/1998,Leisure Books
6,10,Harry Potter Collection (Harry Potter #1-6),J.K. Rowling,4.73,439827604,9780439827607,eng,3342,28242,808,9/12/2005,Scholastic
988,3357,Harry Potter Y La Piedra Filosofal (Harry Pott...,J.K. Rowling,4.47,613359607,9780613359603,spa,254,142,12,3/6/2001,Turtleback Books
9742,38936,Lori's Little Secret (Bravo Family #15) (Brav...,Christine Rimmer,3.76,373246838,9780373246830,eng,248,101,6,5/1/2005,Silhouette
10349,41909,Harry Potter ve Sırlar Odası (Harry Potter #2),J.K. Rowling/Sevin Okyay,4.42,3570211029,9783570211021,tur,403,1000,41,10/1/2001,Yapı Kredi Yayınları
615,2005,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling,4.57,747584664,9780747584667,eng,768,1213,78,6/23/2006,Bloomsbury Publishing


In [None]:
# the deduct the user input, duplicate title of the input, 
# and then output the booktitle, authors, average rating and rating numbers, and publisher