In [1]:
#import libraries
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [5]:
path = "/content/sample_data/book-crossings.zip"


!unzip /content/sample_data/book-crossings.zip



books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

Archive:  /content/sample_data/book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace BX-Books.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace BX-Users.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [6]:
!ls BX-Books.csv

BX-Books.csv


In [7]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [8]:
df_books.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [9]:
df_ratings.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [10]:
print(f"The shape of books df is: {df_books.shape}")
print(f"The shape of ratings df is: {df_ratings.shape}")

The shape of books df is: (271379, 3)
The shape of ratings df is: (1149780, 3)


In [11]:
df_books["title"].nunique()

242154

In [12]:
df_ratings.duplicated().sum()

0

In [13]:
books_with_ratings = df_ratings.merge(df_books, on = "isbn")
books_with_ratings

Unnamed: 0,user,isbn,rating,title,author
0,276725,034545104X,0.0,Flesh Tones: A Novel,M. J. Rose
1,276726,0155061224,5.0,Rites of Passage,Judith Rae
2,276727,0446520802,0.0,The Notebook,Nicholas Sparks
3,276729,052165615X,3.0,Help!: Level 1,Philip Prowse
4,276729,0521795028,6.0,The Amsterdam Connection : Level 4 (Cambridge ...,Sue Leather
...,...,...,...,...,...
1031170,276704,0876044011,0.0,Edgar Cayce on the Akashic Records: The Book o...,Kevin J. Todeschi
1031171,276704,1563526298,9.0,Get Clark Smart : The Ultimate Guide for the S...,Clark Howard
1031172,276706,0679447156,0.0,Eight Weeks to Optimum Health: A Proven Progra...,Andrew Weil
1031173,276709,0515107662,10.0,The Sherbrooke Bride (Bride Trilogy (Paperback)),Catherine Coulter


In [14]:
# dropping users with < 200 ratings:
x = books_with_ratings.groupby('user').count()['rating'] >= 200

# using boolean indexing to get users which are eligible:

eligible_users = x[x].index
eligible_users

Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int32', name='user', length=816)

In [15]:
filtered_ratings = books_with_ratings[books_with_ratings['user'].isin(eligible_users)]
filtered_ratings

Unnamed: 0,user,isbn,rating,title,author
1150,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1151,277427,0026217457,0.0,Vegetarian Times Complete Cookbook,Lucy Moll
1152,277427,003008685X,8.0,Pioneers,James Fenimore Cooper
1153,277427,0030615321,0.0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau
1154,277427,0060002050,0.0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens
...,...,...,...,...,...
1029396,275970,1931868123,0.0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias
1029397,275970,3411086211,10.0,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt
1029398,275970,3829021860,0.0,The Penis Book,Joseph Cohen
1029399,275970,4770019572,0.0,Musashi,Eiji Yoshikawa


In [17]:
# Dropping books with less than 100 ratings:
y = filtered_ratings.groupby('title').count()['rating'] >= 100
eligible_books = y[y].index
eligible_books

# here the books are indexed by their titles only

Index(['1st to Die: A Novel', '2nd Chance', 'A Bend in the Road',
       'A Is for Alibi (Kinsey Millhone Mysteries (Paperback))',
       'A Map of the World', 'A Painted House', 'A Prayer for Owen Meany',
       'A Time to Kill', 'A Walk to Remember', 'Airframe',
       ...
       'We Were the Mulvaneys', 'When the Wind Blows',
       'Where the Heart Is (Oprah's Book Club (Paperback))',
       'While I Was Gone', 'Whispers', 'White Oleander : A Novel',
       'White Oleander : A Novel (Oprah's Book Club)',
       'Wicked: The Life and Times of the Wicked Witch of the West',
       'Wild Animus', '\O\" Is for Outlaw"'],
      dtype='object', name='title', length=152)

In [18]:
filtered_books = filtered_ratings[filtered_ratings["title"].isin(eligible_books)]

# this will keep only those books which are present in our eligible_books df

In [19]:
filtered_books

# we can see we have duplicates in this data, so we will drop duplicates on the basis of titles. we're not dropping duplicated on the basis of isbn as same title has different isbns

Unnamed: 0,user,isbn,rating,title,author
1163,277427,0060930535,0.0,The Poisonwood Bible: A Novel,Barbara Kingsolver
1165,277427,0060934417,0.0,Bel Canto: A Novel,Ann Patchett
1168,277427,0061009059,9.0,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich
1188,277427,0142001740,0.0,The Secret Life of Bees,Sue Monk Kidd
1206,277427,0312966091,0.0,Three To Get Deadly : A Stephanie Plum Novel (...,Janet Evanovich
...,...,...,...,...,...
1028833,275970,0618002227,0.0,The Fellowship of the Ring (The Lord of the Ri...,J. R. R. Tolkien
1028852,275970,0670032379,0.0,The Secret Life of Bees,Sue Monk Kidd
1028904,275970,0679442790,0.0,The Reader,Bernhard Schlink
1028938,275970,0679893105,0.0,"The Golden Compass (His Dark Materials, Book 1)",PHILIP PULLMAN


In [20]:
filtered_books.drop_duplicates()

Unnamed: 0,user,isbn,rating,title,author
1163,277427,0060930535,0.0,The Poisonwood Bible: A Novel,Barbara Kingsolver
1165,277427,0060934417,0.0,Bel Canto: A Novel,Ann Patchett
1168,277427,0061009059,9.0,One for the Money (Stephanie Plum Novels (Pape...,Janet Evanovich
1188,277427,0142001740,0.0,The Secret Life of Bees,Sue Monk Kidd
1206,277427,0312966091,0.0,Three To Get Deadly : A Stephanie Plum Novel (...,Janet Evanovich
...,...,...,...,...,...
1028833,275970,0618002227,0.0,The Fellowship of the Ring (The Lord of the Ri...,J. R. R. Tolkien
1028852,275970,0670032379,0.0,The Secret Life of Bees,Sue Monk Kidd
1028904,275970,0679442790,0.0,The Reader,Bernhard Schlink
1028938,275970,0679893105,0.0,"The Golden Compass (His Dark Materials, Book 1)",PHILIP PULLMAN


In [21]:
# creating a pivot table to pass into nearest neighbors, to find distances:

pivot = filtered_books.pivot_table(index = "title", columns = "user", values = "rating")
pivot

user,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
A Is for Alibi (Kinsey Millhone Mysteries (Paperback)),,,7.0,,,,,,,,...,,,,,10.0,0.0,,,,
A Map of the World,,,,,,,,,,,...,0.0,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
White Oleander : A Novel,0.0,,,7.0,0.0,0.0,,8.0,0.0,,...,,,,,,,0.0,0.0,,
White Oleander : A Novel (Oprah's Book Club),,,0.0,,0.0,0.0,,0.0,,,...,0.0,0.0,,,,,,0.0,,
Wicked: The Life and Times of the Wicked Witch of the West,,,,,0.0,,,0.0,,10.0,...,,0.0,,,,,,,,
Wild Animus,,,6.0,0.0,0.0,,,0.0,,0.0,...,,0.0,,,0.0,,,0.0,,


In [22]:
# # filling in the null values with 0s
pivot.fillna(0, inplace = True)
pivot = pivot.sort_index()
pivot.head(3)

user,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
book = 'The Lovely Bones: A Novel'
pivot.loc[book,:].sum()

846.0

In [24]:
cluster = NearestNeighbors(algorithm="brute", metric = "cosine")

In [25]:
cluster.fit(pivot)
distances, indices = cluster.kneighbors(pivot)

In [26]:
# distances
# first element in indices is the book itself and the next elements are the books closest to it.
indices[0]

array([  0,  11,  66, 140,  51])

In [27]:
# first element is the book itself and the rest are distances from book given to 5 closest books
distances[0]

array([0.        , 0.69253355, 0.7219643 , 0.722319  , 0.7284503 ],
      dtype=float32)

In [28]:
# dist of books from given book:
distances[np.where(pivot.index == "2nd Chance")[0][0]][0]

0.0

In [29]:
# function to return recommended books - this will be tested
def get_recommends(book = "Where the Heart Is (Oprah's Book Club (Paperback))", n = 5):
  dist, suggested_books = cluster.kneighbors(pivot.loc[book].values.reshape(1,-1), n_neighbors=6)
  # return suggested_books[0]
  books = pivot.iloc[suggested_books[0]].index.values
  result = list(zip(books, dist[0]))
  result[0] = result[0][0]
  result[1] = sorted([[book,distance] for book,distance in result[1:]], key=lambda x: -x[1])[:4]
  recommended_books = result[:2]
  return recommended_books
get_recommends()

["Where the Heart Is (Oprah's Book Club (Paperback))",
 [["The Pilot's Wife : A Novel", 0.8168827],
  ['Bel Canto: A Novel', 0.8146534],
  ['The Joy Luck Club', 0.8086661],
  ['The Notebook', 0.8009312]]]

In [30]:
print(get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))"))

["Where the Heart Is (Oprah's Book Club (Paperback))", [["The Pilot's Wife : A Novel", 0.8168827], ['Bel Canto: A Novel', 0.8146534], ['The Joy Luck Club', 0.8086661], ['The Notebook', 0.8009312]]]


In [31]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [["The Pilot's Wife : A Novel", 0.8168827], ['Bel Canto: A Novel', 0.8146534], ['The Joy Luck Club', 0.8086661], ['The Notebook', 0.8009312]]]
You haven't passed yet. Keep trying!
