In [2]:
import numpy as np
import pandas as pd
import math
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import matplotlib as mpl
from bokeh.plotting import figure, output_file, show
from bokeh.palettes import magma
import pandas as pd

In [5]:
# Names of the two CSV inputs that the loading cell passes to pd.read_csv.
books_filename, ratings_filename = 'Books.csv', 'Ratings.csv'

In [7]:
# Book catalogue: only isbn / title / author are loaded, all as strings.
books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype=dict.fromkeys(['isbn', 'title', 'author'], 'str'),
)

# Ratings: one row per (user, isbn) rating.
ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
)

# Number of ratings per user and per ISBN, both counted on the unfiltered data.
counts1 = ratings['user'].value_counts()
counts2 = ratings['isbn'].value_counts()

# Remove ratings made by users with fewer than 200 ratings and ratings of
# books with fewer than 100 ratings. Both conditions use the counts above, so
# they can be applied together as a single row mask.
ratings = ratings[
    ~ratings['user'].isin(counts1.loc[counts1 < 200].index)
    & ~ratings['isbn'].isin(counts2.loc[counts2 < 100].index)
]

# Attach title/author to every remaining rating, then keep the first row for
# each (title, user) pair so that every pivot cell gets exactly one value.
final_df = (
    books
    .merge(ratings, on="isbn")
    .drop_duplicates(subset=["title", "user"])
)

# Title-by-user rating matrix; (title, user) pairs without a rating become 0.
final_df_pivot = final_df.pivot(index='title', columns='user', values='rating').fillna(0)


In [31]:
# Distinct titles present in the merged ratings frame.
final_df['title'].unique()

array(['The Testament', 'Beloved (Plume Contemporary Fiction)',
       'Wild Animus', 'Airframe', 'Timeline', 'To Kill a Mockingbird',
       'Seabiscuit: An American Legend', "I'll Be Seeing You",
       'From the Corner of His Eye', 'Isle of Dogs', 'Purity in Death',
       "Left Behind: A Novel of the Earth's Last Days (Left Behind #1)",
       'The Street Lawyer', 'Breathing Lessons', 'The Joy Luck Club',
       'The Tao of Pooh', 'Seabiscuit', 'The Catcher in the Rye',
       'Midnight in the Garden of Good and Evil: A Savannah Story',
       "Pretend You Don't See Her", 'The Pillars of the Earth',
       "Corelli's Mandolin : A Novel",
       'The Five People You Meet in Heaven', 'The Beach House',
       'Angels &amp; Demons', 'The Girl Who Loved Tom Gordon',
       'The Sum of All Fears', 'Little Altars Everywhere: A Novel',
       'Before I Say Good-Bye', 'Atonement : A Novel',
       "Bridget Jones's Diary", 'The God of Small Things',
       'The Big Bad Wolf: A Novel', 'Four

In [10]:
# Preview the first five rows of the title-by-user rating matrix.
final_df_pivot.head(n=5)

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
## function to return top n recommended books
def recommended_books(book = "", n = 5):
  """Print and return up to `n` titles closest to `book` by cosine distance.

  Every row of the module-level `final_df_pivot` (indexed by title, one
  column per user) is used as a rating vector. A brute-force
  `NearestNeighbors` model with the cosine metric is fitted on all rows and
  queried with the row belonging to `book`.

  Args:
    book: Title to look up; must be one of the labels in
      `final_df_pivot.index`.
    n: Maximum number of recommendations. It is capped at the number of
      other titles available; `0` yields an empty recommendation list.

  Returns:
    A two-element list `[book, recommendations]`, where `recommendations` is
    a list of `[title, distance]` pairs sorted by ascending distance. `book`
    itself is never included.

  Raises:
    ValueError: if `n` is negative or `book` is not a title in
      `final_df_pivot`.
  """
  titles = list(final_df_pivot.index.values)
  data = final_df_pivot.values

  if n < 0:
    raise ValueError('n must be non-negative, got {}'.format(n))
  try:
    book_ind = titles.index(book)
  except ValueError:
    # Same exception type as the bare list lookup, but with a usable message.
    raise ValueError(
      '{!r} is not a title in final_df_pivot'.format(book)
    ) from None

  model = NearestNeighbors(metric="cosine", algorithm="brute")
  model.fit(data)

  # The query row is part of the fitted data, so request one extra neighbour
  # to leave room for dropping it. Never request more rows than were fitted,
  # which kneighbors rejects.
  n_query = min(n + 1, len(titles))
  distances, indices = model.kneighbors(
    np.reshape(data[book_ind, :], [1, -1]),
    n_neighbors=n_query
  )

  # Drop the query title by index, not by position: kneighbors does not
  # promise that the query row comes back first (another title at the same
  # distance can precede it), so slicing off element 0 could remove a real
  # recommendation and leave `book` recommending itself.
  # Taking row 0 instead of squeeze() keeps the result a list even when only
  # a single neighbour was requested.
  raw_recommends = sorted(
    (
      (ind, dist)
      for ind, dist in zip(indices[0].tolist(), distances[0].tolist())
      if ind != book_ind
    ),
    key=lambda pair: pair[1]
  )[:n]

  recommendations = []
  print('Recommendations for {}:'.format(book))
  for rank, (ind, dist) in enumerate(raw_recommends, start=1):
    recommendations.append([titles[ind], dist])
    print('{0}: {1}, with distance of {2:,.2f}'.format(rank, titles[ind], dist))
  print('-----------------')
  return [book, recommendations]

In [32]:
# Ask for ten recommendations for a sample title and keep the returned list.
recommended_books_list = recommended_books(
    book="Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
    n=10,
)

Recommendations for Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)):
1: Harry Potter and the Chamber of Secrets (Book 2), with distance of 0.54
2: Harry Potter and the Prisoner of Azkaban (Book 3), with distance of 0.61
3: Harry Potter and the Goblet of Fire (Book 4), with distance of 0.65
4: Harry Potter and the Order of the Phoenix (Book 5), with distance of 0.73
5: A Time to Kill, with distance of 0.80
6: Three To Get Deadly : A Stephanie Plum Novel (A Stephanie Plum Novel), with distance of 0.80
7: Montana Sky, with distance of 0.81
8: Like Water for Chocolate: A Novel in Monthly Installments With Recipes, Romances and Home Remedies, with distance of 0.81
9: The Joy Luck Club, with distance of 0.81
10: Anne of Green Gables (Anne of Green Gables Novels (Paperback)), with distance of 0.81
-----------------
