# Book Recommender

In [114]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
from scipy.sparse.linalg import spsolve
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.api.types import CategoricalDtype
import random
import implicit
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)
from pprint import pprint

In [110]:
# Data import
books = pd.read_csv('book_reco/BX-Books.csv', sep=';', error_bad_lines=False, encoding="latin-1")
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
books = books.drop(['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL'], axis=1)
users = pd.read_csv('book_reco/BX-Users.csv', sep=';', error_bad_lines=False, encoding="latin-1")
users.columns = ['userID', 'Location', 'Age']
ratings = pd.read_csv('book_reco/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  interactivity=interactivity, compiler=compiler, result=result)


In [115]:
pprint(users.head(2))

   userID                   Location   Age
0       1         nyc, new york, usa   NaN
1       2  stockton, california, usa  18.0


In [116]:
pprint(books.head(2))

         ISBN            bookTitle
0  0195153448  Classical Mythology
1  0002005018         Clara Callan


In [117]:
pprint(ratings.head(2))

   userID        ISBN  bookRating
0  276725  034545104X           0
1  276726  0155061224           5


In [118]:
pprint(f'Number of Users {users.shape[0]}')
pprint(f'Columns in User dataframe {list(users.columns)}')
pprint(f'Number of Books {books.shape[0]}')
pprint(f'Columns in Books dataframe {list(books.columns)}')
pprint(f'Number of Ratings {ratings.shape[0]}')
pprint(f'Columns in Ratings dataframe {list(ratings.columns)}')

'Number of Users 278858'
"Columns in User dataframe ['userID', 'Location', 'Age']"
'Number of Books 271360'
"Columns in Books dataframe ['ISBN', 'bookTitle']"
'Number of Ratings 1149780'
"Columns in Ratings dataframe ['userID', 'ISBN', 'bookRating']"


## Data munging for utility matrix

In [135]:
# ratings by user for a particular book
books_with_ratings = pd.merge(ratings, books, on='ISBN')
books_with_ratings = books_with_ratings.dropna(axis=0, subset=['bookTitle'])
books_with_ratings = books_with_ratings.drop('ISBN', axis=1)
pprint(books_with_ratings.head(2))

   userID  bookRating             bookTitle
0  276725           0  Flesh Tones: A Novel
1    2313           5  Flesh Tones: A Novel


In [136]:
# How many times a book got rated
count_of_book_rating = (books_with_ratings.groupby('bookTitle')['bookRating'].count().reset_index().rename(columns={'bookRating': 'totalRatingCount'})[['bookTitle', 'totalRatingCount']])
pprint(count_of_book_rating.head(2))

                                           bookTitle  totalRatingCount
0   A Light in the Storm: The Civil War Diary of ...                 4
1                              Always Have Popsicles                 1


In [137]:
# User ratings with books and their total rating counts
combined_with_ratingcount = books_with_ratings.merge(count_of_book_rating,
                                    left_on='bookTitle',
                                    right_on='bookTitle', 
                                    how='left')
pprint(combined_with_ratingcount.head(2))

   userID  bookRating             bookTitle  totalRatingCount
0  276725           0  Flesh Tones: A Novel                60
1    2313           5  Flesh Tones: A Novel                60


In [138]:
# Excluding books that have rating count less than 50 only keeping popular ones
rating_threshold = 50
popular_books = combined_with_ratingcount.query('totalRatingCount >= @rating_threshold')
pprint(popular_books.head(2))

   userID  bookRating             bookTitle  totalRatingCount
0  276725           0  Flesh Tones: A Novel                60
1    2313           5  Flesh Tones: A Novel                60


In [140]:
# For location specific recommendation going to merge popular books df with users df
combined = popular_books.merge(users,
                                  left_on='userID',
                                  right_on='userID',
                                  how='left')
combined = combined.drop('totalRatingCount', axis=1)
usa_user_rating = combined[combined['Location'].str.contains('usa')]
usa_user_rating = usa_user_rating.drop('Age', axis=1)
pprint(usa_user_rating.head(2))

   userID  bookRating             bookTitle               Location
0  276725           0  Flesh Tones: A Novel      tyler, texas, usa
1    2313           5  Flesh Tones: A Novel  cincinnati, ohio, usa


## KNN based recommender

In [142]:
# Keeping only unique user and book rating combination
usa_user_rating = usa_user_rating.drop_duplicates(['userID', 'bookTitle'])

# Create pivot of user books and users and fill na with zero
usa_user_rating_pivot = usa_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)

# Conerting pivot table to CSR matrix
usa_user_rating_matrix = csr_matrix(usa_user_rating_pivot.values)

# Fitting the KNN algorithm
knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn.fit(usa_user_rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [144]:
# Choose a random number from book list
query_index = np.random.choice(usa_user_rating_pivot.shape[0])

# Extract index and distances of nearest neighbors of query index
distances, indices = knn.kneighbors(usa_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)

# Serve recommendation
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(usa_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, usa_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for The Hangman's Beautiful Daughter:

1: She Walks These Hills, with distance of 0.8538214350321209:
2: The Triumph of Katie Byrne, with distance of 0.8686691101456137:
3: If I'd Killed Him When I Met Him (Elizabeth MacPherson Novels (Paperback)), with distance of 0.8792071484855862:
4: To Say Nothing of the Dog, with distance of 0.8885182412493589:
5: Killer Pancake, with distance of 0.8958602455579392:


## Matrix factorization recommender

In [146]:
# User book rating matrix
usa_user_rating_pivot_2d = usa_user_rating.pivot(index='userID', columns='bookTitle', values='bookRating').fillna(0)
usa_user_rating_pivot_2d.head()

bookTitle,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
print(f'Our utility matrix has {usa_user_rating_pivot_2d.shape[0]} users and {usa_user_rating_pivot_2d.shape[1]} books')

Our utility matrix has 35074 users and 2441 books


In [150]:
# We will decompose our utility matrix to 12 latent variables
X = usa_user_rating_pivot_2d.values.T
SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
print(f' New matrix shape {matrix.shape}')

 New matrix shape (2441, 12)


In [155]:
# Calculate pearson correlation of all the books
corr = np.corrcoef(matrix)
print(f'Shape of correlation matrix {corr.shape}')

Shape of correlation matrix (2441, 2441)


In [151]:
# Getting the booklist
book_titles = usa_user_rating_pivot_2d.columns
book_list = list(book_titles)
fight_club = book_list.index('Fight Club')
print(f'Index of Fight club is {fight_club}')

Index of Fight club is 654


In [156]:
# Let's find out movies with high correlation with Fight club
fight_club_corr = corr[fight_club]
list(book_titles[(fight_club_corr > 0.9)])

['Bearing an Hourglass (Incarnations of Immortality (Paperback))',
 'Crystal Line',
 'Dune Messiah (Dune Chronicles, Book 2)',
 "Enchanters' End Game (The Belgariad, Book 5)",
 'Fight Club',
 'Invisible Monsters',
 'Survivor : A Novel',
 'Sword of Shannara',
 'The Color of Magic',
 'The Left Hand of Darkness (Remembering Tomorrow)']