In [50]:
import numpy as np
import pandas as pd

In [51]:
#Reading the excel sheets into a dataframe
books = pd.read_csv("BX-Books.csv", sep = ";", encoding = "latin-1",  on_bad_lines='skip')
users = pd.read_csv("BX-Users.csv", sep = ";", encoding = "latin-1",  on_bad_lines='skip')
ratings = pd.read_csv("BX-Book-Ratings.csv", sep = ";", encoding = "latin-1",  on_bad_lines='skip')

  books = pd.read_csv("BX-Books.csv", sep = ";", encoding = "latin-1",  on_bad_lines='skip')


# Preprocessing Data

In [52]:
#Extracting only the columns that we'll need 
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]

#Renaming the columns to make them easy to use
books.rename(columns = {'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)
users.rename(columns = {'User-ID':'user_id', 'Location':'location', 'Age':'age'}, inplace=True)
ratings.rename(columns = {'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books.rename(columns = {'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)


In [53]:
books.head(20)

Unnamed: 0,ISBN,title,author,year,publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
5,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group
6,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000,Berkley Publishing Group
7,0671870432,PLEADING GUILTY,Scott Turow,1993,Audioworks
8,0679425608,Under the Black Flag: The Romance and the Real...,David Cordingly,1996,Random House
9,074322678X,Where You'll Find Me: And Other Stories,Ann Beattie,2002,Scribner


# Exploratory Data Analysis

## Flaw in the dataset 
To build our model, we should only rely on users who have given a decent amount of ratings so the model will be accurate enough. So we decided to limit the minimal number of ratings required at 200 ratings.

The same goes for books, those who are not rated enough will disturb our model and never get recommended in the first place, so we will need to only extract the book who have at least 50 ratings

## Step 1 : Extracting users and ratings of more than 200

In [54]:
#Extracting the users with at least 200 ratings
x = ratings['user_id'].value_counts() > 200
y = x[x].index
print(y.shape) #899 users are included in our model
#Reducing the ratings set to the preselected users
ratings = ratings[ratings['user_id'].isin(y)]

(899,)


## Step 2 : Merging the ratings with the books 

In [55]:
rating_with_books = ratings.merge(books, on='ISBN')
rating_with_books.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
1,3363,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
2,11676,002542730X,6,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
3,12538,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc
4,13552,002542730X,0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc


## Step 3 : Extracting books that have received more than 50 ratings

In [56]:
number_rating = rating_with_books.groupby('title')['rating'].count().reset_index()
number_rating.rename(columns= {'rating':'number_of_ratings'}, inplace=True)
#Merging everything
final_rating = rating_with_books.merge(number_rating, on='title')
final_rating = final_rating[final_rating['number_of_ratings'] >= 50]
final_rating.drop_duplicates(['user_id','title'], inplace=True)
print(number_rating[number_rating['number_of_ratings'] >= 50])

                                                    title  number_of_ratings
493                                                  1984                 76
527                                   1st to Die: A Novel                162
664                                            2nd Chance                124
810                                             4 Blondes                 71
1020                                84 Charing Cross Road                 54
...                                                   ...                ...
159044                                    Year of Wonders                 59
159277                                   You Belong To Me                 56
159916  Zen and the Art of Motorcycle Maintenance: An ...                 64
160067                                               Zoya                 59
160158                                \O\" Is for Outlaw"                105

[742 rows x 2 columns]


## Step 4 : Create pivot table 
Now we will create a pivot table where columns will be user ids, the index will be book title and the value is ratings. And the user id who has not rated any book will have value as NAN so impute it with zero.

In [57]:
book_pivot = final_rating.pivot_table(columns='user_id', index='ISBN', values="rating")
book_pivot.fillna(0, inplace=True)
print(book_pivot)
MOVIE_ISBN_LIST = book_pivot.index.tolist()
# print(MOVIE_ISBN_LIST)

user_id     254     2276    2766    2977    3363    3757    4017    4385    \
ISBN                                                                         
0001047973     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0006177379     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0020697406     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
002542730X     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
002542730x     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...            ...     ...     ...     ...     ...     ...     ...     ...   
B00009EF82     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
B00009NDAN     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
B0000T6KHI     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
B0001PIOX4     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
B000234N76     0.0     0.0     0.0     0.0     0.0     0.0     0

But here in the pivot table, we have lots of zero values and on clustering, this computing power will increase to calculate the distance of zero values so we will convert the pivot table to the sparse matrix and then feed it to the model

In [58]:
from scipy.sparse import csr_matrix
book_sparse = csr_matrix(book_pivot)

Now we will train the nearest neighbors algorithm. here we need to specify an algorithm which is brute means find the distance of every point to every other point.

In [59]:
from sklearn.neighbors import NearestNeighbors
#model = NearestNeighbors(algorithm = "brute")
model = NearestNeighbors(n_neighbors= 10, algorithm = "brute")
model.fit(book_sparse)

NearestNeighbors(algorithm='brute', n_neighbors=10)

## Step 5 : Testing the model 
Let’s make a prediction and see whether it is suggesting books or not. we will find the nearest neighbors to the input book id and after that, we will print the top 5 books which are closer to those books. It will provide us distance and book id at that distance. let us pass harry potter which is at index 237.

In [61]:
#Getting the suggestions and the distances
BOOK_ISBN = "0020697406"
FIRST_MATCHING_ISBN = [isbn for isbn in MOVIE_ISBN_LIST if BOOK_ISBN == isbn][0]
INDEX_SUGGESTION = MOVIE_ISBN_LIST.index(FIRST_MATCHING_ISBN)
distances, suggestions = model.kneighbors(book_pivot.iloc[INDEX_SUGGESTION, :].values.reshape(1, -1))

In [65]:
#Printing all the suggestions we got 
print("Given book's ISBN : " + book_pivot.index[INDEX_SUGGESTION])
#for i in range(len(suggestions)):
#  print(book_pivot.index[suggestions[i]])
for movie in suggestions[0]:
    print(f'Book number {movie} => ' + book_pivot.iloc[movie].name)

Given book's ISBN : 0020697406
Book number 1493 => 0613033191
Book number 472 => 0380781506
Book number 1503 => 0613292146
Book number 1499 => 0613224450
Book number 1497 => 0613185048
Book number 1494 => 0613064054
Book number 468 => 0380752166
Book number 469 => 0380760061
Book number 1505 => 0613335864
Book number 1492 => 0613032950
