![Book Recommender System](https://images.projectsgeek.com/2018/07/recommendation.png)

# Importing the required libraries

In [34]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Importing the dataset from kaggle

In [2]:
# Copy the 'kaggle.json' file to colab folder
!mkdir -p ~/.kaggle 
!cp kaggle.json ~/.kaggle/

In [3]:
# API key of the dataset
!kaggle datasets download -d arashnic/book-recommendation-dataset

Downloading book-recommendation-dataset.zip to /content
 76% 18.0M/23.8M [00:00<00:00, 101MB/s] 
100% 23.8M/23.8M [00:00<00:00, 109MB/s]


In [4]:
# Unzip the files to google colab folder
import zipfile
zip_ref = zipfile.ZipFile('/content/book-recommendation-dataset.zip')
zip_ref.extractall('/content')
zip_ref.close()

In [5]:
# Loading the .csv files 
books = pd.read_csv('/content/Books.csv')
users = pd.read_csv('/content/Users.csv')
ratings = pd.read_csv('/content/Ratings.csv')

# books dataset
books.sample(2)

  books = pd.read_csv('/content/Books.csv')


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
50538,60961325,The Celluloid Closet: Homosexuality in the Movies,Vito Russo,1987,Perennial,http://images.amazon.com/images/P/0060961325.0...,http://images.amazon.com/images/P/0060961325.0...,http://images.amazon.com/images/P/0060961325.0...
95082,8401499593,Papel Moneda,Ken Follett,1999,"Plaza &amp; Janes Editores, S.A.",http://images.amazon.com/images/P/8401499593.0...,http://images.amazon.com/images/P/8401499593.0...,http://images.amazon.com/images/P/8401499593.0...


In [6]:
# users dataset
users.sample(2)

Unnamed: 0,User-ID,Location,Age
191684,191685,"astoria, new york, usa",26.0
42099,42100,"san diego, california, usa",


In [7]:
# ratings dataset
ratings.sample(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
319059,76352,671729489,0
941093,227705,44902427,0


In [8]:
# Looking at the shape of each of these datasets
books.shape, users.shape, ratings.shape

((271360, 8), (278858, 3), (1149780, 3))

# Analyzing Data

### Null values detection

In [9]:
# Looking for null values in books, ratings, and users dataset
print('books:')
print(books.isnull().sum())
print('\nratings:')
print(ratings.isnull().sum())
print('\nusers:')
print(users.isnull().sum())

books:
ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

ratings:
User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

users:
User-ID          0
Location         0
Age         110762
dtype: int64


In [10]:
# Finding the percentage of null values in users dataset
users.isnull().mean()

User-ID     0.000000
Location    0.000000
Age         0.397199
dtype: float64

### Looking for duplicates in books, users, and ratings dataset

In [11]:
print('"books" Dataset: ', books.duplicated().sum())
print('"users" Dataset: ', users.duplicated().sum())
print('"ratings" Dataset: ', ratings.duplicated().sum())

"books" Dataset:  0
"users" Dataset:  0
"ratings" Dataset:  0


# EDA

In [12]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271359 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


# Popularity based recommender system
- Will display top 50 books with highest average rating, and a minimum of 250 ratings.

In [13]:
# Merging books and rating with column 'ISBN'
ratings_with_name = ratings.merge(books, on='ISBN')
ratings_with_name.sample(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
873849,104665,671452851,0,DEATHSTONE,Kelli M. Gary,1982,Pocket,http://images.amazon.com/images/P/0671452851.0...,http://images.amazon.com/images/P/0671452851.0...,http://images.amazon.com/images/P/0671452851.0...
187459,91184,60929790,0,One Hundred Years of Solitude,Gabriel Garcia Marquez,1998,Perennial,http://images.amazon.com/images/P/0060929790.0...,http://images.amazon.com/images/P/0060929790.0...,http://images.amazon.com/images/P/0060929790.0...


In [14]:
ratings_with_name.shape

(1031136, 10)

- Ratings dataset has some books which is not present in books dataset

In [15]:
# Total rating per book
num_rating_df = ratings_with_name.groupby('Book-Title').count()['Book-Rating'].reset_index()
num_rating_df.rename(columns={'Book-Rating': 'Num-Rating'}, inplace=True)
num_rating_df.sample(2)

Unnamed: 0,Book-Title,Num-Rating
27956,Brief an ein nie geborenes Kind.,2
150063,Riddle of the Wayward Books (Wishbone Mysterie...,3


In [16]:
# Average rating per book
avg_rating_df = ratings_with_name.groupby('Book-Title').mean()['Book-Rating'].reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'Avg-Rating'}, inplace=True)
avg_rating_df.sample(2)

  avg_rating_df = ratings_with_name.groupby('Book-Title').mean()['Book-Rating'].reset_index()


Unnamed: 0,Book-Title,Avg-Rating
172122,"Tallulah Bankhead (Outlines (Bath, England).)",4.0
151445,Romeo and Ghouliette (BC 23) (Bone Chillers),0.0


In [17]:
# Merge the num_rating_df and avg_rating_df
popular_df = num_rating_df.merge(avg_rating_df, on='Book-Title')
popular_df.sample(2)

Unnamed: 0,Book-Title,Num-Rating,Avg-Rating
187928,The Fleetwood Correspondence: A Devilish Tale ...,1,0.0
48915,Der indische Baum.,1,5.0


In [18]:
# Filter only those books whose Num-Rating is more than 250
popular_df = popular_df[popular_df['Num-Rating'] >= 250].sort_values('Avg-Rating', ascending=False).head(50)
popular_df

Unnamed: 0,Book-Title,Num-Rating,Avg-Rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453
191612,The Hobbit : The Enchanting Prelude to The Lor...,281,5.007117
187377,The Fellowship of the Ring (The Lord of the Ri...,368,4.94837
80445,Harry Potter and the Sorcerer's Stone (Harry P...,575,4.895652
211384,"The Two Towers (The Lord of the Rings, Part 2)",260,4.880769
219741,To Kill a Mockingbird,510,4.7


In [19]:
# Merging popular_df with books to get author name, publisher, image etc.
popular_df.merge(books, on='Book-Title').sample(2)

Unnamed: 0,Book-Title,Num-Rating,Avg-Rating,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
11,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441,439358078,J. K. Rowling,2004,Scholastic Paperbacks,http://images.amazon.com/images/P/0439358078.0...,http://images.amazon.com/images/P/0439358078.0...,http://images.amazon.com/images/P/0439358078.0...
148,Left Behind: A Novel of the Earth's Last Days ...,318,4.003145,842329110,Tim Lahaye,1995,Tyndale House Publishers,http://images.amazon.com/images/P/0842329110.0...,http://images.amazon.com/images/P/0842329110.0...,http://images.amazon.com/images/P/0842329110.0...


In [20]:
# Since we have many rows with the same book name we have to drop duplicates on top of book title
popular_df = popular_df.merge(books, on='Book-Title').drop_duplicates('Book-Title')[['Book-Title', 'Book-Author', 'Image-URL-M', 'Num-Rating', 'Avg-Rating']].reset_index()
popular_df

Unnamed: 0,index,Book-Title,Book-Author,Image-URL-M,Num-Rating,Avg-Rating
0,0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,http://images.amazon.com/images/P/0439136350.0...,428,5.852804
1,3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,http://images.amazon.com/images/P/0439139597.0...,387,5.824289
2,5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,http://images.amazon.com/images/P/0590353403.0...,278,5.73741
3,9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,http://images.amazon.com/images/P/043935806X.0...,347,5.501441
4,13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,http://images.amazon.com/images/P/0439064872.0...,556,5.183453
5,16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339681.0...,281,5.007117
6,17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339703.0...,368,4.94837
7,26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,http://images.amazon.com/images/P/059035342X.0...,575,4.895652
8,28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,http://images.amazon.com/images/P/0345339711.0...,260,4.880769
9,39,To Kill a Mockingbird,Harper Lee,http://images.amazon.com/images/P/0446310786.0...,510,4.7


# Collaborative filtering based recommender system
- Users who rated more than 200 books and books with minimum 50 ratings

In [21]:
# Only those users who have rated more than 200 books
x = ratings_with_name.groupby('User-ID').count()['Book-Rating'] > 200
padhe_likhe_users = x[x].index
padhe_likhe_users

Int64Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,
              6323,   6543,
            ...
            271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427,
            277639, 278418],
           dtype='int64', name='User-ID', length=811)

In [22]:
# Filtering out padhe_likhe_users rated books from ratings_with_name df
filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(padhe_likhe_users)]

In [23]:
# Filtering out books with minimum 50 ratings
y = filtered_rating.groupby('Book-Title').count()['Book-Rating'] >= 50
famous_books = y[y].index
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [24]:
final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]
final_ratings.sample(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
153334,59971,451172817,0,Needful Things,Stephen King,2004,Signet Book,http://images.amazon.com/images/P/0451172817.0...,http://images.amazon.com/images/P/0451172817.0...,http://images.amazon.com/images/P/0451172817.0...
176077,183196,345350499,8,The Mists of Avalon,MARION ZIMMER BRADLEY,1987,Del Rey,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...,http://images.amazon.com/images/P/0345350499.0...


In [25]:
pt = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,,,9.0,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,0.0,...,,,,,,0.0,,,0.0,
4 Blondes,,,,,,,,0.0,,,...,,,,,,,,,,
A Bend in the Road,0.0,,7.0,,,,,,,,...,,0.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,,0.0,...,,9.0,,,,,0.0,,,
You Belong To Me,,,,,,,,,0.0,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,0.0,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,


In [26]:
pt.fillna(0, inplace=True)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- Here each book is a vector in a 810 dimensional space.
- Now we find the eucledian distance between one book and another to find similar books

In [27]:
# Calculating the distance through cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
# The resultant dataset contains the similarity score of each book with rest of the books
similarity_scores = cosine_similarity(pt)
similarity_scores

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [29]:
similarity_scores.shape

(706, 706)

- It is the similarity score of each book with all the other book

In [30]:
# Function for recommending books
def recommend(book_name):
  # Index
  index = np.where(pt.index == book_name)[0][0]
  similar_items = sorted(list(enumerate(similarity_scores[index])), key=lambda x: x[1], reverse=True)[1:5]
  data = []
  for i in similar_items:
    item = []
    temp_df = books[books['Book-Title'] == pt.index[i[0]]]
    item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
    item.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
    item.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))
    data.append(item)
  return data

In [31]:
# Testing
recommend('Harry Potter and the Prisoner of Azkaban (Book 3)')

[['Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Chamber of Secrets (Book 2)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Order of the Phoenix (Book 5)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/043935806X.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Book 1)",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg']]

# Exporting using pickle

In [32]:
import pickle
pickle.dump(popular_df, open('popular.pkl', 'wb'))

In [33]:
pickle.dump(pt, open('pt.pkl', 'wb'))
pickle.dump(books, open('books.pkl', 'wb'))
pickle.dump(similarity_scores, open('similarity_scores.pkl', 'wb'))