<h1 style="text-align: center;">Book Recommender System</h1>

In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

## Importing Datasets

Kaggle: [Book Recommendation Dataset](https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset)

In [2]:
dtype_dict = {"Year-Of-Publication": "str"}

books = pd.read_csv("../Dataset/Books.csv", dtype=dtype_dict)
users = pd.read_csv("../Dataset/Users.csv")
ratings = pd.read_csv("../Dataset/Ratings.csv")

#### 1. Books Dataset

In [3]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271360 non-null  object
 1   Book-Title           271360 non-null  object
 2   Book-Author          271358 non-null  object
 3   Year-Of-Publication  271360 non-null  object
 4   Publisher            271358 non-null  object
 5   Image-URL-S          271360 non-null  object
 6   Image-URL-M          271360 non-null  object
 7   Image-URL-L          271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [4]:
books.duplicated().sum()

0

In [5]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

#### 2. Users Dataset

In [6]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User-ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [7]:
users.duplicated().sum()

0

In [8]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [9]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


#### 3. Ratings Dataset

In [10]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [11]:
ratings.duplicated().sum()

0

In [12]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [13]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


## Popularity Based Recommender System

In [14]:
df = ratings.merge(books, on='ISBN')

In [15]:
rating_count = df.groupby('Book-Title')['Book-Rating'].count().reset_index()
rating_count.rename(columns={'Book-Rating': 'Rating Count'}, inplace=True)
rating_count.head()

Unnamed: 0,Book-Title,Rating Count
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [16]:
rating_avg = df.groupby('Book-Title')['Book-Rating'].mean().reset_index()
rating_avg.rename(columns={'Book-Rating': 'Rating Avg'}, inplace=True)
rating_avg.head()

Unnamed: 0,Book-Title,Rating Avg
0,A Light in the Storm: The Civil War Diary of ...,2.25
1,Always Have Popsicles,0.0
2,Apple Magic (The Collector's series),0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.0
4,Beyond IBM: Leadership Marketing and Finance ...,0.0


In [17]:
popular = rating_count.merge(rating_avg, on='Book-Title')
popular.head()

Unnamed: 0,Book-Title,Rating Count,Rating Avg
0,A Light in the Storm: The Civil War Diary of ...,4,2.25
1,Always Have Popsicles,1,0.0
2,Apple Magic (The Collector's series),1,0.0
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.0
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.0


In [18]:
criteria = popular['Rating Count']>=250

popular = popular[criteria].sort_values('Rating Avg', ascending=False).head(50)
popular.head()

Unnamed: 0,Book-Title,Rating Count,Rating Avg
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


In [19]:
attributes = ['Book-Title', 'Rating Count', 'Rating Avg', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-M']

popular = popular.merge(books, on='Book-Title').drop_duplicates('Book-Title')[attributes]
popular['Image-URL-M'] = popular['Image-URL-M'].str.replace('http://', 'https://', case=False)

popular.head()

Unnamed: 0,Book-Title,Rating Count,Rating Avg,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
0,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,J. K. Rowling,1999,Scholastic,https://images.amazon.com/images/P/0439136350....
3,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,J. K. Rowling,2000,Scholastic,https://images.amazon.com/images/P/0439139597....
5,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741,J. K. Rowling,1998,Scholastic,https://images.amazon.com/images/P/0590353403....
9,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441,J. K. Rowling,2003,Scholastic,https://images.amazon.com/images/P/043935806X....
13,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453,J. K. Rowling,2000,Scholastic,https://images.amazon.com/images/P/0439064872....


## Collaborative Filtering Based Recommender System

#### Filtered Users

In [20]:
# Criteria for users who rate at least 200 books
users_criteria = 200

In [21]:
x = df.groupby('User-ID').count()['Book-Rating'] > users_criteria
filtered_users = x[x].index
filtered_users

Index([   254,   2276,   2766,   2977,   3363,   4017,   4385,   6251,   6323,
         6543,
       ...
       271705, 273979, 274004, 274061, 274301, 274308, 275970, 277427, 277639,
       278418],
      dtype='int64', name='User-ID', length=811)

In [22]:
# Extracting ratings according to filtered users

filtered_ratings = df[df['User-ID'].isin(filtered_users)]
filtered_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
5,23768,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
7,28523,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
15,77940,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...
16,81977,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...,http://images.amazon.com/images/P/034545104X.0...


In [23]:
print(f"Filtered Users Shape: {filtered_users.shape}\nFiltered Ratings Shape: {filtered_ratings.shape}")

Filtered Users Shape: (811,)
Filtered Ratings Shape: (474007, 10)


#### Filterted Books

In [24]:
# Criteria for books with at least 50 ratings
books_criteria = 50

In [25]:
y = filtered_ratings.groupby('Book-Title').count()['Book-Rating'] >= books_criteria
filtered_books = y[y].index
filtered_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [26]:
final_ratings = filtered_ratings[filtered_ratings['Book-Title'].isin(filtered_books)]
final_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
63,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
65,3363,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
66,7158,446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
69,11676,446520802,10,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...
74,23768,446520802,6,The Notebook,Nicholas Sparks,1996,Warner Books,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...,http://images.amazon.com/images/P/0446520802.0...


In [27]:
print(f"Final Ratings Shape: {final_ratings.shape}")

Final Ratings Shape: (58586, 10)


#### Pivot Table

In [28]:
pt = final_ratings.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
pt.fillna(0, inplace=True)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Similarity Scores

In [29]:
# Calculating the similarity of the kth book with the nth books

similarity_scores = cosine_similarity(pt)

In [30]:
similarity_scores.shape

(706, 706)

In [31]:
similarity_scores[0]

array([1.        , 0.10255025, 0.01220856, 0.        , 0.05367224,
       0.02774901, 0.08216491, 0.13732869, 0.03261686, 0.03667591,
       0.02322418, 0.06766487, 0.02083978, 0.09673735, 0.13388865,
       0.08303112, 0.11153543, 0.05100411, 0.02517784, 0.11706383,
       0.        , 0.14333793, 0.07847534, 0.06150451, 0.08723968,
       0.        , 0.07009814, 0.13658681, 0.07600328, 0.12167134,
       0.00768046, 0.01473221, 0.        , 0.07965814, 0.04522617,
       0.01556271, 0.09495938, 0.0182307 , 0.02610465, 0.07984012,
       0.11679969, 0.0569124 , 0.08354155, 0.08471898, 0.08785938,
       0.05491435, 0.0548505 , 0.27026514, 0.09779123, 0.06016046,
       0.08958835, 0.06748675, 0.        , 0.04468098, 0.01920872,
       0.        , 0.05629067, 0.00557964, 0.07877059, 0.05219479,
       0.18908177, 0.        , 0.01240656, 0.02984572, 0.04279502,
       0.12680125, 0.16566735, 0.        , 0.13357242, 0.06615478,
       0.        , 0.        , 0.        , 0.10968075, 0.02806

#### Recommender Function

Testing the recommend function with popular books:
- 1984
- The Notebook
- The Da Vinci Code
- Message in a Bottle

In [32]:
def recommend(book, limit=5, img_ratio='M'):
    """
    Recommends similar books based on a given book using collaborative filtering.
    """
    # gets the index of the book from the pivot table
    index = np.where(pt.index.str.lower() == book.lower())[0]
    if len(index) == 0:
        return []

    index = index[0]
    # gets the top 5 books according to the similarity scores
    searched_indices = np.argsort(similarity_scores[index])[::-1][1:limit]

    img = f'Image-URL-{img_ratio}'
    data = []

    # extracting book title, author, and image data
    for i in searched_indices:
        temp_df = books[books['Book-Title'] == pt.index[i]]
        
        book_title = temp_df.drop_duplicates('Book-Title')['Book-Title'].values[0]
        book_author = temp_df.drop_duplicates('Book-Title')['Book-Author'].values[0]

        url = temp_df.drop_duplicates('Book-Title')[img].values[0]
        img_url = 'https://' + url[len('http://'):] if url.startswith('http://') else url

        data.append([book_title, book_author, img_url])

    return data


In [33]:
recommend('The Da Vinci Code')

[['Angels &amp; Demons',
  'Dan Brown',
  'https://images.amazon.com/images/P/0671027360.01.MZZZZZZZ.jpg'],
 ['Touching Evil',
  'Kay Hooper',
  'https://images.amazon.com/images/P/0553583441.01.MZZZZZZZ.jpg'],
 ['Saving Faith',
  'David Baldacci',
  'https://images.amazon.com/images/P/0446608890.01.MZZZZZZZ.jpg'],
 ["The Sweet Potato Queens' Book of Love",
  'JILL CONNER BROWNE',
  'https://images.amazon.com/images/P/0609804138.01.MZZZZZZZ.jpg']]

## Pickle Dump

In [34]:
pickle.dump(books, open('../Dump/books.pkl', 'wb'))
pickle.dump(pt, open('../Dump/pivot_table.pkl', 'wb'))
pickle.dump(popular, open('../Dump/popular.pkl', 'wb'))
pickle.dump(similarity_scores, open('../Dump/similarity_scores.pkl', 'wb'))