In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three CSV tables used throughout the notebook.
# Forward slashes are used in the relative paths because they resolve on
# Windows, Linux and macOS alike; a backslash is only a path separator on Windows.
# low_memory=False asks pandas to parse Books.csv in a single pass rather than
# in chunks, which avoids mixed-dtype inference within a column.
books = pd.read_csv('./dataset/Books.csv', low_memory=False)
rating = pd.read_csv('./dataset/Ratings.csv')
user = pd.read_csv('./dataset/Users.csv')

# Report (rows, columns) for each table.
print(books.shape)
print(rating.shape)
print(user.shape)

(271360, 8)
(1149780, 3)
(278858, 3)


In [3]:
# Number of missing (NaN) entries in each column of the books DataFrame
books.isna().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [4]:
# Number of missing (NaN) entries in each column of the user DataFrame
user.isna().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [5]:
# Number of missing (NaN) entries in each column of the rating DataFrame
rating.isna().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [6]:
# Count rows of books that repeat an earlier row in every column
books.duplicated().sum()

0

In [7]:
# Count rows of user that repeat an earlier row in every column
user.duplicated().sum()

0

In [8]:
# Count rows of rating that repeat an earlier row in every column
rating.duplicated().sum()

0

## Popularity-based system

In [9]:
# Preview the first rows of the ratings table (User-ID, ISBN, Book-Rating)
rating.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [10]:
# Preview the first rows of the books table (ISBN, title, author, year, publisher, image URLs)
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [11]:
# Attach the book metadata to every rating by joining the two tables on ISBN
# (default inner join: ratings whose ISBN is absent from books are dropped).
rating_with_name = pd.merge(rating, books, on="ISBN")

# Boolean mask that is True wherever the same Book-Title already occurred in an earlier row
dup_num = rating_with_name['Book-Title'].duplicated()
print(dup_num.head())

0    False
1     True
2     True
3     True
4     True
Name: Book-Title, dtype: bool


In [12]:
# Number of ratings received by each title.
# 'Book-Rating' is selected *before* counting so that only this one column is
# aggregated (counting the whole frame and then picking a column does the same
# work for every other column and throws it away). reset_index(name=...) names
# the count column directly, so no in-place rename is needed.
num_rating_df = (
    rating_with_name
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='Num-Rating')
)
num_rating_df

Unnamed: 0,Book-Title,Num-Rating
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [13]:
# Row labels of the merged table whose Book-Rating is missing
null_indices = rating_with_name.loc[rating_with_name['Book-Rating'].isna()].index
print(null_indices)

Index([], dtype='int64')


In [14]:
# Mean Book-Rating per title, exposed under the 'Avg-Num-Rating' column.
# No rows are filtered out beforehand, so ratings of 0 take part in the mean.
avg_num_rating_df = (
    rating_with_name
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .rename('Avg-Num-Rating')
    .reset_index()
)
print(avg_num_rating_df)

                                               Book-Title  Avg-Num-Rating
0        A Light in the Storm: The Civil War Diary of ...        2.250000
1                                   Always Have Popsicles        0.000000
2                    Apple Magic (The Collector's series)        0.000000
3        Ask Lily (Young Women of Faith: Lily Series, ...        8.000000
4        Beyond IBM: Leadership Marketing and Finance ...        0.000000
...                                                   ...             ...
241066                                      Ã?Â?lpiraten.        0.000000
241067                     Ã?Â?rger mit Produkt X. Roman.        5.250000
241068                                Ã?Â?sterlich leben.        7.000000
241069                              Ã?Â?stlich der Berge.        2.666667
241070                                  Ã?Â?thique en toc        4.000000

[241071 rows x 2 columns]


In [15]:
# One row per title carrying both its rating count and its mean rating
popular_df = pd.merge(num_rating_df, avg_num_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,Num-Rating,Avg-Num-Rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [16]:
# Keep titles with more than 250 ratings, rank them by mean rating (best first)
# and retain the 50 highest-ranked ones.
popular_df = (
    popular_df
    .loc[lambda frame: frame['Num-Rating'] > 250]
    .sort_values(by='Avg-Num-Rating', ascending=False)
    .head(50)
)

In [17]:
# Add book metadata to the popular titles. A title can match several rows of
# books, so only the first matching row per title is kept, and the frame is
# then narrowed to the columns listed below.
popular_df = (
    popular_df
    .merge(books, on='Book-Title')
    .drop_duplicates(subset='Book-Title')
    .loc[:, [
        'Book-Title',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Avg-Num-Rating',
        'Num-Rating',
    ]]
)
popular_df

Unnamed: 0,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Avg-Num-Rating,Num-Rating
0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,5.852804,428
3,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,5.824289,387
5,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,5.73741,278
9,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...,5.501441,347
13,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...,5.183453,556
16,The Hobbit : The Enchanting Prelude to The Lor...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339681.0...,5.007117,281
17,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339703.0...,4.94837,368
26,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books,http://images.amazon.com/images/P/059035342X.0...,4.895652,575
28,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,1986,Del Rey,http://images.amazon.com/images/P/0345339711.0...,4.880769,260
39,To Kill a Mockingbird,Harper Lee,1988,Little Brown &amp; Company,http://images.amazon.com/images/P/0446310786.0...,4.7,510


## Collaborative filtering

In [18]:
# Boolean Series indexed by User-ID: True for users with more than 200 ratings.
# Only 'Book-Rating' is counted instead of aggregating every column of the frame.
user_rating = rating_with_name.groupby('User-ID')['Book-Rating'].count() > 200
# User-IDs that satisfy the criterion above
user_ratings = user_rating[user_rating].index
# Keep only the ratings given by those users
filtered_rating = rating_with_name[rating_with_name['User-ID'].isin(user_ratings)]

In [19]:
# Titles that received at least 50 ratings from the retained users.
# The count is taken on 'Book-Rating' only, and the result is assigned once so
# famous_books never temporarily holds a boolean Series under the same name.
famous_books = (
    filtered_rating
    .groupby('Book-Title')['Book-Rating']
    .count()
    .loc[lambda counts: counts >= 50]
    .index
)
famous_books

Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Bend in the Road', 'A Case of Need',
       'A Child Called \It\": One Child's Courage to Survive"',
       'A Civil Action', 'A Day Late and a Dollar Short', 'A Fine Balance',
       ...
       'Winter Solstice', 'Wish You Well', 'Without Remorse',
       'Wizard and Glass (The Dark Tower, Book 4)', 'Wuthering Heights',
       'Year of Wonders', 'You Belong To Me',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"'],
      dtype='object', name='Book-Title', length=706)

In [29]:
# Keep only ratings of books that have at least 50 ratings, given by users who rated more than 200 books
final_rated_books = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]
# Show (rows, columns) of the filtered table; this is the tuple recorded as the cell's output
final_rated_books.shape

(58586, 10)

In [21]:
# Book x user matrix: one row per Book-Title, one column per User-ID, cells hold
# the Book-Rating (pivot_table's default aggregation is applied if a user/title
# pair occurs more than once). Pairs without a rating are filled with 0.
pt = (
    final_rated_books
    .pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
    .fillna(0)
)
pt

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of pt, i.e. between books.
# Entry [i, j] is the similarity of book i to book j, giving a square matrix
# with one row/column per title in pt.index.
similarity_scores = cosine_similarity(pt)
similarity_scores

array([[1.        , 0.10255025, 0.01220856, ..., 0.12110367, 0.07347567,
        0.04316046],
       [0.10255025, 1.        , 0.2364573 , ..., 0.07446129, 0.16773875,
        0.14263397],
       [0.01220856, 0.2364573 , 1.        , ..., 0.04558758, 0.04938579,
        0.10796119],
       ...,
       [0.12110367, 0.07446129, 0.04558758, ..., 1.        , 0.07085128,
        0.0196177 ],
       [0.07347567, 0.16773875, 0.04938579, ..., 0.07085128, 1.        ,
        0.10602962],
       [0.04316046, 0.14263397, 0.10796119, ..., 0.0196177 , 0.10602962,
        1.        ]])

In [23]:
def recommend(book_name, top_n=10):
    """Return the books most similar to ``book_name``.

    Relies on three notebook-level objects: ``pt`` (its index holds the book
    titles), ``similarity_scores`` (square matrix aligned with ``pt.index``)
    and ``books`` (metadata table with 'Book-Title', 'Book-Author' and
    'Image-URL-M' columns).

    Parameters
    ----------
    book_name : str
        Title to look up in ``pt.index``. It is converted with ``str()``
        before the comparison.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of list
        One ``[title, author, image_url_m]`` entry per recommended book,
        ordered from most to least similar. An entry is empty if the title
        has no row in ``books``.

    Raises
    ------
    IndexError
        If ``book_name`` is not present in ``pt.index``.
    """
    # Position of the requested title inside the pivot table / similarity matrix.
    matches = np.flatnonzero(pt.index == str(book_name))
    if matches.size == 0:
        raise IndexError(f"book title not found in pivot table: {book_name!r}")
    index = int(matches[0])

    # Rank every *other* book by similarity, best first. The queried book is
    # excluded by position instead of by dropping the first sorted element:
    # when another book ties with it (or its row is all zeros) the queried
    # book is not guaranteed to sort first and would otherwise be recommended
    # to itself.
    candidates = (
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    )
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:max(top_n, 0)]

    data = []
    for position, _score in ranked:
        # All rows matched here share the same title, so the first row is the
        # one a drop_duplicates('Book-Title') would keep; it is looked up once.
        first_match = books[books['Book-Title'] == pt.index[position]].head(1)
        item = []
        for column in ('Book-Title', 'Book-Author', 'Image-URL-M'):
            item.extend(first_match[column].values)
        data.append(item)

    return data

In [24]:
# Sample call: list the books most similar to 'The Da Vinci Code'
recommend('The Da Vinci Code') #sample

[['Angels &amp; Demons',
  'Dan Brown',
  'http://images.amazon.com/images/P/0671027360.01.MZZZZZZZ.jpg'],
 ['Touching Evil',
  'Kay Hooper',
  'http://images.amazon.com/images/P/0553583441.01.MZZZZZZZ.jpg'],
 ['Saving Faith',
  'David Baldacci',
  'http://images.amazon.com/images/P/0446608890.01.MZZZZZZZ.jpg'],
 ["The Sweet Potato Queens' Book of Love",
  'JILL CONNER BROWNE',
  'http://images.amazon.com/images/P/0609804138.01.MZZZZZZZ.jpg'],
 ['Middlesex: A Novel',
  'Jeffrey Eugenides',
  'http://images.amazon.com/images/P/0312422156.01.MZZZZZZZ.jpg'],
 ['The Lovely Bones: A Novel',
  'Alice Sebold',
  'http://images.amazon.com/images/P/0316666343.01.MZZZZZZZ.jpg'],
 ['Timeline',
  'MICHAEL CRICHTON',
  'http://images.amazon.com/images/P/0345417623.01.MZZZZZZZ.jpg'],
 ['The Blue Nowhere : A Novel',
  'Jeffery Deaver',
  'http://images.amazon.com/images/P/0671042262.01.MZZZZZZZ.jpg'],
 ['Sea Glass: A Novel',
  'Anita Shreve',
  'http://images.amazon.com/images/P/0316089699.01.MZZZZZZ

In [26]:
# Serialize the recommender artefacts with pickle.
# Each file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, rather than being left to the garbage collector.
import pickle

with open('popular.pkl', 'wb') as fh:
    pickle.dump(popular_df, fh)
with open('pt.pkl', 'wb') as fh:
    pickle.dump(pt, fh)
with open('books.pkl', 'wb') as fh:
    pickle.dump(books, fh)
with open('similarity_score.pkl', 'wb') as fh:
    pickle.dump(similarity_scores, fh)