# Project 4 - Books Recommendation using cosine similarity
Collaborative based filtering->Item based

In [1]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import sklearn as sk


### Data preparation

In [2]:
#Creating dataframes from csv files to read the data
books_df_original = pd.read_csv('./Resources/Books.csv')
users_df = pd.read_csv('./Resources/Users.csv')
ratings_df = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [3]:
# Filter out data with no publication year
books_df = books_df_original[books_df_original['Year-Of-Publication'] != 0]

In [4]:
# remove duplicated books records if any by looking at ISBN
books_df=books_df.drop_duplicates(subset=['ISBN'])

In [5]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 267790 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 267790 non-null  object
 1   Book-Title           267790 non-null  object
 2   Book-Author          267788 non-null  object
 3   Year-Of-Publication  267790 non-null  object
 4   Publisher            267788 non-null  object
 5   Image-URL-S          267790 non-null  object
 6   Image-URL-M          267790 non-null  object
 7   Image-URL-L          267787 non-null  object
dtypes: object(8)
memory usage: 18.4+ MB


In [7]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [8]:
# update the datatype of a 'Book-Rating' field to numeric one
ratings_df['Book-Rating']=pd.to_numeric(ratings_df['Book-Rating'],errors='coerce')
# and check the result
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [9]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [10]:
# merge books data with ratings, so that we have only those records where books is rated and has title info
ratings_df=pd.merge(books_df,ratings_df,on='ISBN', how = 'inner')
# leave only title, user and rating data
ratings_df=ratings_df.drop(['ISBN','Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
# drop if any N/As
ratings_df=ratings_df.dropna()
# drop duplicated records when same user could rate book(s) with same title
ratings_df=ratings_df.drop_duplicates()
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1018387,There's a Bat in Bunk Five,276463,7
1018388,From One to One Hundred,276579,4
1018389,Lily Dale : The True Story of the Town that Ta...,276680,0
1018390,Republic (World's Classics),276680,0


### Filter data, so that only statistically significant data are left

In [14]:
min_books_rated_by_user=50
min_rates_received_by_book=25
top_X_recommendations=5

In [17]:
#find the count of books rated by user
groupped_r_users=ratings_df.groupby('User-ID')['Book-Rating'].count()

#find the count of rates per book-title
groupped_r_books=ratings_df.groupby('Book-Title')['User-ID'].count()

#select only those books which were rated more than min_rates_received_by_book
titles_with_acceptable_rates_count=list(groupped_r_books[groupped_r_books>min_rates_received_by_book].index)

#select only those users (user_id) who rated more than min_books_rated_by_user books
user_ids_with_acceptable_books_count_rated=list(groupped_r_users[groupped_r_users>min_books_rated_by_user].index)

# filter rating-user data to have only books/users of interest (which have highest rates count and rated highest number of books respectively)
rating_input_df=ratings_df[ratings_df['Book-Title'].isin(titles_with_acceptable_rates_count)&ratings_df['User-ID'].isin(user_ids_with_acceptable_books_count_rated)]
rating_input_df


Unnamed: 0,Book-Title,User-ID,Book-Rating
31,The Kitchen God's Wife,11676,9
32,The Kitchen God's Wife,29526,9
33,The Kitchen God's Wife,36836,0
34,The Kitchen God's Wife,46398,9
38,The Kitchen God's Wife,113270,0
...,...,...,...
1017769,Angel Falls,244688,0
1018040,Naked Prey,250405,0
1018092,The Thin Woman,259260,0
1018240,"The Two Towers (The Lord of the Rings, Part 2)",259901,10


In [18]:
# use pivot method to create a matrix with columns=User-ID and rows='Book-Title' and values equal to rating value given for a book by a respective user
df_books_ratigs_user=rating_input_df.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
df_books_ratigs_user

User-ID,243,254,507,638,643,741,882,929,1211,1424,...,277928,277965,278026,278137,278144,278188,278418,278582,278633,278843
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
10 Lb. Penalty,,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians,,,0.0,,,,,,,,...,,,,,,,0.0,,,
"14,000 Things to Be Happy About",,,,,,,,8.0,,,...,,,,,,,,,,
16 Lighthouse Road,,,,,,,,,,,...,,,,,,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""",,,,,,,,,,,...,,,,,,,0.0,,,
"\O\"" Is for Outlaw""",,,,,,,,,,,...,,,,,,,,,,
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",,,,,,,,,,,...,,,,,,,,,,
e,,,,,,,,,,,...,,,,,,,,,,


In [19]:
# filling n/a with 0 so far, assuming it means that no interest for a book by a user,
# but will probably update with mean per book???
df_books_ratigs_user=df_books_ratigs_user.fillna(0)
df_books_ratigs_user

User-ID,243,254,507,638,643,741,882,929,1211,1424,...,277928,277965,278026,278137,278144,278188,278418,278582,278633,278843
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"14,000 Things to Be Happy About",0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\O\"" Is for Outlaw""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
e,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# create a dictionary for mapping between row number ans Book-Title
index_title_dict=dict(df_books_ratigs_user.reset_index()['Book-Title'])

In [24]:
# apply cosine_similarity
books_similarity = cosine_similarity(df_books_ratigs_user)
books_similarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.13677348],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.13677348, 0.        ,
        1.        ]])

In [35]:
# convert output of cosine_similarity into df
books_similarity_df=pd.DataFrame(books_similarity)
books_similarity_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5491,5492,5493,5494,5495,5496,5497,5498,5499,5500
0,1.0,0.000000,0.000000,0.000000,0.000000,0.036525,0.055482,0.166619,0.000000,0.123260,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000
1,0.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.100365,0.000000,0.000000,0.046899,...,0.000000,0.000000,0.140207,0.000000,0.000000,0.0,0.036368,0.000000,0.0,0.000000
2,0.0,0.000000,1.000000,0.000000,0.000000,0.044121,0.000000,0.000000,0.000000,0.046168,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.126281,0.000000,0.0,0.000000
3,0.0,0.000000,0.000000,1.000000,0.000000,0.020292,0.075561,0.000000,0.000000,0.000000,...,0.070199,0.109613,0.000000,0.068514,0.271244,0.0,0.018069,0.063538,0.0,0.073909
4,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0.028704,0.109312,0.216577,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.050079,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5496,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.027093,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.000000
5497,0.0,0.036368,0.126281,0.018069,0.050079,0.023209,0.095122,0.000000,0.000000,0.000000,...,0.017240,0.026920,0.000000,0.066275,0.066615,0.0,1.000000,0.015604,0.0,0.018151
5498,0.0,0.000000,0.000000,0.063538,0.000000,0.017524,0.028748,0.000000,0.000000,0.000000,...,0.060624,0.094662,0.000000,0.059168,0.234246,0.0,0.015604,1.000000,0.0,0.136773
5499,0.0,0.000000,0.000000,0.000000,0.000000,0.042996,0.031349,0.044770,0.000000,0.000000,...,0.000000,0.077419,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.000000


In [36]:
# introduce title here
books_similarity_df=books_similarity_df.rename(columns=index_title_dict)
books_similarity_df['Book-Title']=books_similarity_df.index
books_similarity_df['Book-Title']=books_similarity_df['Book-Title'].map(index_title_dict)
books_similarity_df

Unnamed: 0,'Salem's Lot,10 Lb. Penalty,101 Dalmatians,"14,000 Things to Be Happy About",16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,...,Zodiac: The Eco-Thriller,Zombies of the Gene Pool,Zoya,ZwÃ?Â¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,stardust,Book-Title
0,1.0,0.000000,0.000000,0.000000,0.000000,0.036525,0.055482,0.166619,0.000000,0.123260,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,'Salem's Lot
1,0.0,1.000000,0.000000,0.000000,0.000000,0.000000,0.100365,0.000000,0.000000,0.046899,...,0.000000,0.140207,0.000000,0.000000,0.0,0.036368,0.000000,0.0,0.000000,10 Lb. Penalty
2,0.0,0.000000,1.000000,0.000000,0.000000,0.044121,0.000000,0.000000,0.000000,0.046168,...,0.000000,0.000000,0.000000,0.000000,0.0,0.126281,0.000000,0.0,0.000000,101 Dalmatians
3,0.0,0.000000,0.000000,1.000000,0.000000,0.020292,0.075561,0.000000,0.000000,0.000000,...,0.109613,0.000000,0.068514,0.271244,0.0,0.018069,0.063538,0.0,0.073909,"14,000 Things to Be Happy About"
4,0.0,0.000000,0.000000,0.000000,1.000000,0.000000,0.028704,0.109312,0.216577,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.050079,0.000000,0.0,0.000000,16 Lighthouse Road
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5496,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.027093,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.0,0.000000,\ Lamb to the Slaughter and Other Stories (Pen...
5497,0.0,0.036368,0.126281,0.018069,0.050079,0.023209,0.095122,0.000000,0.000000,0.000000,...,0.026920,0.000000,0.066275,0.066615,0.0,1.000000,0.015604,0.0,0.018151,"\O\"" Is for Outlaw"""
5498,0.0,0.000000,0.000000,0.063538,0.000000,0.017524,0.028748,0.000000,0.000000,0.000000,...,0.094662,0.000000,0.059168,0.234246,0.0,0.015604,1.000000,0.0,0.136773,"\Surely You're Joking, Mr. Feynman!\"": Adventu..."
5499,0.0,0.000000,0.000000,0.000000,0.000000,0.042996,0.031349,0.044770,0.000000,0.000000,...,0.077419,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.0,0.000000,e


In [43]:
# def find_recommendations_cos_sim(book_title):
book_title='1st to Die: A Novel'
# find a similarity list for the book
recommendations=books_similarity_df[[book_title,'Book-Title']].sort_values(by=book_title, ascending=False)
recommendations

Unnamed: 0,1st to Die: A Novel,Book-Title
6,1.000000,1st to Die: A Novel
3002,0.249912,Pop Goes the Weasel
285,0.225930,Along Came a Spider (Alex Cross Novels)
2151,0.215181,Kiss the Girls
3224,0.212501,Roses Are Red (Alex Cross Novels)
...,...,...
3817,0.000000,The Authoritative Calvin and Hobbes (Calvin an...
3816,0.000000,The Austere Academy (A Series of Unfortunate E...
942,0.000000,Crystal Line
943,0.000000,Crystal Singer


In [46]:
recommendations=recommendations[~recommendations['Book-Title'].isin(book_title_list)]
recommendations

Unnamed: 0,1st to Die: A Novel,Book-Title
3002,0.249912,Pop Goes the Weasel
285,0.225930,Along Came a Spider (Alex Cross Novels)
2151,0.215181,Kiss the Girls
3224,0.212501,Roses Are Red (Alex Cross Novels)
583,0.211548,Blood Test (Alex Delaware Novels (Paperback))
...,...,...
3817,0.000000,The Authoritative Calvin and Hobbes (Calvin an...
3816,0.000000,The Austere Academy (A Series of Unfortunate E...
942,0.000000,Crystal Line
943,0.000000,Crystal Singer


In [51]:
# select top top_X_recommendations
top_recommendations=recommendations[:top_X_recommendations].rename(columns={book_title:'similarity rate'})
top_recommendations

Unnamed: 0,similarity rate,Book-Title
3002,0.249912,Pop Goes the Weasel
285,0.22593,Along Came a Spider (Alex Cross Novels)
2151,0.215181,Kiss the Girls
3224,0.212501,Roses Are Red (Alex Cross Novels)
583,0.211548,Blood Test (Alex Delaware Novels (Paperback))


In [59]:
dict_years=dict(recommendations_full_info.groupby('Book-Title')['Year-Of-Publication'].max())
dict_years

{'Along Came a Spider (Alex Cross Novels)': 2003,
 'Blood Test (Alex Delaware Novels (Paperback))': 1995,
 'Kiss the Girls': 2000,
 'Pop Goes the Weasel': 2000,
 'Roses Are Red (Alex Cross Novels)': 2001}

In [62]:
recommendations_full_info=pd.merge(top_recommendations, books_df, left_on='Book-Title',right_on='Book-Title', how='left')
for i, row in recommendations_full_info.iterrows():
    if row['Year-Of-Publication']!=dict_years[row['Book-Title']]:
        recommendations_full_info.loc[i,'Year-Of-Publication']=0
recommendations_full_info=recommendations_full_info[recommendations_full_info['Year-Of-Publication'] != 0]
recommendations_full_info=recommendations_full_info.drop_duplicates(subset=['Book-Title'])
recommendations_full_info

Unnamed: 0,similarity rate,Book-Title,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1,0.249912,Pop Goes the Weasel,446608815,James Patterson,2000,Warner Vision,http://images.amazon.com/images/P/0446608815.0...,http://images.amazon.com/images/P/0446608815.0...,http://images.amazon.com/images/P/0446608815.0...
5,0.22593,Along Came a Spider (Alex Cross Novels),446692638,James Patterson,2003,Warner Books,http://images.amazon.com/images/P/0446692638.0...,http://images.amazon.com/images/P/0446692638.0...,http://images.amazon.com/images/P/0446692638.0...
9,0.215181,Kiss the Girls,446677388,James Patterson,2000,Warner Books,http://images.amazon.com/images/P/0446677388.0...,http://images.amazon.com/images/P/0446677388.0...,http://images.amazon.com/images/P/0446677388.0...
11,0.212501,Roses Are Red (Alex Cross Novels),446605484,James Patterson,2001,Warner Vision,http://images.amazon.com/images/P/0446605484.0...,http://images.amazon.com/images/P/0446605484.0...,http://images.amazon.com/images/P/0446605484.0...
12,0.211548,Blood Test (Alex Delaware Novels (Paperback)),553569635,Jonathan Kellerman,1995,Bantam Books,http://images.amazon.com/images/P/0553569635.0...,http://images.amazon.com/images/P/0553569635.0...,http://images.amazon.com/images/P/0553569635.0...
