# Book Recommendation Engine

Importing necessary libraries

In [175]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from os import listdir
from os.path import isfile, join
from load_sql import Loader

pd.set_option('display.max_columns', 10)

In [176]:
# Load sql data to DataFrames and save them in folder "data"
# Loader.load_sql_to_df()

In [177]:
path = '.\\data'

# Find all files in dir 'path' and unpickle them to dfs
allfiles = [f for f in listdir(path) if isfile(join(path, f))]
print(allfiles)

dfs = {file.strip('.pickle'):pd.read_pickle(f'.\\data\\{file}') for file in allfiles}

books = dfs['books_df']
books.drop(['image_URL_S', 'image_URL_M', 'image_URL_L'], axis=1, inplace=True)
users = dfs['users_df']
ratings = dfs['ratings_df']
user_ratings = dfs['user_ratings_df']

['books_df.pickle', 'ratings_df.pickle', 'users_df.pickle', 'user_ratings_df.pickle']


In [178]:
books

Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271373,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271374,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271375,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271376,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [179]:
user_ratings

Unnamed: 0,user_id,user_location,age,ISBN,book_rating
0,276747,"iowa city, iowa, usa",25,0060517794,9
1,276747,"iowa city, iowa, usa",25,0451192001,0
2,276747,"iowa city, iowa, usa",25,0609801279,0
3,276747,"iowa city, iowa, usa",25,0671537458,9
4,276747,"iowa city, iowa, usa",25,0679776818,8
...,...,...,...,...,...
1090416,261528,"plano, texas, usa",24,034540288X,8
1090417,261528,"plano, texas, usa",24,0380013924,0
1090418,261528,"plano, texas, usa",24,0380015390,4
1090419,261528,"plano, texas, usa",24,0451161351,8


In [180]:
print(books.shape)
print(users.shape)
print(ratings.shape)
print(user_ratings.shape)

(271378, 5)
(168096, 3)
(1090421, 4)
(1090421, 5)


In [181]:
user_ratings.dtypes

user_id          object
user_location    object
age              object
ISBN             object
book_rating      object
dtype: object

In [182]:
user_ratings["book_rating"] = pd.to_numeric(user_ratings["book_rating"])
user_ratings.dtypes

user_id           object
user_location     object
age               object
ISBN              object
book_rating      float64
dtype: object

In [183]:
print(user_ratings.shape)
# user_ratings = user_ratings[user_ratings.book_rating != 0]
# print(user_ratings.shape)
# user_ratings = user_ratings[user_ratings.book_rating != str]
# user_ratings = user_ratings[pd.to_numeric(user_ratings['book_rating'],errors='coerce').notna()]
# user_ratings['book_rating'] = pd.to_numeric(user_ratings['book_rating'],errors='coerce')
# user_ratings = user_ratings.dropna(subset=['book_rating'])
user_ratings = user_ratings[user_ratings["book_rating"].isin([1,2,3,4,5,6,7,8,9])]
print(user_ratings.shape)

(1090421, 5)
(337071, 5)


In [184]:
user_ratings[user_ratings["book_rating"].isin([1,2,3,4,5,6,7,8,9])]

Unnamed: 0,user_id,user_location,age,ISBN,book_rating
0,276747,"iowa city, iowa, usa",25,0060517794,9.0
3,276747,"iowa city, iowa, usa",25,0671537458,9.0
4,276747,"iowa city, iowa, usa",25,0679776818,8.0
5,276747,"iowa city, iowa, usa",25,0943066433,7.0
7,276747,"iowa city, iowa, usa",25,1885408226,7.0
...,...,...,...,...,...
1090415,261528,"plano, texas, usa",24,0345370775,9.0
1090416,261528,"plano, texas, usa",24,034540288X,8.0
1090418,261528,"plano, texas, usa",24,0380015390,4.0
1090419,261528,"plano, texas, usa",24,0451161351,8.0


In [185]:
user_ratings[user_ratings['ISBN'] == '0375404120']

Unnamed: 0,user_id,user_location,age,ISBN,book_rating
60409,32039,"morton grove, illinois, usa",40,375404120,6.0
390694,238120,"louisville, kentucky, usa",25,375404120,6.0


In [186]:
# Group the 'grouped_rating' df by 'ISBN' and join the 'book_rating' values in the 'book_rating' column separated by a comma
user_ratings = user_ratings.astype(str)
grouped_rating = user_ratings.groupby('ISBN')['book_rating'].apply(lambda x: ', '.join(x)).reset_index()
grouped_rating

Unnamed: 0,ISBN,book_rating
0,0330299891,6.0
1,0375404120,3.0
2,9022906116,7.0
3,#6612432,5.0
4,)416195113,8.0
...,...,...
155363,ZR903CX0003,1.0
155364,"\""0094749809",7.0
155365,"\""0210000010",8.0
155366,Ô½crosoft,7.0


In [187]:
user_ratings['book_rating'] = user_ratings['book_rating'].astype(float)
try:
    avg_rating = user_ratings.groupby(['ISBN'])['book_rating'].mean()
except:
    avg_rating = 0
avg_rating

ISBN
 0330299891      6.0
 0375404120      3.0
 9022906116      7.0
#6612432         5.0
)416195113       8.0
                ... 
ZR903CX0003      1.0
\"0094749809     7.0
\"0210000010     8.0
Ô½crosoft        7.0
ï¿½3499128624    8.0
Name: book_rating, Length: 155368, dtype: float64

In [188]:
grouped_rating = grouped_rating.merge(avg_rating, on='ISBN', how='left')
grouped_rating

Unnamed: 0,ISBN,book_rating_x,book_rating_y
0,0330299891,6.0,6.0
1,0375404120,3.0,3.0
2,9022906116,7.0,7.0
3,#6612432,5.0,5.0
4,)416195113,8.0,8.0
...,...,...,...
155363,ZR903CX0003,1.0,1.0
155364,"\""0094749809",7.0,7.0
155365,"\""0210000010",8.0,8.0
155366,Ô½crosoft,7.0,7.0


In [189]:
# grouped_rating = grouped_rating.merge(user_ratings, on='ISBN', how='right')
# grouped_rating
# test = grouped_rating[grouped_rating['book_rating'].str.contains(0)]
# test

In [191]:
books = books.merge(grouped_rating, on='ISBN', how='left')
books

Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher,book_rating_x,book_rating_y
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,,
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,"8.0, 9.0, 5.0, 8.0, 8.0, 9.0, 9.0, 7.0",7.875
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,"8.0, 7.0",7.500
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,"6.0, 8.0, 9.0, 8.0, 6.0",7.400
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,,
...,...,...,...,...,...,...,...
271373,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),,
271374,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,,
271375,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,,
271376,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,,


In [192]:
# Creating a new column 'tag' that contains: title, author, year of publication, and publisher
books = books.astype(str)
books['tag'] = books['book_title'] + ', ' + books['book_author'] +  ', ' + books['year_of_publication'] +  ', ' + books['publisher'] +  ', ' + books['book_rating_y']
books = books.head(20000)

In [193]:
# Extract the book titles and tags into separate lists
titles = books['book_title'].tolist()
tags = books['tag'].str.strip().str.split(",").tolist()

# Create a bag of words representation of the book tags
def create_bow(tag_list):
    bow = {}
    if not isinstance(tag_list, float):
        for tag in tag_list:
            bow[tag] = 1
    return bow
     

# Create a list of bags of words representations of the book tags
bags_of_words = [create_bow(movie_tags) for movie_tags in tags]
print(bags_of_words)



In [194]:
# Create a dataframe to store the bags of words representation of the book tags
tag_df = pd.DataFrame(bags_of_words, index=titles).fillna(0)

# Calculate the cosine similarity matrix between the books
cos_similarity = cosine_similarity(tag_df)

# Create a dataframe with the cosine similarity scores
similarity_df = pd.DataFrame(cos_similarity, index=tag_df.index, columns=tag_df.index)
print(similarity_df)

                                                    Classical Mythology  \
Classical Mythology                                                 1.0   
Clara Callan                                                        0.0   
Decision in Normandy                                                0.0   
Flu: The Story of the Great Influenza Pandemic ...                  0.0   
The Mummies of Urumchi                                              0.2   
...                                                                 ...   
Retribution                                                         0.0   
The Man in My Basement: A Novel                                     0.0   
In This Skin                                                        0.2   
Cameo Lake                                                          0.0   
What's the Girl Worth?                                              0.4   

                                                    Clara Callan  \
Classical Mythology            

In [195]:
books[books['book_title'].str.contains('Hamlet')]

Unnamed: 0,ISBN,book_title,book_author,year_of_publication,publisher,book_rating_x,book_rating_y,tag
3330,0451521285,The Tragedy of Hamlet Prince of Denmark (Signe...,William Shakespeare,1993,Signet Classics,"8.0, 5.0, 7.0, 8.0, 9.0, 5.0, 5.0",6.714285714285714,The Tragedy of Hamlet Prince of Denmark (Signe...
6636,0140714545,The Tragical History of Hamlet Prince of Denma...,William Shakespeare,2001,Penguin Books,,,The Tragical History of Hamlet Prince of Denma...
7886,0553212923,Hamlet (Bantam Classics),William Shakespeare,1988,Bantam,"9.0, 5.0, 8.0, 8.0, 7.0, 5.0",7.0,"Hamlet (Bantam Classics), William Shakespeare,..."
8193,8449500516,Macbeth - Hamlet,William Shakespeare,1998,Mateos Ediciones,9.0,9.0,"Macbeth - Hamlet, William Shakespeare, 1998, M..."
10484,067172262X,Hamlet,William Shakespeare,1992,Washington Square Press,"9.0, 8.0, 7.0, 6.0, 8.0, 4.0, 7.0, 7.0, 8.0, 5.0",6.9,"Hamlet, William Shakespeare, 1992, Washington ..."
10943,0743456246,Hamlet II: Ophelia's Revenge,David Bergantino,2003,Pocket Star,5.0,5.0,"Hamlet II: Ophelia's Revenge, David Bergantino..."
11008,0451526929,Hamlet (Signet Classics (Paperback)),William Shakespeare,1998,Signet Book,8.0,8.0,"Hamlet (Signet Classics (Paperback)), William ..."
15835,0684852705,Hamlet's Dresser: A Memoir,Bob Smith,2003,Scribner,"9.0, 6.0",7.5,"Hamlet's Dresser: A Memoir, Bob Smith, 2003, S..."
17082,1853260096,Hamlet (Wordsworth Classics),William Shakespeare,1997,NTC/Contemporary Publishing Company,,,"Hamlet (Wordsworth Classics), William Shakespe..."


In [196]:
# Ask the user for a movie they like
movie = input('Enter a movie you like: ')

# Find the index of the movie in the similarity dataframe
movie_index = similarity_df.index.get_loc(movie)

# Get the top 10 most similar movies to the movie
top_10 = similarity_df.iloc[movie_index].sort_values(ascending=False)[1:11]

# Print the top 10 most similar movies to the movie
print(f'Top 10 similar movies to {movie}:')
print(top_10)

Top 10 similar movies to Hamlet:
TAMING OF THE SHREW                            0.6
RICHARD III                                    0.4
Henry V (Folger Shakespeare Library)           0.4
The Complete Works of William Shakespeare      0.4
Cowboys Are My Weakness                        0.4
Othello (Folger Shakespeare Library)           0.4
The Tragedy of Coriolanus (Signet Classics)    0.4
TWELFTH NIGHT                                  0.4
Othello                                        0.4
The TEMPEST                                    0.4
Name: Hamlet, dtype: float64


In [197]:
# Create a TfidfVectorizer object to transform the book tags into a Tf-idf representation
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(books['tag'])
print(tfidf_matrix)

  (0, 16658)	0.11996178274857726
  (0, 18899)	0.18760174774252056
  (0, 24366)	0.2825717959559867
  (0, 17669)	0.3292365485952786
  (0, 421)	0.16855753723336744
  (0, 16336)	0.4960793510952055
  (0, 15217)	0.3000727925531932
  (0, 16610)	0.4287252165203896
  (0, 5461)	0.4624022838077976
  (1, 1177)	0.28885533954331977
  (1, 4583)	0.33237804214727495
  (1, 11105)	0.42184308192336245
  (1, 420)	0.15264324122968595
  (1, 25739)	0.3447551576077915
  (1, 4194)	0.3053330410777163
  (1, 20002)	0.2358984523373717
  (1, 4521)	0.43928764638820844
  (1, 5444)	0.37964428653748333
  (2, 11106)	0.4384153885907692
  (2, 406)	0.20210772214775816
  (2, 8579)	0.45654527109063586
  (2, 4713)	0.3945587934018416
  (2, 17078)	0.4074221497463721
  (2, 12196)	0.1685461273062442
  (2, 6852)	0.45654527109063586
  :	:
  (19996, 12196)	0.19417826346695777
  (19997, 14065)	0.4215599426613315
  (19997, 21793)	0.48935293759507875
  (19997, 21681)	0.2962039442052474
  (19997, 5451)	0.3977565343036463
  (19997, 23462)

In [198]:
# Calculate the cosine similarity between the movies
cos_similarity_tfidf = cosine_similarity(tfidf_matrix)

# Create a df with the cosine similarity scores
cos_similarity_tfidf_df = pd.DataFrame(cos_similarity_tfidf, index=books['book_title'], columns=books['book_title'])

In [199]:
# Find the index of the movie in the similarity dataframe
movie_index = cos_similarity_tfidf_df.index.get_loc(movie)

# Get the top 10 most similar movies to the movie
top_10 = cos_similarity_tfidf_df.iloc[movie_index].sort_values(ascending=False)[1:11]

# Print the top 5 most similar movies to the movie
print(f'Top 10 similar movies to {movie}:')
print(top_10)

Top 10 similar movies to Hamlet:
book_title
Othello                                 0.628984
RICHARD III                             0.625394
The TEMPEST                             0.606918
King Lear                               0.600422
TWELFTH NIGHT                           0.590835
Henry V (Folger Shakespeare Library)    0.586311
TAMING OF THE SHREW                     0.580153
Othello (Folger Shakespeare Library)    0.567503
MIDSUMMER NIGHT'S DREAM                 0.564285
Hamlet (Bantam Classics)                0.556815
Name: Hamlet, dtype: float64
