In [1]:
# Imports
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity




In [2]:
# Load Datasets
books = pd.read_csv('Dataset/books.csv')
ratings = pd.read_csv('Dataset/ratings.csv')
tags = pd.read_csv('Dataset/tags.csv')
book_tags = pd.read_csv('Dataset/book_tags.csv')

In [3]:
# Delete NaN values of original_publication_year
for x in ["original_publication_year"]:
    books = books[books[x].notnull()]

# Delete negative values
books = books[(books['original_publication_year'] >= 0)]


In [4]:
# Grouping by the book_id to know how many ratings have each book
rating_book = ratings.groupby('book_id').book_id.apply(lambda x: len(x)).sort_values()


In [5]:
rating_user = ratings.groupby('user_id').user_id.apply(lambda x: len(x)).sort_values()


In [6]:
# Merge the two datasets grouping by the tag_id. 
data = pd.merge(book_tags, tags, left_on='tag_id', right_on='tag_id', how='inner')

titles = books[['book_id','title','goodreads_book_id','authors']]

# Merge the book dataset with the new one to know the title and the author
data = pd.merge(titles, data, left_on='goodreads_book_id', right_on='goodreads_book_id')


In [7]:
pd.set_option('display.max_colwidth', -1)


  pd.set_option('display.max_colwidth', -1)


In [8]:
# For each book, join all their tags
list_tags = data.groupby(by='goodreads_book_id')['tag_name'].apply(set).apply(list)

# Append this list of tags on the books dataset
books['tags'] = books['goodreads_book_id'].apply(lambda x: ' '.join(list_tags[x]))

In [9]:
pd.reset_option('max_colwidth')


In [21]:
# Function to get the index of the book given its title. 
def get_book_id(book_title):
    index = books.index[books['original_title'] == book_title].to_list()
    if index:
        return index[0]
    else:
        return None
def get_book_id_isbn(isbn):
    index = books.index[books['isbn'] == isbn].to_list()
    if index:
        return index[0]
    else:
        return None
# Function to get the title of a book given its id.
def get_book_title(book_id):
    title = books.iloc[book_id]['original_title']
    return title

def get_book_image(book_id):
    image = books.iloc[book_id]['image_url']
    return image

def get_book_isbn(book_id):
    isbn = books.iloc[book_id]['isbn']
    return isbn
def get_book_authors(book_id):
    authors = books.iloc[book_id]['authors']
    return authors

# Function that takes the book title and returns the most similar books.
def get_similar_books(title, n=5):
    
    # Get the book id
    book_id = get_book_id(title)
    
    if book_id is None:
        print("Book not found.")
    else:
        
        # Get the pariwsie similarity scores of all books with that book
        book_similarities = list(enumerate(similarities[book_id]))
        
        # Sort the books based on the similarity scores
        book_similarities = sorted(book_similarities, key=lambda x: x[1], reverse=True)
        
        # Get the scores of the 5 most similar book
        most_similar_books = book_similarities[1:1+n]
        most_similar_books = list(map(lambda x: (get_book_title(x[0]), round(x[1], 2)),most_similar_books))
        
        most_similar_books_df = pd.DataFrame(most_similar_books, columns=['Title', 'Similarity'])
        print("For this book we will recommand you:\n")
        
        return most_similar_books_df

In [22]:
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix = tfidf.fit_transform(books['tags'])

In [23]:
similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [24]:
def similar (isbn):
    
    book_id=get_book_id_isbn(isbn)
    book_title=get_book_title(book_id)
    print(book_title)
    result=get_similar_books(book_title)
    df = pd.DataFrame(columns=['Title','authors','image','Similarity'])
    for b in result.index:
        
        df1={'Title':get_book_title(get_book_id((result['Title'][b]))),'authors':get_book_authors(get_book_id((result['Title'][b]))),'image':get_book_image(get_book_id((result['Title'][b]))),'Similarity':result['Similarity'][b]}
        df = df.append(df1, ignore_index = True)
    return df
        

similar("451217454")


Mind Prey (Lucas Davenport, #7)
Book not found.


AttributeError: 'NoneType' object has no attribute 'index'