# Book Recommendation system using Content based filtering

In [1]:
#import libaries
import pandas as pd
import numpy as np

In [2]:
#Import dataset
books=pd.read_csv('books.csv')
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [3]:
books.columns

Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')

### Data preprocessing and cleaning


In [4]:
#Extract only the required columns 
book=books[['id','original_title']]
book.head()

Unnamed: 0,id,original_title
0,1,The Hunger Games
1,2,Harry Potter and the Philosopher's Stone
2,3,Twilight
3,4,To Kill a Mockingbird
4,5,The Great Gatsby


In [5]:
book.shape

(10000, 2)

In [6]:
#Drop null or missing values
book=book.dropna()

In [7]:
book.shape

(9415, 2)

## For content based book recommendation we have to use NLP techniques like

Keyword extraction -> Extract keywords from title

Cosine Similarity -> Find cosine similarity between all movie titles

### Tf-Idf Vectorizer for keyword extraction

In [8]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
#Define a TF-IDF Vectorizer .
tfidf = TfidfVectorizer(stop_words='english')# Stop_words remove all english stop words such as 'the', 'a'.

In [10]:
book['original_title']=book['original_title'].fillna('')

In [11]:
#TF-IDF matrix by fitting and transforming the data
matrix = tfidf.fit_transform(book['original_title'])

In [12]:
matrix

<9415x8945 sparse matrix of type '<class 'numpy.float64'>'
	with 23947 stored elements in Compressed Sparse Row format>

In [13]:
#It created a sparse matrix
matrix.shape

(9415, 8945)

In [14]:
#It created a matrix of mXn 

### Cosine Similarity between Titles
We can find cosine similarities by cosine similarity available on sklearn.

We can even use sigmoid_kernal or linear_kernal

In [15]:
#I'm using here Linear Kernal for finding cosine similarities
#Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(matrix, matrix)

In [16]:
indices=pd.Series(book.index,index=book['original_title']).drop_duplicates()

In [17]:
#WE have store the titles in index and assign them with index
indices.head(10)

original_title
The Hunger Games                            0
Harry Potter and the Philosopher's Stone    1
Twilight                                    2
To Kill a Mockingbird                       3
The Great Gatsby                            4
The Fault in Our Stars                      5
The Hobbit or There and Back Again          6
The Catcher in the Rye                      7
Angels & Demons                             8
Pride and Prejudice                         9
dtype: int64

In [18]:
#Function for getting reco
def reco(title,cosine_sim=cosine_sim):
    idx=indices[title]                                              #it store the index of title to idx
    sim_score=list(enumerate(cosine_sim[idx]))                      #It give all the similarities related to the give title 
    sim_score=sorted(sim_score,key=lambda x:x[1],reverse=True)      #It sort in reverse order so the element on top  
                                                                        #rows will contain the the books with high sim_score 
    sim_score=sim_score[1:11]                                       #It will store the top 10 books (1) contains sim_score of itself
    book_idx=[i[0] for i in sim_score]                              #Store the book index in book_idx
    return book['original_title'].iloc[book_idx]                    #Display the title 

In [19]:
reco('Harry Potter and the Deathly Hallows')

3752    Harry Potter Collection (Harry Potter, #1-6)
23               Harry Potter and the Goblet of Fire
22           Harry Potter and the Chamber of Secrets
26            Harry Potter and the Half-Blood Prince
1           Harry Potter and the Philosopher's Stone
20         Harry Potter and the Order of the Phoenix
421                  Complete Harry Potter Boxed Set
17          Harry Potter and the Prisoner of Azkaban
2000                     Harry Potter: Film Wizardry
2100                Harry Potter Boxed Set Books 1-4
Name: original_title, dtype: object

In [20]:
reco('Pride and Prejudice')

1038                      Pride and Prejudice and Zombies
4844    Dawn of the Dreadfuls (Pride and Prejudice and...
6248                                     Pride of Baghdad
8779                                   Prom and Prejudice
0                                        The Hunger Games
1                Harry Potter and the Philosopher's Stone
2                                                Twilight
3                                   To Kill a Mockingbird
4                                        The Great Gatsby
5                                  The Fault in Our Stars
Name: original_title, dtype: object

# So we have sucessfully created book recommendation system by using titles for keyword extraction

# Now we use CountVectorizer for keyword extraction

### Data preprocessing and cleaning


In [21]:
#Now we recommend books on the basis of authors too 
book1=books[['id','authors','original_title']]
book1=book1.dropna()
book1.head()

Unnamed: 0,id,authors,original_title
0,1,Suzanne Collins,The Hunger Games
1,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone
2,3,Stephenie Meyer,Twilight
3,4,Harper Lee,To Kill a Mockingbird
4,5,F. Scott Fitzgerald,The Great Gatsby


In [22]:
#Drop null or missing values
book=book.dropna()

## CountVectorizer

In [23]:
# Import CountVectorizer and create the matrix
from sklearn.feature_extraction.text import CountVectorizer

count1 = CountVectorizer(stop_words='english')
matrix1 = count1.fit_transform(book1['authors'])

In [24]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim1 = cosine_similarity(matrix1, matrix1)

In [25]:
book1[['title','author_ori']]=books[['original_title','authors']]
indices1=pd.Series(book1.index,index=book1['original_title']).drop_duplicates()

In [26]:
indices1.head()

original_title
The Hunger Games                            0
Harry Potter and the Philosopher's Stone    1
Twilight                                    2
To Kill a Mockingbird                       3
The Great Gatsby                            4
dtype: int64

In [27]:
#Function for getting recommendation
def reco1(title,cosine_sim=cosine_sim1):
    idx=indices1[title]                                              #it store the index of title to idx
    sim_score=list(enumerate(cosine_sim1[idx]))                      #It give all the similarities related to the give title 
    sim_score=sorted(sim_score,key=lambda x:x[1],reverse=True)      #It sort in reverse order so the element on top  
                                                                        #rows will contain the the books with high sim_score 
    sim_score=sim_score[1:11]                                       #It will store the top 10 books (1) contains sim_score of itself
    book_idx=[i[0] for i in sim_score]                              #Store the book index in book_idx
    return book1[['title','author_ori']].iloc[book_idx]

In [28]:
reco1("Harry Potter and the Philosopher's Stone")#Books recommendation by authors

Unnamed: 0,title,author_ori
20,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré"
22,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré"
23,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré"
24,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré"
26,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré"
2100,Harry Potter Boxed Set Books 1-4,"J.K. Rowling, Mary GrandPré"
17,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck"
341,The Casual Vacancy,J.K. Rowling
398,The Tales of Beedle the Bard,J.K. Rowling
421,Complete Harry Potter Boxed Set,J.K. Rowling


In [29]:
reco1("The Fault in Our Stars")

Unnamed: 0,title,author_ori
73,Looking for Alaska,John Green
87,Paper Towns,John Green
274,An Abundance of Katherines,John Green
381,"Will Grayson, Will Grayson","John Green, David Levithan"
983,Paradise Lost,"John Milton, John Leonard"
1574,Let It Snow: Three Holiday Romances,"John Green, Maureen Johnson, Lauren Myracle"
1882,Stoner,"John Williams, John McGahern"
31,Of Mice and Men,John Steinbeck
85,A Time to Kill,John Grisham
122,The Firm,John Grisham


### So here we can see authors like jhon milton or jhon Grisham are also recommending convert the authors into  lowercase and strip all the spaces between them

In [30]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if author exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [31]:
 book1['author_1'] = book1['authors'].apply(clean_data)

In [32]:
book1.head()

Unnamed: 0,id,authors,original_title,title,author_ori,author_1
0,1,Suzanne Collins,The Hunger Games,The Hunger Games,Suzanne Collins,suzannecollins
1,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré","j.k.rowling,marygrandpré"
2,3,Stephenie Meyer,Twilight,Twilight,Stephenie Meyer,stepheniemeyer
3,4,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,harperlee
4,5,F. Scott Fitzgerald,The Great Gatsby,The Great Gatsby,F. Scott Fitzgerald,f.scottfitzgerald


In [33]:
count2 = CountVectorizer(stop_words='english')
matrix2 = count2.fit_transform(book1['author_1'])

In [34]:
cosine_sim2 = cosine_similarity(matrix2, matrix2)

In [35]:
indices2=pd.Series(book1.index,index=book1['original_title']).drop_duplicates()

In [36]:
indices2.head()

original_title
The Hunger Games                            0
Harry Potter and the Philosopher's Stone    1
Twilight                                    2
To Kill a Mockingbird                       3
The Great Gatsby                            4
dtype: int64

In [37]:
def reco2(title,cosine_sim=cosine_sim2):
    idx=indices2[title]                                              #it store the index of title to idx
    sim_score=list(enumerate(cosine_sim2[idx]))                      #It give all the similarities related to the give title 
    sim_score=sorted(sim_score,key=lambda x:x[1],reverse=True)      #It sort in reverse order so the element on top  
                                                                        #rows will contain the the books with high sim_score 
    sim_score=sim_score[1:11]                                       #It will store the top 10 books (1) contains sim_score of itself
    book_idx=[i[0] for i in sim_score]                              #Store the book index in book_idx
    return book1[['title','author_ori']].iloc[book_idx]

In [38]:
reco2("The Fault in Our Stars")

Unnamed: 0,title,author_ori
73,Looking for Alaska,John Green
87,Paper Towns,John Green
274,An Abundance of Katherines,John Green
381,"Will Grayson, Will Grayson","John Green, David Levithan"
1574,Let It Snow: Three Holiday Romances,"John Green, Maureen Johnson, Lauren Myracle"
0,The Hunger Games,Suzanne Collins
1,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré"
2,Twilight,Stephenie Meyer
3,To Kill a Mockingbird,Harper Lee
4,The Great Gatsby,F. Scott Fitzgerald


## Now we will merge both original title and authors 

In [39]:
book1.head()

Unnamed: 0,id,authors,original_title,title,author_ori,author_1
0,1,Suzanne Collins,The Hunger Games,The Hunger Games,Suzanne Collins,suzannecollins
1,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré","j.k.rowling,marygrandpré"
2,3,Stephenie Meyer,Twilight,Twilight,Stephenie Meyer,stepheniemeyer
3,4,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,harperlee
4,5,F. Scott Fitzgerald,The Great Gatsby,The Great Gatsby,F. Scott Fitzgerald,f.scottfitzgerald


In [40]:
features = ['authors', 'original_title']

for feature in features:
    book1[feature] = book1[feature].apply(clean_data)

In [41]:
book1.head()

Unnamed: 0,id,authors,original_title,title,author_ori,author_1
0,1,suzannecollins,thehungergames,The Hunger Games,Suzanne Collins,suzannecollins
1,2,"j.k.rowling,marygrandpré",harrypotterandthephilosopher'sstone,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré","j.k.rowling,marygrandpré"
2,3,stepheniemeyer,twilight,Twilight,Stephenie Meyer,stepheniemeyer
3,4,harperlee,tokillamockingbird,To Kill a Mockingbird,Harper Lee,harperlee
4,5,f.scottfitzgerald,thegreatgatsby,The Great Gatsby,F. Scott Fitzgerald,f.scottfitzgerald


In [42]:
def create_merge(x):
    return ''.join(x['authors']) + '' + ''.join(x['original_title'])
book1['merge'] = book1.apply(create_merge, axis=1)

In [43]:
book1.head()

Unnamed: 0,id,authors,original_title,title,author_ori,author_1,merge
0,1,suzannecollins,thehungergames,The Hunger Games,Suzanne Collins,suzannecollins,suzannecollinsthehungergames
1,2,"j.k.rowling,marygrandpré",harrypotterandthephilosopher'sstone,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré","j.k.rowling,marygrandpré","j.k.rowling,marygrandpréharrypotterandthephilo..."
2,3,stepheniemeyer,twilight,Twilight,Stephenie Meyer,stepheniemeyer,stepheniemeyertwilight
3,4,harperlee,tokillamockingbird,To Kill a Mockingbird,Harper Lee,harperlee,harperleetokillamockingbird
4,5,f.scottfitzgerald,thegreatgatsby,The Great Gatsby,F. Scott Fitzgerald,f.scottfitzgerald,f.scottfitzgeraldthegreatgatsby


In [44]:
tfidf1 = TfidfVectorizer(stop_words='english')
matrix3 = tfidf1.fit_transform(book1['merge'])

In [45]:
cosine_sim3 = cosine_similarity(matrix3, matrix3)

In [46]:
indices3=pd.Series(book1.index,index=book1['title']).drop_duplicates()

In [47]:
indices3.head()

title
The Hunger Games                            0
Harry Potter and the Philosopher's Stone    1
Twilight                                    2
To Kill a Mockingbird                       3
The Great Gatsby                            4
dtype: int64

In [48]:
def reco3(title,cosine_sim=cosine_sim3):
    idx=indices3[title]                                              #it store the index of title to idx
    sim_score=list(enumerate(cosine_sim3[idx]))                      #It give all the similarities related to the give title 
    sim_score=sorted(sim_score,key=lambda x:x[1],reverse=True)      #It sort in reverse order so the element on top  
                                                                        #rows will contain the the books with high sim_score 
    sim_score=sim_score[1:11]                                       #It will store the top 10 books (1) contains sim_score of itself
    book_idx=[i[0] for i in sim_score]                              #Store the book index in book_idx
    return book1[['title','author_ori']].iloc[book_idx]

In [49]:
reco3("Harry Potter and the Philosopher's Stone")#Recommendation by authors and titles

Unnamed: 0,title,author_ori
20,Harry Potter and the Order of the Phoenix,"J.K. Rowling, Mary GrandPré"
22,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré"
23,Harry Potter and the Goblet of Fire,"J.K. Rowling, Mary GrandPré"
24,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré"
2100,Harry Potter Boxed Set Books 1-4,"J.K. Rowling, Mary GrandPré"
17,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck"
26,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré"
468,Fantastic Beasts and Where to Find Them,"Newt Scamander, J.K. Rowling, Albus Dumbledore"
6427,"Short Stories from Hogwarts of Heroism, Hardsh...","J.K. Rowling, MinaLima"
7442,"Short Stories from Hogwarts of Power, Politics...","J.K. Rowling, MinaLima"


In [50]:
reco2("The Fault in Our Stars")#Recommendation by authors

Unnamed: 0,title,author_ori
73,Looking for Alaska,John Green
87,Paper Towns,John Green
274,An Abundance of Katherines,John Green
381,"Will Grayson, Will Grayson","John Green, David Levithan"
1574,Let It Snow: Three Holiday Romances,"John Green, Maureen Johnson, Lauren Myracle"
0,The Hunger Games,Suzanne Collins
1,Harry Potter and the Philosopher's Stone,"J.K. Rowling, Mary GrandPré"
2,Twilight,Stephenie Meyer
3,To Kill a Mockingbird,Harper Lee
4,The Great Gatsby,F. Scott Fitzgerald


In [51]:
reco("The Fault in Our Stars")#Recommendation by titles

5117                 Stars Above
1575         Full Dark, No Stars
5072      The City and the Stars
2515               The Dog Stars
2982          These Broken Stars
7205            Stars of Fortune
240             Number the Stars
9537          Agent to the stars
6398        The Stars Shine Down
3242    The Stars My Destination
Name: original_title, dtype: object