In [2]:
import pandas as pd
import numpy as np
import difflib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
df = pd.read_csv('data.csv')

In [4]:
df.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


Exploratory Data Analysis (EDA)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


In [7]:
df.isnull().sum()

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64

Data Preprocessing & Feature Engineering

In [9]:
# Select the features that will be used for recommendation
selected_features = ['title', 'authors', 'categories', 'published_year']
print(f"Selected features: {selected_features}")

Selected features: ['title', 'authors', 'categories', 'published_year']


In [10]:
# Fill any missing values in these features with an empty string
for feature in selected_features:
    df[feature] = df[feature].fillna('')

In [None]:
# Combine the selected features into a single string for each book
def combine_features(row):
    return f"{row['title']} {row['authors']} {row['categories']} {row['published_year']}"
df['combined_features'] = df.apply(combine_features, axis=1)

print("\nExample of a combined feature string:")
print(df['combined_features'].iloc[0])


Example of a combined feature string:
Gilead Marilynne Robinson Fiction 2004.0


Building the mdoel

In [12]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
feature_vectors = vectorizer.fit_transform(df['combined_features'])
print("Shape of the feature vectors (books, vocabulary size):")
print(feature_vectors.shape)

Shape of the feature vectors (books, vocabulary size):
(6810, 10364)


In [13]:
# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(feature_vectors)

print("Shape of the cosine similarity matrix (books, books):")
print(cosine_sim_matrix.shape)
print("\nExample similarity scores for the first book:")
print(cosine_sim_matrix[0][:10])

Shape of the cosine similarity matrix (books, books):
(6810, 6810)

Example similarity scores for the first book:
[1.         0.         0.01372095 0.01211446 0.         0.
 0.         0.         0.         0.        ]


In [None]:
import joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(cosine_sim_matrix, 'cosine_similarity_matrix.joblib')

Create the Recommendation Function

In [17]:
def get_recommendations(book_title, cosine_sim=cosine_sim_matrix, dataframe=df):
    all_titles = dataframe['title'].tolist()
    close_matches = difflib.get_close_matches(book_title, all_titles)
    if not close_matches:
        print(f"Sorry, no close match found for '{book_title}'. Please try another book.")
        return
    closest_match = close_matches[0]
    print(f"Closest match found: {closest_match}\n")
    book_index = dataframe[dataframe.title == closest_match].index[0]
    similarity_scores = list(enumerate(cosine_sim[book_index]))
    sorted_similar_books = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    print(f'Books recommended for you based on "{closest_match}":\n')
    i = 1
    for book in sorted_similar_books:
        index = book[0]
        if index != book_index:
            title_from_index = dataframe.iloc[index]['title']
            print(f"{i}. {title_from_index}")
            i += 1
        if i > 10:
            break
book_name = input("Enter your favourite book name: ")
get_recommendations(book_name)        

Closest match found: Marilyn Monroe

Books recommended for you based on "Marilyn Monroe":

1. The Art of Alfred Hitchcock
2. The Dark Side Of Genius
3. Marilyn Manson
4. Laura
5. The Beach House
6. P.J. Funnybunny Camps Out
7. The Long Hard Road Out of Hell
8. Muhammad
9. Trump
10. Marilyn Manson


In [18]:
# Save the model
import joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
joblib.dump(cosine_sim_matrix, 'cosine_similarity_matrix.joblib')

['cosine_similarity_matrix.joblib']