In [17]:
# Import needed modules
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
df = pd.read_csv('data/books.csv')

In [19]:
df.head(2)

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,genre,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   genre           6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


## Feature Extraction

In [27]:
df = df[['title', 'authors','genre','published_year','average_rating']]
df.head()
# df.to_csv('data/book_data.csv')

Unnamed: 0,title,authors,genre,published_year,average_rating
0,Gilead,Marilynne Robinson,Fiction,2004.0,3.85
1,Spider's Web,Charles Osborne;Agatha Christie,Detective and mystery stories,2000.0,3.83
2,The One Tree,Stephen R. Donaldson,American fiction,1982.0,3.97
3,Rage of angels,Sidney Sheldon,Fiction,1993.0,3.93
4,The Four Loves,Clive Staples Lewis,Christian life,2002.0,4.15


In [21]:
# Selecting the relevant features for recommendation
selected_features = ['title','authors','genre','published_year','average_rating']
print(selected_features)

['title', 'authors', 'genre', 'published_year', 'average_rating']


In [28]:
for feature in selected_features:
    df[feature] = df[feature].fillna('')

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           6810 non-null   object
 1   authors         6810 non-null   object
 2   genre           6810 non-null   object
 3   published_year  6810 non-null   object
 4   average_rating  6810 non-null   object
dtypes: object(5)
memory usage: 266.1+ KB


In [39]:
df.to_csv('data/book_data.csv')

In [40]:
# combining all the 5 selected features
combined_features = df['title'] + ' ' + df['genre'] + ' ' + df['authors'] + ' ' + f"{df['published_year']}" + ' '  + f"{df['average_rating']}"
combined_features

0       Gilead Fiction Marilynne Robinson 0       2004...
1       Spider's Web Detective and mystery stories Cha...
2       The One Tree American fiction Stephen R. Donal...
3       Rage of angels Fiction Sidney Sheldon 0       ...
4       The Four Loves Christian life Clive Staples Le...
                              ...                        
6805    I Am that Philosophy Sri Nisargadatta Maharaj;...
6806    Secrets Of The Heart Mysticism Khalil Gibran 0...
6807    Fahrenheit 451 Book burning Ray Bradbury 0    ...
6808    The Berlin Phenomenology History Georg Wilhelm...
6809    'I'm Telling You Stories' Literary Criticism H...
Length: 6810, dtype: object

In [41]:
# converting the text data to feature vectors
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(combined_features)

In [42]:
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 240347 stored elements and shape (6810, 10556)>
  Coords	Values
  (0, 3890)	0.5459692338247633
  (0, 3430)	0.10380631444965349
  (0, 6021)	0.5459692338247633
  (0, 8043)	0.40540667193189694
  (0, 113)	0.11955774516606114
  (0, 109)	0.05977887258303057
  (0, 93)	0.05977887258303057
  (0, 99)	0.11955774516606114
  (0, 111)	0.05977887258303057
  (0, 140)	0.11955774516606114
  (0, 104)	0.05977887258303057
  (0, 141)	0.11955774516606114
  (0, 142)	0.11955774516606114
  (0, 143)	0.11955774516606114
  (0, 92)	0.05977887258303057
  (0, 144)	0.11955774516606114
  (0, 103)	0.05977887258303057
  (0, 6663)	0.11955774516606114
  (0, 7637)	0.05977887258303057
  (0, 5565)	0.11955774516606114
  (0, 145)	0.11955774516606114
  (0, 2848)	0.11955774516606114
  (0, 6902)	0.11955774516606114
  (0, 150)	0.05977887258303057
  (0, 149)	0.05977887258303057
  :	:
  (6809, 103)	0.04512598246612936
  (6809, 6663)	0.09025196493225872
  (6809, 7637)	0.045

In [37]:
# getting the similarity scores using cosine similarity
similarity = cosine_similarity(feature_vectors, feature_vectors)

In [43]:
list_of_all_titles = df['title'].tolist()
print(list_of_all_titles)



In [45]:
# getting the book name from the user
book_name = 'Rage of angels'
print(book_name)

find_close_match = difflib.get_close_matches(book_name, list_of_all_titles)
print(find_close_match)

# finding the index of the book with title
close_match = find_close_match[0]
index_of_the_book = df[df.title == close_match].index[0]
print(index_of_the_book)

# getting a list of similar books
similarity_score = list(enumerate(similarity[index_of_the_book]))
print(similarity_score)

sorted_similar_books = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_books)

top_sim = sorted_similar_books[:5]
print(top_sim)

# print the name of similar books based on the index
i = 1

for book in sorted_similar_books:
    index = book[0]
    title_from_index = df[df.index==index]['title'].values[0]
    if (i < 6):
        print(i, '-', title_from_index)
        i += 1

Rage of angels
['Rage of angels', 'Bag of Bones', 'The Rape of Nanking']
3
[(0, np.float64(0.22474765867355428)), (1, np.float64(0.17040628718571935)), (2, np.float64(0.22489012856701815)), (3, np.float64(1.0)), (4, np.float64(0.18122524425068132)), (5, np.float64(0.18755934947441869)), (6, np.float64(0.21932792474629387)), (7, np.float64(0.1884668545310766)), (8, np.float64(0.1474974356610149)), (9, np.float64(0.5480476281584461)), (10, np.float64(0.47992357823391985)), (11, np.float64(0.2017271966648854)), (12, np.float64(0.2178789977922726)), (13, np.float64(0.15267525789590947)), (14, np.float64(0.15991184820342005)), (15, np.float64(0.19338814890977776)), (16, np.float64(0.2097102823589532)), (17, np.float64(0.1645357511649547)), (18, np.float64(0.2038750371636068)), (19, np.float64(0.19781079376377605)), (20, np.float64(0.19647227661812455)), (21, np.float64(0.18088959699997303)), (22, np.float64(0.19882711151081503)), (23, np.float64(0.16232676749497804)), (24, np.float64(0.2025

In [46]:
book_name = 'Rage of angels'

list_of_all_titles = df['title'].tolist()

find_close_match = difflib.get_close_matches(book_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_book = df[df.title == close_match].index[0]

similarity_score = list(enumerate(similarity[index_of_the_book]))

sorted_similar_books = sorted(similarity_score, key = lambda x:x[1], reverse = True) 

print('Books suggested for you : \n')

i = 1

for book in sorted_similar_books:
    index = book[0]
    title_from_index = df[df.index==index]['title'].values[0]
    if (i < 30):
        print(i, '.',title_from_index)
        i+=1


Books suggested for you : 

1 . Rage of angels
2 . The Sky Is Falling
3 . Master of the Game
4 . Tell Me Your Dreams
5 . Nothing Lasts Forever
6 . Gold Rage
7 . If Tomorrow Comes
8 . What Angels Fear
9 . Angels & Demons
10 . Angels
11 . Desolation Angels
12 . The Killer Angels
13 . My Perfect Life
14 . Love
15 . The Valkyries
16 . The City of Falling Angels
17 . Confessions of a Teenage Drama Queen
18 . Angels and Demons
19 . You Bright and Risen Angels
20 . The Gunslinger
21 . Atonement
22 . The Book of the Dragon
23 . The Lighthouse
24 . V.
25 . Crime Novels
26 . J R
27 . Out of this World
28 . It
29 . History -


In [47]:
list_of_all_genre = df['genre'].unique().tolist()
print(list_of_all_genre)

list_of_all_avg_rating = df['average_rating'].unique().tolist()
print(list_of_all_avg_rating)

['Fiction', 'Detective and mystery stories', 'American fiction', 'Christian life', 'Authors, English', 'Africa, East', 'Hyland, Morn (Fictitious character)', 'Adventure stories', 'Arthurian romances', 'Fantasy fiction', '', 'English drama', 'Country life', 'English fiction', 'Clergy', 'Aubrey, Jack (Fictitious character)', 'Detective and mystery stories, English', 'Black Death', 'Human cloning', 'Science fiction', 'Great Britain', 'American essays', 'China', 'Capitalism', 'Ireland', 'Juvenile Fiction', "Children's stories, English", 'Male friendship', 'Literary Collections', 'Beresford, Tommy (Fictitious character)', 'Imaginary wars and battles', 'Dysfunctional families', 'Poirot, Hercule (Fictitious character)', 'Christmas stories', 'Marple, Jane (Fictitious character)', 'Belgians', 'Battle, Superintendent (Fictitious character)', 'Baggins, Frodo (Fictitious character)', 'Cambridge (Mass.)', 'Business enterprises', 'Emotional problems', 'Characters and characteristics in motion pictur

In [50]:
# Example lists
list_of_all_genre = df['genre'].unique().tolist()
list_of_all_avg_rating = df['average_rating'].unique().tolist()


user_genre = 'Fiction'
user_avg_rating = 3.5

# Find the closest match for genre
closest_genre = difflib.get_close_matches(user_genre, list_of_all_genre, n=1)
print(f"Closest genre match: {closest_genre}")

# Find the closest match for average rating
closest_avg_rating = difflib.get_close_matches(str(user_avg_rating), list(map(str, list_of_all_avg_rating)), n=1)
print(f"Closest average rating match: {closest_avg_rating}")

# Ensure that both matches were found
if closest_genre and closest_avg_rating:
    matched_genre = closest_genre[0]
    matched_avg_rating = float(closest_avg_rating[0])  # Convert back to float

    # Filter DataFrame for matching genre and average rating
    filtered_df = df[(df['genre'] == matched_genre) & (df['average_rating'] == matched_avg_rating)]
    
    if not filtered_df.empty:
        # Assume `similarity` is a precomputed similarity matrix based on some features like genre and average rating
        # Calculate similarity scores for books in the filtered DataFrame
        index_of_the_book = filtered_df.index[0]  # Get the first match's index
        similarity_score = list(enumerate(similarity[index_of_the_book]))

        # Sort and get the top similar books
        sorted_similar_books = sorted(similarity_score, key=lambda x: x[1], reverse=True)
        top_sim = sorted_similar_books[:10]

        # Print the names of similar books
        for i, book in enumerate(top_sim, start=1):
            index = book[0]
            title_from_index = df.iloc[index]['title']  # Retrieve title based on index
            print(i, '-', title_from_index)
    else:
        print("No books found matching the genre and average rating.")
else:
    print("No close match found for the provided genre or average rating.")


Closest genre match: ['Fiction']
Closest average rating match: ['3.5']
1 - The Zero
2 - Zero
3 - Ivanhoe
4 - Count Zero
5 - The Antiquary
6 - Einstein
7 - Thumbsucker
8 - Love
9 - Monster
10 - Kissinger


In [51]:
import pickle

# Assuming `df`, `similarity`, `list_of_all_genre`, and `list_of_all_avg_rating` are already defined
with open('model_data.pkl', 'wb') as file:
    pickle.dump({
        'df': df,
        'similarity': similarity,
        'list_of_all_genre': list_of_all_genre,
        'list_of_all_avg_rating': list_of_all_avg_rating,
    }, file)

In [52]:
with open('model_data.pkl', 'rb') as file:
    model_data = pickle.load(file)

df_1 = model_data['df']
similarity = model_data['similarity']
list_of_all_genre = model_data['list_of_all_genre']
list_of_all_avg_rating = model_data['list_of_all_avg_rating']
