In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from datetime import datetime

from IPython.core.display import HTML
import tba3102

In [2]:
def book_recommender(book_title, books, doc_sims, top_n=5):
    """Return the titles most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up. If it occurs more than once in ``books``, the
        first occurrence is used.
    books : array-like
        Titles, positionally aligned with ``doc_sims``. Must support
        element-wise ``==`` and integer-array indexing (e.g. a NumPy or
        pandas array).
    doc_sims : pandas.DataFrame
        Square similarity matrix; row ``i`` holds the similarity of book
        ``i`` to every book.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    Same container type as ``books``
        Up to ``top_n`` titles ordered from most to least similar. The
        queried book itself is never included.

    Raises
    ------
    IndexError
        If ``book_title`` is not present in ``books``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative, got {}'.format(top_n))

    # find book id (first match wins when a title is duplicated)
    match_idxs = np.where(books == book_title)[0]
    if len(match_idxs) == 0:
        raise IndexError('Book title not found: {!r}'.format(book_title))
    book_idx = match_idxs[0]

    # get book similarities
    book_similarities = doc_sims.iloc[book_idx].values

    # Rank every book from most to least similar. A stable sort keeps tied
    # scores in their original order, so results are deterministic.
    ranked_idxs = np.argsort(-book_similarities, kind='stable')

    # Drop the queried book by index instead of assuming it is ranked first:
    # with tied scores (e.g. duplicate blurbs) it may not be at position 0.
    similar_book_idxs = ranked_idxs[ranked_idxs != book_idx][:top_n]

    # return the top books
    return books[similar_book_idxs]

In [3]:
# Inject CSS so <pre> output keeps its whitespace and is not wrapped.
display(HTML("<style>pre { white-space: pre !important; }</style>"))
# Project helper; presumably applies pandas display options with a
# 300-character column width -- confirm against the tba3102 module.
tba3102.set_default_pandas_options(max_colwidth=300)

# Log the start time so it can be compared with the end time printed later.
print(f'Text processing started at {datetime.now()}')

Text processing started at 2023-03-26 05:27:31.079795


In [4]:
# Load the books dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    '../data/57000-books-with-cleaned-blurbs.csv',
    index_col=0,
)

In [5]:
# Unigram TF-IDF features; terms found in fewer than two blurbs are dropped.
tf = TfidfVectorizer(min_df=2, ngram_range=(1, 1))
# One row per blurb, one column per retained term.
tfidf_matrix = tf.fit_transform(df.loc[:, 'Cleaned_Blurb'].array)
tfidf_matrix.shape

(18159, 50275)

In [6]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
doc_sim = cosine_similarity(tfidf_matrix)
# Wrap in a DataFrame so rows can be fetched positionally with .iloc.
doc_sim_df = pd.DataFrame(data=doc_sim)
doc_sim_df

Unnamed: 0,0,1,2,3,4,...,18154,18155,18156,18157,18158
0,1.000000,0.000000,0.000000,0.019519,0.050120,...,0.000000,0.010671,0.000000,0.006381,0.020758
1,0.000000,1.000000,0.000000,0.000000,0.000000,...,0.000000,0.006679,0.000000,0.002863,0.001432
2,0.000000,0.000000,1.000000,0.000000,0.000000,...,0.000000,0.004177,0.009379,0.000000,0.045856
3,0.019519,0.000000,0.000000,1.000000,0.014969,...,0.019762,0.011640,0.007170,0.054990,0.003694
4,0.050120,0.000000,0.000000,0.014969,1.000000,...,0.000000,0.003192,0.000000,0.017500,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
18154,0.000000,0.000000,0.000000,0.019762,0.000000,...,1.000000,0.011801,0.016346,0.015046,0.013742
18155,0.010671,0.006679,0.004177,0.011640,0.003192,...,0.011801,1.000000,0.000000,0.016379,0.050169
18156,0.000000,0.000000,0.009379,0.007170,0.000000,...,0.016346,0.000000,1.000000,0.000000,0.012692
18157,0.006381,0.002863,0.000000,0.054990,0.017500,...,0.015046,0.016379,0.000000,1.000000,0.036493


In [7]:
# Number of books to demo, and a fixed seed so Restart & Run All always
# picks the same books. The previous random_state was itself drawn from the
# unseeded global RNG, which made every run sample different titles.
N_RANDOM_BOOKS = 10
SAMPLE_SEED = 42

# Titles in row order, so position i matches row i of the TF-IDF and
# similarity matrices built from the same DataFrame.
books_list = df['Title'].array
random_books = df.sample(n=N_RANDOM_BOOKS, replace=False, random_state=SAMPLE_SEED)
random_books_titles = random_books['Title'].tolist()

In [8]:
# Print the recommendations for each sampled title.
for book_title in random_books_titles:
    print(f'Book: {book_title}')

    recommended_books_titles = book_recommender(
        book_title=book_title,
        books=books_list,
        doc_sims=doc_sim_df,
    )

    print('Top 5 recommended Books:')
    for recommended_books_title in recommended_books_titles:
        print(f'\t{recommended_books_title}')

    # Blank line between books.
    print()

Book: Death Qualified (Barbara Holloway Novels (Paperback))
Top 5 recommended Books:
	Touching Spirit Bear
	Chosen Prey
	Nightingale's Gate
	Out of the Blue (Stewart Sisters Trilogy)
	Desperate Measures (Barbara Holloway Novels (Paperback))

Book: Letters to Montgomery Clift
Top 5 recommended Books:
	Remembering Lucy Maud Montgomery
	A Groom with a View : A Jane Jeffry Mystery (Jane Jeffry Mysteries (Paperback))
	Murder Pans Out
	The Tesseract
	Fugitive Mom (Harlewuin Superromance, No. 973)

Book: The Ultimate Weight Solution: The 7 Keys to Weight Loss Freedom
Top 5 recommended Books:
	Get with the Program!
	KISS Guide to Weight Loss
	La Antidieta
	Life Inside the Thin Cage: A Personal Look into the Hidden World of the Chronic Dieter
	The Business Plan for the Body

Book: Inventing A Nation: Washington, Adams, Jefferson
Top 5 recommended Books:
	Eagle's Cry : A Novel of the Louisiana Purchase (The American Story)
	Dreaming War: Blood for Oil and the Cheney-Bush Junta
	Rise to Rebellion

In [9]:
# Log the end time of the run.
print(f'Text processing ended at {datetime.now()}')

Text processing ended at 2023-03-26 05:27:38.625823
