In [1]:
from tqdm import tqdm
import pandas as pd

In [2]:
# Load the three goodbooks-10k tables: book metadata, the tag vocabulary,
# and the per-book tag counts that link the two.
books_df, tags_df, book_tags_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'goodbooks-10k/books.csv',
        'goodbooks-10k/tags.csv',
        'goodbooks-10k/book_tags.csv',
    )
)

In [3]:
# Preview the tag vocabulary table (rich display of the frame as the cell output).
tags_df

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-
...,...,...
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ


In [4]:
# Lookup table: numeric tag id -> tag name, built row by row from the tag vocabulary.
tag_id_to_name = {
    tag_row.tag_id: tag_row.tag_name
    for tag_row in tags_df.itertuples(index=False)
}

In [7]:
# Shelf names that describe a reader's relationship to a book rather than
# its content; these are filtered out of every book's tag list.
useless_tags = {
    'to-read',
    'currently-reading',
    'favorites',
    'owned',
    'wishlist',
    'default',
    'my-books',
    'books-i-own',
    'library',
    'audio',
    'read',
    'books',
    'bookshelves',
    'owned-books',
}

In [8]:
# Map each goodreads_book_id to the cleaned, lower-cased names of its most
# frequently applied tags.
book_tags_map = {}
grouped = book_tags_df.groupby('goodreads_book_id')

for book_id, group in tqdm(grouped, desc="Processing tags..."):

    # take top 5 tags (by count); the useless ones are dropped afterwards,
    # so a book may end up with fewer than five names
    top_tags = group.sort_values('count', ascending=False).head(5)['tag_id'].tolist()

    tag_names = []
    for tid in top_tags:
        raw_name = tag_id_to_name.get(tid)
        # An unknown tag id (None) or a missing name in the CSV (NaN, a float)
        # has no usable text: skip it instead of appending '' or crashing
        # on .lower().
        if not isinstance(raw_name, str):
            continue

        tag_name = raw_name.lower()
        if not tag_name or tag_name in useless_tags:
            continue
        # Lower-casing can collapse distinct tag ids into the same name;
        # keep only the first (highest-count) occurrence.
        if tag_name not in tag_names:
            tag_names.append(tag_name)

    book_tags_map[book_id] = tag_names

Processing tags...: 100%|██████████| 10000/10000 [00:00<00:00, 18844.78it/s]


In [9]:
# List the available book metadata columns before picking the ones to keep.
print(list(books_df.columns))

['book_id', 'goodreads_book_id', 'best_book_id', 'work_id', 'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year', 'original_title', 'title', 'language_code', 'average_rating', 'ratings_count', 'work_ratings_count', 'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url', 'small_image_url']


In [10]:
# Assemble one plain dict per catalogue row. 'genres' is the book's cleaned
# tag list joined with ', ' (empty string when the book has no tags).
books = [
    {
        'title': book_row.title,
        'authors': book_row.authors,
        'average_rating': book_row.average_rating,
        'genres': ', '.join(book_tags_map.get(book_row.goodreads_book_id, [])),
    }
    for book_row in tqdm(
        books_df.itertuples(index=False),
        total=len(books_df),
        desc="Building books list...",
    )
]

Building books list...: 100%|██████████| 10000/10000 [00:00<00:00, 67435.47it/s]


In [11]:
# Sanity check: report how many book records were assembled.
print(f'{len(books)} books loaded.')

10000 books loaded.


## Building corpus

In [12]:
# One descriptive sentence per book (title, authors, tags); this is the text
# that gets embedded.
corpus = []
for book in books:
    corpus.append(f"{book['title']} by {book['authors']}. Tags: {book['genres']}")

# Show the first few entries as the cell output.
corpus[:5]


['The Hunger Games (The Hunger Games, #1) by Suzanne Collins. Tags: young-adult, fiction, dystopian',
 "Harry Potter and the Sorcerer's Stone (Harry Potter, #1) by J.K. Rowling, Mary GrandPré. Tags: fantasy, young-adult",
 'Twilight (Twilight, #1) by Stephenie Meyer. Tags: young-adult, fantasy, vampires, ya',
 'To Kill a Mockingbird by Harper Lee. Tags: classics, classic, historical-fiction',
 'The Great Gatsby by F. Scott Fitzgerald. Tags: classics, fiction, classic']

## Loading model

In [13]:
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model; the same encoder is used for the book
# corpus and for search queries so both live in one vector space.
encoder=SentenceTransformer('all-mpnet-base-v2')

In [14]:
import numpy as np

# Number of corpus entries encoded per step (and per model forward pass).
BATCH_SIZE = 64

# Encode the corpus chunk by chunk so tqdm can report progress.
embedding_rows = []
for i in tqdm(range(0, len(corpus), BATCH_SIZE), desc="Encoding books..."):
    batch = corpus[i:i + BATCH_SIZE]
    # encode() re-splits its input using its own batch_size (default 32), so
    # without passing BATCH_SIZE through, the constant above would not
    # control the size of the model's forward passes.
    embedding_rows.extend(encoder.encode(batch, batch_size=BATCH_SIZE))

# Stack the per-book vectors into a single 2-D array (one row per book).
book_embeddings = np.array(embedding_rows)
print(f"Embeddings shape: {book_embeddings.shape}")


Encoding books...: 100%|██████████| 157/157 [00:50<00:00,  3.09it/s]

Embeddings shape: (10000, 768)





In [15]:
import faiss
from sklearn.preprocessing import normalize

# Scale every embedding to unit L2 length: the inner product of unit vectors
# equals their cosine similarity.
embedding_dim = book_embeddings.shape[1]
book_embeddings_norm = normalize(book_embeddings, norm='l2', axis=1)

# Flat inner-product index over the normalised vectors (i.e. cosine search).
index = faiss.IndexFlatIP(embedding_dim)
index.add(book_embeddings_norm)

In [16]:
def find_five_books(query_text, top_n=5):
    """Return the books whose embeddings are most similar to a text query.

    The query is embedded with the module-level ``encoder``, L2-normalised,
    and searched against the module-level ``index``; the hit positions are
    then looked up in ``books``.

    Args:
        query_text: Free-text description of what the reader is looking for.
        top_n: Maximum number of books to return (default 5).

    Returns:
        A list of dicts with keys ``title``, ``authors``, ``genres`` and
        ``average_rating``, in the order returned by the index search. The
        list has fewer than ``top_n`` entries when the index holds fewer
        vectors, and is empty when ``top_n`` is not positive or the index
        is empty.
    """
    # Never ask the index for more neighbours than it contains: FAISS pads
    # the missing slots with id -1, which as a Python list index would
    # silently return the *last* book as a bogus hit.
    k = min(top_n, index.ntotal)
    if k <= 0:
        return []

    query_embedding = encoder.encode([query_text])
    query_embedding = normalize(query_embedding, axis=1)

    # Similarity scores are not needed, only the neighbour positions.
    _, neighbour_ids = index.search(query_embedding, k)

    results = []
    for idx in neighbour_ids[0]:
        # Defensive: skip any padding id in case the search still came up short.
        if idx < 0:
            continue
        b = books[idx]
        results.append({
            'title': b['title'],
            'authors': b['authors'],
            'genres': b['genres'],
            'average_rating': b['average_rating'],
        })
    return results


In [17]:
# Example search: print the ranked matches for a free-text query.
query = 'sci-fi dystopia tech'
results=find_five_books(query)

# Number the hits starting from 1 for display.
for i, r in enumerate(results, start=1):
    print(f"{i}. {r['title']}\n   Author: {r['authors']} \n   Rating: {r['average_rating']}\n   Top Tags: {r['genres']}\n")

1. Shift (Silo, #2)
   Author: Hugh Howey 
   Rating: 4.12
   Top Tags: science-fiction, sci-fi, fiction, dystopian

2. Metro 2033 (METRO, #1)
   Author: Dmitry Glukhovsky, M. David Drevs 
   Rating: 3.98
   Top Tags: science-fiction, sci-fi

3. Daemon (Daemon, #1)
   Author: Daniel Suarez 
   Rating: 4.17
   Top Tags: technology

4. We
   Author: Yevgeny Zamyatin, Clarence Brown 
   Rating: 3.95
   Top Tags: fiction, science-fiction, dystopia

5. Dust (Silo, #3)
   Author: Hugh Howey 
   Rating: 4.26
   Top Tags: science-fiction, sci-fi, fiction, dystopian

