In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re
from tqdm.auto import tqdm

In [None]:
# Load the cleaned book details CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/cleaned_book_details (2).csv")
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   book_name           2060 non-null   object
 1   author              2007 non-null   object
 2   year_of_publishing  2039 non-null   object
 3   plot                2060 non-null   object
 4   genre               1637 non-null   object
 5   description         2034 non-null   object
 6   page_number         1623 non-null   object
dtypes: object(7)
memory usage: 112.8+ KB


In [None]:
# Sentence encoder shared by every embedding and query call below.
model = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Averaged chunk embeddings: description + plot

In [None]:
def combine_features(row):
    """Join a book's description and plot into a single text.

    Missing values (NaN/None) are skipped rather than interpolated, so a
    book without a description does not get the literal word "nan"
    prepended to its plot.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with
            'description' and 'plot' entries.

    Returns:
        The available fields joined by a single space; an empty string
        when both are missing.
    """
    fields = (row['description'], row['plot'])
    return ' '.join(str(field) for field in fields if pd.notna(field))

# Build one text per book, then collapse every run of whitespace
# (newlines, tabs, repeated spaces) into a single space.
df['combined_text'] = (
    df.apply(combine_features, axis=1)
    .fillna('')
    .apply(lambda raw_text: re.sub(r'\s+', ' ', raw_text))
)

In [None]:
def split_text(text, chunk_size=300, overlap=30):
    """Split text into overlapping, word-based chunks.

    Args:
        text: String to split; words are whitespace-separated tokens.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        List of chunk strings (words re-joined with single spaces).
        Empty when the text contains no words.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is not in
            the range [0, chunk_size).
    """
    # NOTE(review): chunk_size counts words, not model tokens -- confirm the
    # default fits within the encoder's maximum sequence length.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, stop: any further chunk would
        # only repeat words already covered by this one.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
def get_average_embedding(text, model, chunk_size=300, overlap=30):
    """Embed a text of any length by averaging its chunk embeddings.

    Args:
        text: Text to embed. Non-string values (e.g. NaN) and blank
            strings are treated as empty text.
        model: Encoder object exposing ``encode``; it is called with a
            single string or with a list of strings.
        chunk_size: Maximum words per chunk, forwarded to ``split_text``.
        overlap: Words shared by consecutive chunks, forwarded to
            ``split_text``.

    Returns:
        The element-wise mean (over chunks) of the chunk embeddings, or
        the embedding of the empty string when there is no text.
    """
    if not isinstance(text, str) or not text.strip():
        # Placeholder for rows without text, so every row still carries a
        # vector produced by the same model.
        return model.encode("")

    # Short texts are embedded as well (they simply form a single chunk)
    # instead of all collapsing onto the empty-string embedding.
    chunks = split_text(text, chunk_size, overlap)
    chunk_embeddings = model.encode(chunks, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

In [None]:
# Register `progress_apply` on pandas objects so the loop shows a progress bar.
tqdm.pandas()

# One averaged embedding per book, computed from the combined text column.
df['avg_embedding'] = df['combined_text'].progress_apply(
    lambda book_text: get_average_embedding(book_text, model=model)
)

  0%|          | 0/2060 [00:00<?, ?it/s]

In [None]:
def get_book_recommendations(query, top_k=3):
    """Return the books whose embeddings are most similar to a text query.

    Uses the module-level ``model`` to embed the query and the
    'avg_embedding' column of the module-level ``df`` as the candidates.

    Args:
        query: Free-text description of the desired book.
        top_k: Maximum number of books to return. Zero or a negative
            value yields an empty result.

    Returns:
        DataFrame with 'book_name', 'author' and 'genre' for the best
        matches, ordered from most to least similar, plus a
        'similarity_score' column holding the cosine similarity formatted
        as a string with four decimals.
    """
    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        np.vstack(df['avg_embedding'])
    )[0]

    # Sort best-first and take the head. Slicing the tail with `[-top_k:]`
    # would return every row for top_k == 0, because `-0` is `0`.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[:max(top_k, 0)]

    return df.iloc[top_indices][['book_name', 'author', 'genre']].assign(
        similarity_score=[f"{sim:.4f}" for sim in similarities[top_indices]]
    )

In [None]:
# Example query: a free-text description of the kind of book wanted.
user_query = "A young wizard discovers his magical heritage and attends a school of magic while facing a dark lord"
recommendations = get_book_recommendations(query=user_query)

# Print the matches with their similarity scores.
print(recommendations.loc[:, ['book_name', 'author', 'genre', 'similarity_score']])

                                   book_name         author          genre  \
300                       Changeling (novel)  Roger Zelazny        Fantasy   
1489                        Renegade's Magic     Robin Hobb  Fantasy novel   
812   Harry Potter and the Half-Blood Prince  J. K. Rowling        Fantasy   

     similarity_score  
300            0.3814  
1489           0.3740  
812            0.3692  


In [None]:
# Persist the frame, including the embedding column, without the index.
# NOTE(review): 'avg_embedding' presumably holds NumPy arrays, which CSV
# stores as their text representation -- confirm downstream readers can
# parse that format back into vectors.
df.to_csv(path_or_buf="embeddings_sentence_transformers_averaging.csv", index=False)

# Averaged chunk embeddings: plot only

In [None]:
def split_text(text, chunk_size=300, overlap=30):
    """Split text into overlapping, word-based chunks.

    Args:
        text: String to split; words are whitespace-separated tokens.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        List of chunk strings (words re-joined with single spaces).
        Empty when the text contains no words.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is not in
            the range [0, chunk_size).
    """
    # NOTE(review): chunk_size counts words, not model tokens -- confirm the
    # default fits within the encoder's maximum sequence length.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a chunk reaches the last word, stop: any further chunk would
        # only repeat words already covered by this one.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
def get_average_embedding(text, model, chunk_size=300, overlap=30):
    """Embed a text of any length by averaging its chunk embeddings.

    Args:
        text: Text to embed. Non-string values (e.g. NaN) and blank
            strings are treated as empty text.
        model: Encoder object exposing ``encode``; it is called with a
            single string or with a list of strings.
        chunk_size: Maximum words per chunk, forwarded to ``split_text``.
        overlap: Words shared by consecutive chunks, forwarded to
            ``split_text``.

    Returns:
        The element-wise mean (over chunks) of the chunk embeddings, or
        the embedding of the empty string when there is no text.
    """
    if not isinstance(text, str) or not text.strip():
        # Placeholder for rows without text, so every row still carries a
        # vector produced by the same model.
        return model.encode("")

    # Short texts are embedded as well (they simply form a single chunk)
    # instead of all collapsing onto the empty-string embedding.
    chunks = split_text(text, chunk_size, overlap)
    chunk_embeddings = model.encode(chunks, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

In [None]:
# Register `progress_apply` on pandas objects so the loop shows a progress bar.
tqdm.pandas()

# Recompute the embeddings from the 'plot' column alone; this overwrites
# whatever the 'avg_embedding' column held before.
df['avg_embedding'] = df['plot'].progress_apply(
    lambda plot_text: get_average_embedding(plot_text, model=model)
)

  0%|          | 0/2060 [00:00<?, ?it/s]

In [None]:
def get_book_recommendations(query, top_k=3):
    """Return the books whose embeddings are most similar to a text query.

    Uses the module-level ``model`` to embed the query and the
    'avg_embedding' column of the module-level ``df`` as the candidates.

    Args:
        query: Free-text description of the desired book.
        top_k: Maximum number of books to return. Zero or a negative
            value yields an empty result.

    Returns:
        DataFrame with 'book_name', 'author' and 'genre' for the best
        matches, ordered from most to least similar, plus a
        'similarity_score' column holding the cosine similarity formatted
        as a string with four decimals.
    """
    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        np.vstack(df['avg_embedding'])
    )[0]

    # Sort best-first and take the head. Slicing the tail with `[-top_k:]`
    # would return every row for top_k == 0, because `-0` is `0`.
    ranked = np.argsort(similarities)[::-1]
    top_indices = ranked[:max(top_k, 0)]

    return df.iloc[top_indices][['book_name', 'author', 'genre']].assign(
        similarity_score=[f"{sim:.4f}" for sim in similarities[top_indices]]
    )

In [None]:
# Same example query, now scored against the plot-only embeddings.
user_query = "A young wizard discovers his magical heritage and attends a school of magic while facing a dark lord"
recommendations = get_book_recommendations(query=user_query)

# Print the matches with their similarity scores.
print(recommendations.loc[:, ['book_name', 'author', 'genre', 'similarity_score']])

                                   book_name          author            genre  \
808  Harry Potter and the Chamber of Secrets   J. K. Rowling          Fantasy   
201                           The Blue Sword  Robin McKinley          Fantasy   
159                        Beneath the Moors    Brian Lumley  Horror, fantasy   

    similarity_score  
808           0.3737  
201           0.3732  
159           0.3681  


In [None]:
# Persist the frame with the plot-only embeddings, without the index.
# NOTE(review): 'avg_embedding' presumably holds NumPy arrays, which CSV
# stores as their text representation -- confirm downstream readers can
# parse that format back into vectors.
df.to_csv(path_or_buf="embeddings_sentence_transformers_just_plot.csv", index=False)