In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re
from tqdm.auto import tqdm

In [None]:
# Load the cleaned book metadata and print a per-column dtype / non-null summary.
# NOTE(review): hardcoded absolute path under /content — confirm it exists in the target environment.
df = pd.read_csv("/content/cleaned_book_details (2).csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2060 entries, 0 to 2059
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   book_name           2060 non-null   object
 1   author              2007 non-null   object
 2   year_of_publishing  2039 non-null   object
 3   plot                2060 non-null   object
 4   genre               1637 non-null   object
 5   description         2034 non-null   object
 6   page_number         1623 non-null   object
dtypes: object(7)
memory usage: 112.8+ KB


In [None]:
# Sentence-embedding model; its encode() is called below for both book texts and user queries.
model = SentenceTransformer('all-mpnet-base-v2')

# AVERAGING — averaged chunk embeddings of description + plot

In [None]:
# Rebind `df` to the table that already carries an `avg_embedding` column
# (presumably written by an earlier run of this section — confirm).
df = pd.read_csv('/content/embeddings_sentence_transformers_averaging.csv')

def parse_numpy_vector(s):
    """Parse the text form of a 1-D numeric vector back into a NumPy array.

    Accepts both the whitespace-separated form (e.g. ``"[0.1 0.2\\n 0.3]"``)
    and the comma-separated form (e.g. ``"[0.1, 0.2, 0.3]"``), with optional
    surrounding whitespace and square brackets.

    Args:
        s: Cell value to parse.

    Returns:
        A float64 ``np.ndarray`` when ``s`` is a string, otherwise ``None``
        (for example for missing values).

    Raises:
        ValueError: If a token is not a valid float, instead of silently
            returning a truncated vector.
    """
    if not isinstance(s, str):
        return None
    # Trim outer whitespace first so the brackets really sit at the edges,
    # then treat commas like whitespace so both serialisations parse fully.
    tokens = s.strip().strip("[]").replace(",", " ").split()
    return np.array([float(token) for token in tokens], dtype=float)

# Turn the stored text of each embedding back into a NumPy vector
# (parse_numpy_vector yields None for cells that are not strings).
df['avg_embedding'] = df['avg_embedding'].apply(parse_numpy_vector)

df.head()

Unnamed: 0,book_name,author,year_of_publishing,plot,genre,description,page_number,combined_text,avg_embedding
0,The 12.30 from Croydon,Freeman Wills Crofts,1934,"Set in Yorkshire and London in 1933, The 12.30...",Mystery,The 12.30 from Croydon (U.S. title: Wilful and...,,The 12.30 from Croydon (U.S. title: Wilful and...,"[-0.00849666819, 0.0138319647, 0.00705731707, ..."
1,The Final Unfinished Voyage of Jack Aubrey,Patrick O'Brian,2004,The story begins with Surprise in the Strait o...,Historical novel,The Final Unfinished Voyage of Jack Aubrey is ...,"144 first edition, hardback",The Final Unfinished Voyage of Jack Aubrey is ...,"[0.00486886175, 0.0152872959, 0.0083897775, -0..."
2,30 Days in Sydney,Peter Carey,"July 15, 2010 (2010-07-15)","The book takes the form of an impressionistic,...",,30 Days in Sydney is a book written by Austral...,256,30 Days in Sydney is a book written by Austral...,"[-0.0544374436, 0.043370232, -0.00396791985, -..."
3,The Thirty-Nine Steps,John Buchan,1915[1],"The story's narrator, Richard Hannay, arrives ...",Adventure novel,The Thirty-Nine Steps is a 1915 adventure nove...,253[1],The Thirty-Nine Steps is a 1915 adventure nove...,"[0.000661438331, -0.0416843779, 0.0112594133, ..."
4,334 (novel),Thomas M. Disch,1972 (MacGibbon & Kee),The future in 334 has brought few technologica...,"Dystopian, science fiction",334 is a 1972 dystopian science fiction novel ...,201,334 is a 1972 dystopian science fiction novel ...,"[-0.0130084911, -0.0215216614, 0.0124294832, 0..."


In [None]:
def combine_features(row):
    """Build the text to embed for one book from its description and plot.

    Missing values (``NaN``/``None``) are skipped rather than formatted into
    the string, so a book without a description does not get the literal
    word "nan" prepended to its plot.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            ``'description'`` and ``'plot'``.

    Returns:
        The available parts joined by a single space, description first;
        an empty string when both parts are missing.
    """
    parts = (row['description'], row['plot'])
    return ' '.join(str(part) for part in parts if pd.notna(part))

# Build one text per book with combine_features, then squeeze every run of
# whitespace (spaces, tabs, newlines) down to a single space.
combined_series = df.apply(combine_features, axis=1).fillna('')
df['combined_text'] = combined_series.apply(
    lambda text: re.sub(r'\s+', ' ', text)
)

In [None]:
def split_text(text, chunk_size=300, overlap=30):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing
    chunk consisting only of already-covered words is emitted.

    Args:
        text: String to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        List of chunk strings whose words are joined by single spaces;
        an empty list if ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` and ``overlap`` do not give a positive
            step between chunk starts.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This chunk already covers the last word; a further chunk would
        # contain nothing but the overlap and double-count the tail.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
def get_average_embedding(text, model, chunk_size=300, overlap=30, min_words=50):
    """Embed a long text as the mean of the embeddings of its chunks.

    The text is cut into overlapping word chunks with ``split_text``, every
    chunk is encoded with ``model.encode`` and the chunk embeddings are
    averaged along axis 0.

    Inputs that are not strings, or that contain fewer than ``min_words``
    whitespace-separated words, are NOT embedded themselves: the embedding
    of the empty string is returned instead, so all such inputs share one
    vector.

    Args:
        text: Text to embed (any non-string value takes the fallback path).
        model: Object exposing ``encode``; called with a single string for
            the fallback and with a list of strings plus
            ``show_progress_bar=False`` for the chunks.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by neighbouring chunks.
        min_words: Minimum word count required for a text to be chunked
            and embedded; shorter texts use the empty-string fallback.

    Returns:
        The mean over axis 0 of the chunk embeddings, or ``model.encode("")``
        on the fallback path.
    """
    if not isinstance(text, str) or len(text.split()) < min_words:
        return model.encode("")

    chunks = split_text(text, chunk_size, overlap)
    if not chunks:
        # Only reachable with a very small min_words and a wordless text;
        # averaging an empty batch would produce NaNs.
        return model.encode("")
    chunk_embeddings = model.encode(chunks, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

In [None]:
# tqdm.pandas() must run first: it is what adds `progress_apply` to pandas objects.
tqdm.pandas()
# Compute an averaged embedding for every book's combined text, overwriting
# whatever the `avg_embedding` column held before.
df['avg_embedding'] = df['combined_text'].progress_apply(
    lambda x: get_average_embedding(x, model))

In [None]:
def get_book_recommendations(query, top_k=3):
    """Return the books whose stored embeddings are most similar to a query.

    Uses the notebook-level ``model`` to embed the query and the
    notebook-level ``df`` (column ``avg_embedding``) as the catalogue.

    Args:
        query: Free-text description of the desired book.
        top_k: Maximum number of books to return; must be non-negative.
            ``0`` yields an empty result.

    Returns:
        DataFrame with the columns ``book_name``, ``author``, ``genre`` and
        ``similarity_score`` (cosine similarity formatted as a string with
        four decimals), ordered from most to least similar.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        np.vstack(df['avg_embedding'])
    )[0]

    # Reverse first, then slice: `[-top_k:]` would select every row when
    # top_k == 0 because -0 == 0.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return df.iloc[top_indices][['book_name', 'author', 'genre']].assign(
        similarity_score=[f"{sim:.4f}" for sim in similarities[top_indices]]
    )

In [None]:
# Spot-check query (wizard-school fantasy); prints the default top-3 matches with their scores.
user_query = "A young wizard discovers his magical heritage and attends a school of magic while facing a dark lord"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                                   book_name         author          genre  \
300                       Changeling (novel)  Roger Zelazny        Fantasy   
1489                        Renegade's Magic     Robin Hobb  Fantasy novel   
812   Harry Potter and the Half-Blood Prince  J. K. Rowling        Fantasy   

     similarity_score  
300            0.3814  
1489           0.3740  
812            0.3692  


In [None]:
# Spot-check query (historical fiction about women); prints the default top-3 matches with their scores.
user_query = "Historical novel about brave women and obstacles they have to overcome"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                              book_name                  author  \
627   Fearless, A Novel of Sarah Bowman  Lucia St. Clair Robson   
1776                            Ten Men          Alexandra Gray   
695       The French Lieutenant's Woman             John Fowles   

                                                  genre similarity_score  
627                     Historical novel, Western novel           0.5376  
1776                                              Novel           0.4914  
695   Postmodern literature, romance novel, historic...           0.4787  


In [None]:
# Spot-check query (animals and humans); prints the default top-3 matches with their scores.
user_query = "story about animals and their friendship with humans"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                 book_name          author                        genre  \
2054  The Zookeeper's Wife  Diane Ackerman  HistoryBiographyNon-fiction   
80             Animal Farm   George Orwell             Political satire   
2053              Zoo City   Lauren Beukes                          NaN   

     similarity_score  
2054           0.3988  
80             0.3947  
2053           0.3918  


In [None]:
# Spot-check query (conversational phrasing, medieval war and love); prints the default top-3 matches with their scores.
user_query = "I want to read some sad book about war and love. Medieval ages, epic battles and drama"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                        book_name  \
921       The Incorporated Knight   
1079  Love in the Time of Cholera   
1904                The Valkyries   

                                              author  \
921   L. Sprague de Camp and Catherine Crook de Camp   
1079                          Gabriel García Márquez   
1904                                Maniyan Seminary   

                         genre similarity_score  
921                    Fantasy           0.4560  
1079             Romance novel           0.4484  
1904  Love, Mystery, Spiritual           0.4483  


In [None]:
# Spot-check query (space exploration); prints the default top-3 matches with their scores.
user_query = "Book about space travelers, discovery of mysterious planets inhabited by strange life forms"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                book_name         author            genre similarity_score
1115  Man and the Planets   Duncan Lunan        Astronomy           0.4128
23      Across the Zodiac     Percy Greg  Science fiction           0.4058
633        Fiasco (novel)  Stanisław Lem  Science fiction           0.3930


In [None]:
# Persist the frame, embeddings included, as a pickle in the working directory.
# NOTE(review): this section loads a .csv with the same base name but saves a .pkl —
# confirm which file later runs are expected to read.
df.to_pickle("embeddings_sentence_transformers_averaging.pkl")

# JUST PLOT — averaged chunk embeddings of the plot only

In [None]:
# Rebind `df` to the plot-only embeddings table, replacing the frame used in the previous section
# (presumably written by an earlier run of this section — confirm).
df = pd.read_csv('/content/embeddings_sentence_transformers_just_plot.csv')

def parse_numpy_vector(s):
    """Parse the text form of a 1-D numeric vector back into a NumPy array.

    Accepts both the whitespace-separated form (e.g. ``"[0.1 0.2\\n 0.3]"``)
    and the comma-separated form (e.g. ``"[0.1, 0.2, 0.3]"``), with optional
    surrounding whitespace and square brackets.

    Args:
        s: Cell value to parse.

    Returns:
        A float64 ``np.ndarray`` when ``s`` is a string, otherwise ``None``
        (for example for missing values).

    Raises:
        ValueError: If a token is not a valid float, instead of silently
            returning a truncated vector.
    """
    if not isinstance(s, str):
        return None
    # Trim outer whitespace first so the brackets really sit at the edges,
    # then treat commas like whitespace so both serialisations parse fully.
    tokens = s.strip().strip("[]").replace(",", " ").split()
    return np.array([float(token) for token in tokens], dtype=float)

# Turn the stored text of each embedding back into a NumPy vector
# (parse_numpy_vector yields None for cells that are not strings).
df['avg_embedding'] = df['avg_embedding'].apply(parse_numpy_vector)

df.head()

Unnamed: 0,book_name,author,year_of_publishing,plot,genre,description,page_number,combined_text,avg_embedding
0,The 12.30 from Croydon,Freeman Wills Crofts,1934,"Set in Yorkshire and London in 1933, The 12.30...",Mystery,The 12.30 from Croydon (U.S. title: Wilful and...,,The 12.30 from Croydon (U.S. title: Wilful and...,"[-0.00677350163, 0.0147330984, 0.00583120296, ..."
1,The Final Unfinished Voyage of Jack Aubrey,Patrick O'Brian,2004,The story begins with Surprise in the Strait o...,Historical novel,The Final Unfinished Voyage of Jack Aubrey is ...,"144 first edition, hardback",The Final Unfinished Voyage of Jack Aubrey is ...,"[-0.00216862792, -0.000194263484, 0.001734255,..."
2,30 Days in Sydney,Peter Carey,"July 15, 2010 (2010-07-15)","The book takes the form of an impressionistic,...",,30 Days in Sydney is a book written by Austral...,256,30 Days in Sydney is a book written by Austral...,"[-0.051732827, 0.0457235053, -0.0037406988, -0..."
3,The Thirty-Nine Steps,John Buchan,1915[1],"The story's narrator, Richard Hannay, arrives ...",Adventure novel,The Thirty-Nine Steps is a 1915 adventure nove...,253[1],The Thirty-Nine Steps is a 1915 adventure nove...,"[-0.00900641177, -0.0162965357, 0.0142708626, ..."
4,334 (novel),Thomas M. Disch,1972 (MacGibbon & Kee),The future in 334 has brought few technologica...,"Dystopian, science fiction",334 is a 1972 dystopian science fiction novel ...,201,334 is a 1972 dystopian science fiction novel ...,"[-0.0407156944, 0.0323059894, 0.00229117461, 0..."


In [None]:
def split_text(text, chunk_size=300, overlap=30):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. Chunking
    stops as soon as a chunk reaches the end of the text, so no trailing
    chunk consisting only of already-covered words is emitted.

    Args:
        text: String to split; words are obtained with ``str.split()``.
        chunk_size: Maximum number of words per chunk (must be > 0).
        overlap: Number of words shared by neighbouring chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        List of chunk strings whose words are joined by single spaces;
        an empty list if ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` and ``overlap`` do not give a positive
            step between chunk starts.
    """
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be positive and overlap must satisfy "
            "0 <= overlap < chunk_size"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # This chunk already covers the last word; a further chunk would
        # contain nothing but the overlap and double-count the tail.
        if start + chunk_size >= len(words):
            break
    return chunks

In [None]:
def get_average_embedding(text, model, chunk_size=300, overlap=30, min_words=50):
    """Embed a long text as the mean of the embeddings of its chunks.

    The text is cut into overlapping word chunks with ``split_text``, every
    chunk is encoded with ``model.encode`` and the chunk embeddings are
    averaged along axis 0.

    Inputs that are not strings, or that contain fewer than ``min_words``
    whitespace-separated words, are NOT embedded themselves: the embedding
    of the empty string is returned instead, so all such inputs share one
    vector.

    Args:
        text: Text to embed (any non-string value takes the fallback path).
        model: Object exposing ``encode``; called with a single string for
            the fallback and with a list of strings plus
            ``show_progress_bar=False`` for the chunks.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by neighbouring chunks.
        min_words: Minimum word count required for a text to be chunked
            and embedded; shorter texts use the empty-string fallback.

    Returns:
        The mean over axis 0 of the chunk embeddings, or ``model.encode("")``
        on the fallback path.
    """
    if not isinstance(text, str) or len(text.split()) < min_words:
        return model.encode("")

    chunks = split_text(text, chunk_size, overlap)
    if not chunks:
        # Only reachable with a very small min_words and a wordless text;
        # averaging an empty batch would produce NaNs.
        return model.encode("")
    chunk_embeddings = model.encode(chunks, show_progress_bar=False)
    return np.mean(chunk_embeddings, axis=0)

In [None]:
# tqdm.pandas() must run first: it is what adds `progress_apply` to pandas objects.
tqdm.pandas()
# Compute an averaged embedding from each book's plot only, overwriting
# whatever the `avg_embedding` column held before.
df['avg_embedding'] = df['plot'].progress_apply(
    lambda x: get_average_embedding(x, model))

In [None]:
def get_book_recommendations(query, top_k=3):
    """Return the books whose stored embeddings are most similar to a query.

    Uses the notebook-level ``model`` to embed the query and the
    notebook-level ``df`` (column ``avg_embedding``) as the catalogue.

    Args:
        query: Free-text description of the desired book.
        top_k: Maximum number of books to return; must be non-negative.
            ``0`` yields an empty result.

    Returns:
        DataFrame with the columns ``book_name``, ``author``, ``genre`` and
        ``similarity_score`` (cosine similarity formatted as a string with
        four decimals), ordered from most to least similar.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    query_embedding = model.encode([query])

    similarities = cosine_similarity(
        query_embedding,
        np.vstack(df['avg_embedding'])
    )[0]

    # Reverse first, then slice: `[-top_k:]` would select every row when
    # top_k == 0 because -0 == 0.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    return df.iloc[top_indices][['book_name', 'author', 'genre']].assign(
        similarity_score=[f"{sim:.4f}" for sim in similarities[top_indices]]
    )

In [None]:
# Spot-check query (wizard-school fantasy) against the plot-only embeddings; prints the default top-3 matches.
user_query = "A young wizard discovers his magical heritage and attends a school of magic while facing a dark lord"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                                   book_name          author            genre  \
808  Harry Potter and the Chamber of Secrets   J. K. Rowling          Fantasy   
201                           The Blue Sword  Robin McKinley          Fantasy   
159                        Beneath the Moors    Brian Lumley  Horror, fantasy   

    similarity_score  
808           0.3737  
201           0.3732  
159           0.3681  


In [None]:
# Spot-check query (historical fiction about women) against the plot-only embeddings; prints the default top-3 matches.
user_query = "Historical novel about brave women and obstacles they have to overcome"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                         book_name                    author  \
77                The Angel Makers           Jessica Gregson   
828                Herland (novel)  Charlotte Perkins Gilman   
695  The French Lieutenant's Woman               John Fowles   

                                                 genre similarity_score  
77                                               Novel           0.5094  
828          Feminist utopia, feminist science fiction           0.4707  
695  Postmodern literature, romance novel, historic...           0.4661  


In [None]:
# Spot-check query (animals and humans) against the plot-only embeddings; prints the default top-3 matches.
user_query = "story about animals and their friendship with humans"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                 book_name          author                        genre  \
2054  The Zookeeper's Wife  Diane Ackerman  HistoryBiographyNon-fiction   
1028            Life of Pi     Yann Martel            Adventure fiction   
2053              Zoo City   Lauren Beukes                          NaN   

     similarity_score  
2054           0.3887  
1028           0.3809  
2053           0.3793  


In [None]:
# Spot-check query (conversational phrasing, medieval war and love) against the plot-only embeddings; prints the default top-3 matches.
user_query = "I want to read some sad book about war and love. Medieval ages, epic battles and drama"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                        book_name                  author  \
77               The Angel Makers         Jessica Gregson   
1079  Love in the Time of Cholera  Gabriel García Márquez   
1340                 Parade's End         Ford Madox Ford   

                                    genre similarity_score  
77                                  Novel           0.4675  
1079                        Romance novel           0.4330  
1340  Historical fiction, modernist novel           0.4235  


In [None]:
# Spot-check query (space exploration) against the plot-only embeddings; prints the default top-3 matches.
user_query = "Book about space travelers, discovery of mysterious planets inhabited by strange life forms"
recommendations = get_book_recommendations(user_query)
print(recommendations[['book_name', 'author', 'genre', 'similarity_score']])

                book_name              author            genre  \
1115  Man and the Planets        Duncan Lunan        Astronomy   
23      Across the Zodiac          Percy Greg  Science fiction   
161       Between Planets  Robert A. Heinlein  Science fiction   

     similarity_score  
1115           0.4270  
23             0.3997  
161            0.3625  


In [None]:
# Save the frame without its index to a CSV in the working directory.
# NOTE(review): when this file is reloaded, `avg_embedding` is read as text and re-parsed with
# parse_numpy_vector — confirm the written text form round-trips for the full vector length.
df.to_csv("embeddings_sentence_transformers_just_plot.csv", index=False)