In [None]:
from transformers import AutoTokenizer

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the book-details CSV into the DataFrame used by the rest of the notebook.
df_cleaned = pd.read_csv("/content/cleaned_book_details (2).csv")

In [None]:
def count_tokens(text, model_name="allenai/longformer-base-4096", tokenizer=None):
  """Return how many tokens `text` occupies, special tokens included.

  Args:
    text: The string to tokenize.
    model_name: Model identifier whose tokenizer is loaded with
      `AutoTokenizer.from_pretrained` when no `tokenizer` is supplied.
    tokenizer: Optional, already-loaded tokenizer to reuse. Passing one avoids
      re-loading the tokenizer on every call, which matters when counting
      tokens for many texts.

  Returns:
    The length of the `input_ids` sequence produced without truncation.
  """
  if tokenizer is None:
    # Backward-compatible default: load the tokenizer for `model_name`.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
  tokens = tokenizer(text, truncation=False, add_special_tokens=True)
  return len(tokens["input_ids"])

# Count the tokens of the plot stored under index label 2 and print the result.
text = df_cleaned["plot"][2]
token_count = count_tokens(text)
print(f"Token count: {token_count}")

Token count: 231


In [None]:
# Load the pretrained tokenizer and model for the checkpoint named below;
# both are used as module-level globals by the embedding helpers.
model_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# and move the model onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
    

In [None]:
# Register tqdm's pandas integration so Series.progress_apply is available below.
tqdm.pandas()

In [None]:
def get_batch_embeddings(text_list, batch_size=8):
  """Embed a list of texts in batches with the module-level tokenizer/model.

  Args:
    text_list: Sequence of texts; entries that are NaN/None or the empty
      string are not embedded.
    batch_size: Number of texts sent through the model per forward pass.

  Returns:
    A list the same length as `text_list`. Each embedded entry holds the
    vector found at position 0 of the model's last hidden state (presumably
    the leading special token); skipped entries hold None.
  """
  # Positions of the texts that can actually be embedded.
  usable_positions = []
  for position, text in enumerate(text_list):
    if pd.notna(text) and text != "":
      usable_positions.append(position)

  results = [None] * len(text_list)

  batch_starts = range(0, len(usable_positions), batch_size)
  for start in tqdm(batch_starts, desc="Generating Embeddings in Batches"):
    chunk_positions = usable_positions[start:start + batch_size]
    chunk_texts = [text_list[position] for position in chunk_positions]

    encoded = tokenizer(chunk_texts, return_tensors="pt", padding=True, truncation=True, max_length=4096)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed.
    with torch.no_grad():
      hidden_states = model(**encoded).last_hidden_state

    first_token_vectors = hidden_states[:, 0, :].cpu().numpy()

    # Write each vector back to the slot of the text it came from.
    for position, vector in zip(chunk_positions, first_token_vectors):
      results[position] = vector

  return results

In [None]:
# Embed every plot and every description (batches of 8) and store the
# resulting vectors as new columns; entries without text stay None.
df_cleaned["plot_embedding"] = get_batch_embeddings(df_cleaned["plot"].tolist(), batch_size=8)
df_cleaned["description_embedding"] = get_batch_embeddings(df_cleaned["description"].tolist(), batch_size=8)

In [None]:
def normalize_embedding(embedding):
  """Normalize one embedding vector with sklearn's `normalize`.

  Args:
    embedding: A NumPy vector, or None for a missing embedding.

  Returns:
    The normalized vector as a 1-D array, or None when `embedding` is None.
  """
  if embedding is not None:
    # `normalize` works on 2-D input, so wrap the vector as a single row
    # and unwrap the single normalized row afterwards.
    as_row = embedding.reshape(1, -1)
    return normalize(as_row)[0]
  return None

In [None]:
# Normalize each stored embedding (with a progress bar); None entries stay None.
df_cleaned["plot_embedding"] = df_cleaned["plot_embedding"].progress_apply(normalize_embedding)
df_cleaned["description_embedding"] = df_cleaned["description_embedding"].progress_apply(normalize_embedding)

In [None]:
# Read the embeddings table from CSV, replacing the current df_cleaned.
df_cleaned = pd.read_csv('/content/embeddings_longformer.csv')

def parse_numpy_vector(s):
    """Convert the string form of a 1-D vector back into a float NumPy array.

    Accepts the text NumPy/pandas write for an array cell, e.g.
    "[-7.7e-03 6.1e-03 ...]" (whitespace or newline separated), and also
    comma-separated forms such as "[1, 2, 3]". Surrounding whitespace and
    brackets are ignored.

    Args:
        s: The cell value read from the CSV.

    Returns:
        A 1-D float64 array, or None when `s` is not a string (for example
        NaN for a missing cell).

    Raises:
        ValueError: If a token cannot be converted to a float. Failing loudly
            avoids silently working with a truncated vector.
    """
    if not isinstance(s, str):
        return None
    # Treat commas like whitespace so both "[1 2]" and "[1, 2]" parse fully.
    tokens = s.strip().strip("[]").replace(",", " ").split()
    return np.array(tokens, dtype=float)

# Convert the string-encoded embedding columns back into NumPy arrays
# (non-string cells become None), then preview the first rows.
df_cleaned['plot_embedding'] = df_cleaned['plot_embedding'].apply(parse_numpy_vector)
df_cleaned['description_embedding'] = df_cleaned['description_embedding'].apply(parse_numpy_vector)

df_cleaned.head()

Unnamed: 0,book_name,author,year_of_publishing,plot,genre,description,page_number,plot_embedding,description_embedding
0,The 12.30 from Croydon,Freeman Wills Crofts,1934,"Set in Yorkshire and London in 1933, The 12.30...",Mystery,The 12.30 from Croydon (U.S. title: Wilful and...,,"[-0.00774844224, 0.00612177933, 0.0050468822, ...","[-0.00506153377, 0.00736133289, 0.000629157876..."
1,The Final Unfinished Voyage of Jack Aubrey,Patrick O'Brian,2004,The story begins with Surprise in the Strait o...,Historical novel,The Final Unfinished Voyage of Jack Aubrey is ...,"144 first edition, hardback","[-0.00862098299, 0.00364552019, 0.00393931149,...","[-0.00825847313, 0.00424632337, 0.0033255436, ..."
2,30 Days in Sydney,Peter Carey,"July 15, 2010 (2010-07-15)","The book takes the form of an impressionistic,...",,30 Days in Sydney is a book written by Austral...,256,"[-0.00958774984, 0.00574548915, 0.00351350172,...","[-0.00680910842, 0.00729448535, 0.000411202083..."
3,The Thirty-Nine Steps,John Buchan,1915[1],"The story's narrator, Richard Hannay, arrives ...",Adventure novel,The Thirty-Nine Steps is a 1915 adventure nove...,253[1],"[-0.0070822956, 0.00532044983, 0.00147258863, ...","[-0.00632881746, 0.00685635116, 0.0024315773, ..."
4,334 (novel),Thomas M. Disch,1972 (MacGibbon & Kee),The future in 334 has brought few technologica...,"Dystopian, science fiction",334 is a 1972 dystopian science fiction novel ...,201,"[-0.00685231388, 0.00810605474, 0.00211672275,...","[-0.00400138833, 0.0059737158, 0.00270546763, ..."


In [None]:
# Save df_cleaned as a pickle file (relative path, i.e. the working directory).
df_cleaned.to_pickle("embeddings_longformer.pkl")

In [None]:
def get_single_embedding(text):
  """Embed one text with the module-level tokenizer/model.

  Args:
    text: The text to embed.

  Returns:
    A NumPy array of shape (1, hidden_size) taken from position 0 of the
    model's last hidden state, or None when `text` is NaN, None or "".
  """
  nothing_to_embed = pd.isna(text) or text is None or text == ""
  if nothing_to_embed:
    return None

  encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=4096)
  on_device = {}
  for name, tensor in encoded.items():
    on_device[name] = tensor.to(device)

  # Inference only: no gradients are needed.
  with torch.no_grad():
    hidden_states = model(**on_device).last_hidden_state

  first_token_vector = hidden_states[:, 0, :]
  return first_token_vector.cpu().numpy().reshape(1, -1)


In [None]:
# Reload the embeddings table. The CSV stores every embedding as text, so the
# two embedding columns are converted back to NumPy arrays right after
# reading; find_similar_books needs numeric vectors, not strings.
df_cleaned = pd.read_csv("/content/embeddings_longformer.csv")
df_cleaned["plot_embedding"] = df_cleaned["plot_embedding"].apply(parse_numpy_vector)
df_cleaned["description_embedding"] = df_cleaned["description_embedding"].apply(parse_numpy_vector)

In [None]:
# Preview the first rows of the loaded table.
df_cleaned.head()

Unnamed: 0,book_name,author,year_of_publishing,plot,genre,description,page_number,plot_embedding,description_embedding
0,The 12.30 from Croydon,Freeman Wills Crofts,1934,"Set in Yorkshire and London in 1933, The 12.30...",Mystery,The 12.30 from Croydon (U.S. title: Wilful and...,,[-7.74844224e-03 6.12177933e-03 5.04688220e-...,[-5.06153377e-03 7.36133289e-03 6.29157876e-...
1,The Final Unfinished Voyage of Jack Aubrey,Patrick O'Brian,2004,The story begins with Surprise in the Strait o...,Historical novel,The Final Unfinished Voyage of Jack Aubrey is ...,"144 first edition, hardback",[-8.62098299e-03 3.64552019e-03 3.93931149e-...,[-8.25847313e-03 4.24632337e-03 3.32554360e-...
2,30 Days in Sydney,Peter Carey,"July 15, 2010 (2010-07-15)","The book takes the form of an impressionistic,...",,30 Days in Sydney is a book written by Austral...,256,[-9.58774984e-03 5.74548915e-03 3.51350172e-...,[-6.80910842e-03 7.29448535e-03 4.11202083e-...
3,The Thirty-Nine Steps,John Buchan,1915[1],"The story's narrator, Richard Hannay, arrives ...",Adventure novel,The Thirty-Nine Steps is a 1915 adventure nove...,253[1],[-7.08229560e-03 5.32044983e-03 1.47258863e-...,[-6.32881746e-03 6.85635116e-03 2.43157730e-...
4,334 (novel),Thomas M. Disch,1972 (MacGibbon & Kee),The future in 334 has brought few technologica...,"Dystopian, science fiction",334 is a 1972 dystopian science fiction novel ...,201,[-6.85231388e-03 8.10605474e-03 2.11672275e-...,[-4.00138833e-03 5.97371580e-03 2.70546763e-...


# **Averaging strategy**

In [None]:
# The plot and description embeddings of each book are combined by taking
# their mean; the cosine similarity is then computed between that combined
# embedding and the embedding of the text the user entered.

def find_similar_books(user_description, df_cleaned, top_n=3):
  """Recommend the books whose averaged embedding best matches a query.

  Args:
    user_description: Free-text description of the book the user wants.
    df_cleaned: DataFrame with "book_name", "author", "genre",
      "description_embedding" and "plot_embedding" columns. An embedding cell
      holds a numeric vector, or None/NaN when it is missing.
    top_n: Maximum number of recommendations to return.

  Returns:
    A DataFrame with the columns book_name, author, genre and
    similarity_score, ordered from most to least similar. The string
    "Invalid description provided" is returned when the query cannot be
    embedded, and "No valid embeddings found" when no book has an embedding.
  """
  user_embedding = get_single_embedding(user_description)
  if user_embedding is None:
    return "Invalid description provided"

  def _as_vector(emb):
    # None or a float NaN marks a missing embedding.
    if emb is None or (isinstance(emb, float) and np.isnan(emb)):
      return None
    return np.asarray(emb, dtype=float)

  book_vectors = []
  valid_indices = []

  for idx, (desc_emb, plot_emb) in enumerate(zip(df_cleaned["description_embedding"], df_cleaned["plot_embedding"])):
    available = [vec for vec in (_as_vector(desc_emb), _as_vector(plot_emb)) if vec is not None]
    if not available:
      # Neither embedding exists, so this book cannot be scored.
      continue
    # Mean of the available vectors: the single one, or the average of both.
    book_vectors.append(np.mean(available, axis=0))
    valid_indices.append(idx)

  if not book_vectors:
    return "No valid embeddings found"

  # One vectorized call scores the query against every book at once.
  similarities = cosine_similarity(user_embedding.reshape(1, -1), np.vstack(book_vectors))[0]
  # Descending order; slicing after the reversal makes top_n=0 yield no rows.
  top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

  recommendations = df_cleaned.iloc[[valid_indices[i] for i in top_indices]][["book_name", "author", "genre"]].copy()
  recommendations["similarity_score"] = similarities[top_indices]

  return recommendations

In [None]:
# Example query (wizard school / dark lord); prints the returned recommendations.
user_input = "A young wizard discovers his magical heritage and attends a school of magic while facing a dark lord"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                      book_name              author  \
400           A Crown of Swords       Robert Jordan   
1701           Strange Gateways   E. Hoffmann Price   
1488  A Rendezvous in Averoigne  Clark Ashton Smith   

                                 genre  similarity_score  
400                            Fantasy          0.997513  
1701                   Fantasy, horror          0.997489  
1488  Science fiction, Fantasy, Horror          0.997344  


In [None]:
# Example query (historical novel about brave women); prints the returned recommendations.
user_input = "Historical novel about brave women and obstacles they have to overcome"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                      book_name              author  \
1488  A Rendezvous in Averoigne  Clark Ashton Smith   
1057            Lonesome Places      August Derleth   
1701           Strange Gateways   E. Hoffmann Price   

                                 genre  similarity_score  
1488  Science fiction, Fantasy, Horror          0.998425  
1057                   Fantasy, Horror          0.998352  
1701                   Fantasy, horror          0.998324  


In [None]:
# Example query (animals and their friendship with humans); prints the returned recommendations.
user_input = "story about animals and their friendship with humans"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                                book_name             author  \
1701                     Strange Gateways  E. Hoffmann Price   
1741                 Tales from Underwood    David H. Keller   
1732  Synthesis & Other Virtual Realities     Mary Rosenblum   

                                 genre  similarity_score  
1701                   Fantasy, horror          0.998080  
1741  Fantasy, horror, science fiction          0.997983  
1732                   Science fiction          0.997951  


In [None]:
# Example query (sad medieval war-and-love story); prints the returned recommendations.
user_input = "I want to read some sad book about war and love. Medieval ages, epic battles and drama"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                    book_name        author          genre  similarity_score
675               Forest Mage    Robin Hobb  Fantasy novel          0.997211
1881  The Two Sisters (novel)   H. E. Bates            NaN          0.997134
515            Dream Children  A. N. Wilson          Novel          0.997119


In [None]:
# Example query (space travelers and strange planets); prints the returned recommendations.
user_input = "Book about space travelers, discovery of mysterious planets inhabited by strange life forms"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                      book_name             author            genre  \
1155  Memories of the Space Age      J. G. Ballard  Science fiction   
1701           Strange Gateways  E. Hoffmann Price  Fantasy, horror   
47          The Aliens of Earth        Nancy Kress  Science fiction   

      similarity_score  
1155          0.998344  
1701          0.998341  
47            0.998209  


# **Weighted strategy**

In [None]:
# Same as before, but try applying weights to the embeddings, because the
# description is not always about the book itself: sometimes it is about
# awards and the like, which drifts far away from the plot.

desc_weight = 0
plot_weight = 1

def find_similar_books(user_description, df_cleaned, top_n=3, desc_weight=desc_weight, plot_weight=plot_weight):
  """Recommend the books whose weighted embedding best matches a query.

  This definition replaces the averaging-strategy `find_similar_books`
  defined earlier in the notebook. Each book is represented by
  `desc_weight * description_embedding + plot_weight * plot_embedding`;
  a missing embedding, or one whose weight is 0, contributes nothing.

  Args:
    user_description: Free-text description of the book the user wants.
    df_cleaned: DataFrame with "book_name", "author", "genre",
      "description_embedding" and "plot_embedding" columns. An embedding cell
      holds a numeric vector, or None/NaN when it is missing.
    top_n: Maximum number of recommendations to return.
    desc_weight: Weight of the description embedding. Defaults to the
      module-level `desc_weight` as it was when this cell ran.
    plot_weight: Weight of the plot embedding. Defaults to the module-level
      `plot_weight` as it was when this cell ran.

  Returns:
    A DataFrame with the columns book_name, author, genre and
    similarity_score, ordered from most to least similar. The string
    "Invalid description provided" is returned when the query cannot be
    embedded, and "No valid embeddings found" when no book can be scored.
  """
  user_embedding = get_single_embedding(user_description)
  if user_embedding is None:
    return "Invalid description provided"

  def _as_vector(emb):
    # None or a float NaN marks a missing embedding.
    if emb is None or (isinstance(emb, float) and np.isnan(emb)):
      return None
    return np.asarray(emb, dtype=float)

  book_vectors = []
  valid_indices = []

  for idx, (desc_emb, plot_emb) in enumerate(zip(df_cleaned["description_embedding"], df_cleaned["plot_embedding"])):
    weighted_sum = None
    for weight, emb in ((desc_weight, desc_emb), (plot_weight, plot_emb)):
      vec = _as_vector(emb)
      if vec is None or weight == 0:
        continue
      contribution = weight * vec
      weighted_sum = contribution if weighted_sum is None else weighted_sum + contribution
    if weighted_sum is None:
      # No embedding with a non-zero weight: this book cannot be scored.
      continue
    # Cosine similarity ignores vector length, so the sum needs no rescaling.
    book_vectors.append(weighted_sum)
    valid_indices.append(idx)

  if not book_vectors:
    return "No valid embeddings found"

  # One vectorized call scores the query against every book at once.
  similarities = cosine_similarity(user_embedding.reshape(1, -1), np.vstack(book_vectors))[0]
  # Descending order; slicing after the reversal makes top_n=0 yield no rows.
  top_indices = np.argsort(similarities)[::-1][:max(top_n, 0)]

  recommendations = df_cleaned.iloc[[valid_indices[i] for i in top_indices]][["book_name", "author", "genre"]].copy()
  recommendations["similarity_score"] = similarities[top_indices]

  return recommendations

In [None]:
# Example query (wizard school / dark lord); prints the returned recommendations.
user_input = "A young wizard discovers his magical heritage and attends a school of magic while facing a dark lord"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                                         book_name  \
400                              A Crown of Swords   
370  Conan the Adventurer (short story collection)   
515                                 Dream Children   

                                      author              genre  \
400                            Robert Jordan            Fantasy   
370  Robert E. Howard and L. Sprague de Camp  Sword and sorcery   
515                             A. N. Wilson              Novel   

     similarity_score  
400          0.997351  
370          0.997225  
515          0.997191  


In [None]:
# Example query (historical novel about brave women); prints the returned recommendations.
user_input = "Historical novel about brave women and obstacles they have to overcome"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                   book_name  \
400        A Crown of Swords   
578     Evermore (anthology)   
1502  The Rim of the Unknown   

                                                 author  \
400                                       Robert Jordan   
578   edited by James Robert Smith and Stephen Mark ...   
1502                                 Frank Belknap Long   

                                 genre  similarity_score  
400                            Fantasy          0.998053  
578               horror short stories          0.997994  
1502  Fantasy, horror, science fiction          0.997988  


In [None]:
# Example query (animals and their friendship with humans); prints the returned recommendations.
user_input = "story about animals and their friendship with humans"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                                  book_name  \
1155              Memories of the Space Age   
1651  Something About Cats and Other Pieces   
578                    Evermore (anthology)   

                                                 author  \
1155                                      J. G. Ballard   
1651                                    H. P. Lovecraft   
578   edited by James Robert Smith and Stephen Mark ...   

                                 genre  similarity_score  
1155                   Science fiction          0.997952  
1651  Fantasy, horror, science fiction          0.997836  
578               horror short stories          0.997733  


In [None]:
# Example query (sad medieval war-and-love story); prints the returned recommendations.
user_input = "I want to read some sad book about war and love. Medieval ages, epic battles and drama"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                 book_name         author  genre  similarity_score
1219            My Cleaner     Maggie Gee    NaN          0.997030
515         Dream Children   A. N. Wilson  Novel          0.996952
191   The Blessing (novel)  Nancy Mitford    NaN          0.996853


In [None]:
# Example query (space travelers and strange planets); prints the returned recommendations.
user_input = "Book about space travelers, discovery of mysterious planets inhabited by strange life forms"
recommended_books = find_similar_books(user_input, df_cleaned)
print(recommended_books)

                       book_name          author            genre  \
1155   Memories of the Space Age   J. G. Ballard  Science fiction   
280   The Casebook of Solar Pons  August Derleth        Detective   
47           The Aliens of Earth     Nancy Kress  Science fiction   

      similarity_score  
1155          0.998155  
280           0.997987  
47            0.997966  
