In [1]:
# pip install sentence-transformers transformers faiss-cpu pandas datasets

Collecting sentence-transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu, sentence-transformers
Successfully installed faiss-cpu-1.9.0.post1 sentence-transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import faiss
from datasets import load_dataset
import torch

In [3]:
def clear_gpu_memory():
    """Ask PyTorch to release its cached CUDA memory; does nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# Intended to be called after each generation step; invoked once here as well.
clear_gpu_memory()

In [4]:
# --- Load the Dataset from Hugging Face ---
# `dataset` is kept as-is because later cells look rows up through dataset['train'].
dataset = load_dataset("matoupines/book-dataset")
# `train_data` is the pandas view of the 'train' split that the cleaning cells modify.
train_data = dataset['train'].to_pandas()  # Convert dataset to a Pandas DataFrame

book-data.csv:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/497 [00:00<?, ? examples/s]

In [5]:
# --- Data Cleaning and Formatting ---
def clean_text(text):
    """Normalize a free-text field to a trimmed, single-spaced ASCII string.

    Missing values (anything ``pd.isna`` reports as NA) become ``""``. Any other
    value is converted with ``str()``, every run of non-ASCII characters is
    removed, every run of whitespace is collapsed to one space, and leading and
    trailing whitespace is stripped.

    Args:
        text: Scalar cell value (string, number, ``None`` or NaN).

    Returns:
        str: The cleaned text, possibly empty.
    """
    if pd.isna(text):  # Handle missing values
        return ""
    text = str(text)  # Ensure text is a string
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs to one space
    # Strip last: deleting a non-ASCII character at either end (e.g. a leading
    # dash or curly quote) can expose whitespace that an earlier strip would miss.
    return text.strip()

def format_authors(authors):
    """Format the authors field, quoting it when it lists several authors.

    A value containing a comma is treated as a multi-author list and wrapped in
    double quotes. A value that is already wrapped in double quotes is returned
    unchanged, so applying this function twice (for example by re-running the
    cleaning cell) does not stack quotes.

    Args:
        authors: Scalar cell value (string, ``None`` or NaN).

    Returns:
        str: ``""`` for missing values, the quoted string for multi-author
        values, otherwise the value unchanged (as ``str``).
    """
    if pd.isna(authors):
        return ""
    authors = str(authors)
    already_quoted = (
        len(authors) >= 2 and authors.startswith('"') and authors.endswith('"')
    )
    # A comma signals multiple authors; only add quotes if none are there yet.
    if ',' in authors and not already_quoted:
        return f'"{authors}"'
    return authors

In [6]:
# Apply cleaning functions to relevant fields
# NOTE: each assignment overwrites the column in `train_data`, so the raw values
# are no longer available from this DataFrame once the cell has run.
train_data['title'] = train_data['title'].apply(clean_text)
train_data['description'] = train_data['description'].apply(clean_text)
# 'authors' only goes through format_authors; it is not passed through clean_text.
train_data['authors'] = train_data['authors'].apply(format_authors)

In [7]:
# --- Combine Fields for Embedding ---
# One space-separated string per book: "<title> <authors> <description>".
train_data["text"] = train_data["title"] + " " + train_data["authors"] + " " + train_data["description"]
# Plain list in DataFrame row order; this is what the embedding model encodes.
documents = train_data["text"].tolist()

In [8]:
# Interactive Hugging Face Hub login; renders a login widget in the notebook output.
# NOTE(review): presumably only required for gated/private repositories — confirm
# whether the dataset and checkpoints used in this notebook need authentication.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
# --- Initialize Embedding Model ---
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate Embeddings for the Dataset
# `documents` has one entry per row of `train_data`, so row i of the embedding
# matrix belongs to row i of the DataFrame; retrieval relies on that positional match.
print("Generating embeddings...")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# Convert embeddings to float32 for FAISS
embeddings = np.array(embeddings).astype("float32")

# --- Initialize FAISS ---
dimension = embeddings.shape[1]  # Embedding width, read from the encoded matrix
# IndexFlatIP scores by inner product (higher = more similar). The stored vectors
# are scaled to unit length on the next line, so a score equals cosine similarity
# only when the query vector is unit length as well.
index = faiss.IndexFlatIP(dimension)  # Inner-product index
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)  # L2-normalize each row
index.add(embeddings)  # Add normalized embeddings to FAISS index

# Save FAISS index (optional); written to the current working directory
faiss.write_index(index, "books_index.faiss")
print("FAISS index saved.")

# Load Qwen-2.5 tokenizer and model
# No dtype or device arguments are given, so the library defaults apply.
# NOTE(review): "Qwen/Qwen2.5-3B" looks like the base checkpoint rather than an
# instruction-tuned one, while the prompts below are written as instructions — confirm.
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B")
qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating embeddings...


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

FAISS index saved.


tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/683 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [10]:
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): nothing in the cells visible here uses nltk — confirm it is still needed.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
def truncate_context(query, context, max_input_tokens):
    """Build a "Query / Context / Answer" prompt whose context fits a token budget.

    ``context`` is split on newlines and the pieces are added in order (joined by
    single spaces) for as long as the complete prompt, measured with
    ``qwen_tokenizer``, stays within ``max_input_tokens``. Processing stops at the
    first piece that does not fit; later pieces are not considered.

    The length check tokenizes exactly the text that would be returned and does
    so without truncation, so a prompt that uses the budget exactly is accepted
    and an over-long one cannot be mistaken for a fitting one.

    Args:
        query: The user question placed on the "Query:" line.
        context: Newline-separated context text.
        max_input_tokens: Maximum number of tokens the returned prompt may use.

    Returns:
        str: ``"Query: <query>\\nContext: <kept pieces>\\nAnswer:"``, or
        ``"Query: <query>\\nContext:\\nAnswer:"`` when no piece fits.
    """
    header = f"Query: {query}\nContext:"
    footer = "\nAnswer:"
    kept_chunks = []

    for chunk in context.split('\n'):
        # Measure the prompt exactly as it would be returned with this chunk added.
        candidate = f"{header} {' '.join(kept_chunks + [chunk])}{footer}"
        token_count = len(qwen_tokenizer(candidate)["input_ids"])
        if token_count > max_input_tokens:
            break
        kept_chunks.append(chunk)

    if not kept_chunks:
        return header + footer
    return f"{header} {' '.join(kept_chunks)}{footer}"

def generate_response(query, context, max_new_tokens=150):
    """Answer ``query`` from ``context`` with the Qwen model and return only the answer.

    The prompt instructs the model to stay within the given context. Decoding is
    greedy (``do_sample=False``) with a repetition penalty and a 4-gram repeat
    block. Only the newly generated tokens are decoded, so the returned string
    does not repeat the prompt.

    Args:
        query: The user question.
        context: Text the answer must be based on.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The generated continuation, stripped of surrounding whitespace.
    """
    max_input_tokens = min(2048, qwen_tokenizer.model_max_length - max_new_tokens)
    
    prompt = f"""You are a helpful assistant that provides factually accurate and relevant responses.
    Your answers should be based strictly on the given context, and you should aim for clarity and precision.
    Do not invent information or provide guesses. If the context doesn't provide an answer, simply say you don't know.
    Ensure that your answer directly addresses the user's query.
    
    Context:
    {context}
    
    Question: {query}
    Answer:
    """
    
    # NOTE(review): truncation is applied to the whole prompt. Assuming the
    # tokenizer truncates on the right (the usual default), an over-long context
    # would cut off the trailing "Question/Answer" lines — keep `context` short.
    inputs = qwen_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens)
    # Keep the tensors on the same device as the model weights.
    inputs = inputs.to(qwen_model.device)
    prompt_length = inputs["input_ids"].shape[1]
    
    outputs = qwen_model.generate(
        **inputs,  # input_ids together with the matching attention_mask
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=qwen_tokenizer.eos_token_id,
        # Greedy decoding, stated explicitly. The previous temperature/top_p
        # arguments only apply when do_sample=True, and length_penalty /
        # early_stopping only apply to beam search (num_beams > 1); neither mode
        # was enabled by this call, so those arguments were dropped.
        do_sample=False,
        repetition_penalty=2.0,  # Strong penalty on tokens that were already produced
        no_repeat_ngram_size=4,  # Never repeat any 4-gram
    )
    
    # generate() returns prompt + continuation; decode only the continuation.
    new_tokens = outputs[0][prompt_length:]
    response = qwen_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    
    return response

def get_relevant_docs(query, k=5, min_score=None):
    """Retrieve the ``k`` nearest books for ``query`` from the FAISS index.

    The query is embedded with ``embedding_model`` and scaled to unit length so
    that the inner-product scores returned by ``index.search`` are comparable
    with the unit-length vectors stored in the index. Scores are similarities:
    higher means more relevant.

    Args:
        query: Free-text search string.
        k: Number of neighbours requested from the index.
        min_score: Optional lower bound on the similarity score. ``None``
            (default) keeps every hit. The earlier ``score < 1.2`` test kept
            every hit too, because inner products of unit vectors never
            exceed 1.

    Returns:
        tuple: ``(rows, scores)`` — the matching rows of
        ``pd.DataFrame(dataset['train'])`` in ranking order and the NumPy array
        of their scores.
    """
    query_embedding = np.asarray(embedding_model.encode([query]), dtype="float32")
    norms = np.linalg.norm(query_embedding, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # leave an all-zero vector unchanged instead of dividing by zero
    query_embedding = (query_embedding / norms).astype("float32")

    scores, ids = index.search(query_embedding, k)
    scores, ids = scores[0], ids[0]

    # FAISS pads the result with id -1 when fewer than k vectors are available;
    # passing -1 to iloc would silently select the last row instead.
    keep = ids >= 0
    if min_score is not None:
        keep &= scores >= min_score

    df = pd.DataFrame(dataset['train'])
    return df.iloc[ids[keep]], scores[keep]
    

In [15]:
def summarize_context(relevant_docs, max_tokens=500):
    """Pack "Title/Description" lines for the given rows into a token budget.

    Rows are taken in order; each becomes one newline-terminated line. Lines are
    appended while the running ``qwen_tokenizer`` token total stays within
    ``max_tokens``; the first line that would exceed the budget ends the loop.

    Returns:
        str: The concatenated lines (empty if not even the first line fits).
    """
    kept_lines = []
    used_tokens = 0

    for _, doc in relevant_docs.iterrows():
        line = f"Title: {doc['title']}, Description: {doc['description']}\n"
        line_tokens = len(qwen_tokenizer(line)["input_ids"])

        # Stop at the first line that no longer fits the remaining budget.
        if used_tokens + line_tokens > max_tokens:
            break

        kept_lines.append(line)
        used_tokens += line_tokens

    return "".join(kept_lines)

def generate_optimized_response(query, relevant_docs, max_new_tokens=150, max_tokens=500):
    """Answer ``query`` from the retrieved rows, keeping the context within a token budget.

    The rows are packed into a context string by ``summarize_context`` (at most
    ``max_tokens`` tokens), the size of that context is printed, and the answer
    is produced by ``generate_response``.

    An earlier version also built a prompt hard-coded to one specific book and
    printed that prompt's token count, although the prompt was never sent to the
    model. That unused prompt has been removed; the printed number now describes
    the context that is actually passed on.

    Args:
        query: The user question.
        relevant_docs: DataFrame of retrieved rows with 'title' and 'description'.
        max_new_tokens: Upper bound on generated tokens, forwarded to generate_response.
        max_tokens: Token budget for the packed context.

    Returns:
        Whatever ``generate_response`` returns for the query and packed context.
    """
    # Pack as many retrieved rows as fit into the context budget.
    context = summarize_context(relevant_docs, max_tokens)

    # Report the size of the context that is really handed to the model.
    context_tokens = len(qwen_tokenizer(context)["input_ids"])
    print(f"Tokenized context length: {context_tokens} tokens")

    return generate_response(query, context, max_new_tokens)

In [18]:
# Example: retrieve the five nearest books for a free-text request and list them.
query = "Find some books that similar to the book the tipping point of malcolm gladwell"
print(query)

relevant_docs, scores = get_relevant_docs(query, 5)
# One "Title: ... by ..." line per retrieved row, in ranking order.
recommended_books = "\n".join(
    f"Title: {row['title']} by {row['authors']}"
    for _, row in relevant_docs.iterrows()
)
print(recommended_books)

Find some books that similar to the book the tipping point of malcolm gladwell


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Title: What the Dog Saw by Malcolm Gladwell
Title: The Secret by Rhonda Byrne
Title: Outliers by Malcolm Gladwell
Title: Dear Martin by Nic Stone
Title: The Pocket Dangerous Book for Boys: Things to Do by "Conn Iggulden, Hal Iggulden"
