In [None]:
import os
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import faiss
from datasets import load_dataset
import nltk
import torch
nltk.download('punkt')

In [1]:
# --- Load the Dataset from Hugging Face ---
# `dataset` is kept as a module-level global: get_relevant_docs() reads
# dataset["train"] again when it looks up the rows for search hits.
dataset = load_dataset("matoupines/book-dataset")
# `train_data` is the working copy of the "train" split; its "title",
# "description" and "authors" columns are cleaned in place further down.
train_data = dataset["train"].to_pandas()  # Convert dataset to a Pandas DataFrame

# --- Data Cleaning and Formatting ---

MAX_DESC_LENGTH = 200  # Maximum number of characters kept by clean_text()


def clean_text(text):
    """Normalize a text field to single-spaced ASCII and truncate it.

    Args:
        text: Any value; missing values (as reported by ``pd.isna``) are
            accepted and everything else is converted with ``str()``.

    Returns:
        ``""`` for missing values. Otherwise the text with every whitespace
        run collapsed to one space, non-ASCII characters removed and the ends
        stripped. Results longer than ``MAX_DESC_LENGTH`` characters are cut
        to that length and suffixed with ``"..."``.
    """
    if pd.isna(text):
        return ""
    # Normalize whitespace first: ``\s`` is Unicode-aware for str patterns, so
    # non-ASCII spaces (e.g. NBSP) become a plain space instead of being
    # deleted below and gluing two words together.
    text = re.sub(r"\s+", " ", str(text))
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
    # Removing characters can leave doubled or leading/trailing spaces behind,
    # so collapse and strip only after the removal.
    text = re.sub(r" {2,}", " ", text).strip()
    if len(text) > MAX_DESC_LENGTH:
        text = text[:MAX_DESC_LENGTH] + "..."  # Truncate long descriptions
    return text


def format_authors(authors):
    """Return the authors value as text, double-quoted when it holds a comma.

    Missing values (per ``pd.isna``) yield an empty string. Any other value
    is converted with ``str()``; a comma is taken to mean several authors, in
    which case the whole string is wrapped in double quotes.
    """
    if pd.isna(authors):
        return ""
    author_text = str(authors)
    # No comma -> single author, returned unchanged.
    return f'"{author_text}"' if "," in author_text else author_text


# Run each text column through its cleaner, overwriting the column in place.
for _column, _cleaner in (
    ("title", clean_text),
    ("description", clean_text),
    ("authors", format_authors),
):
    train_data[_column] = train_data[_column].apply(_cleaner)

# --- Combine Fields for Embedding ---
# One space-separated string per book: "<title> <authors> <description>".
train_data["text"] = train_data["title"].str.cat(
    [train_data["authors"], train_data["description"]], sep=" "
)
documents = list(train_data["text"])

# --- Initialize Embedding Model ---
# `embedding_model` stays a module-level global: get_relevant_docs() uses it
# to embed queries with the same model as the corpus.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Generate Embeddings for the Dataset
# Encoding the whole corpus is the most expensive step here, so it is done
# exactly once.
print("Generating embeddings...")
embeddings = embedding_model.encode(documents, show_progress_bar=True)

# Convert embeddings to float32 for FAISS
embeddings = np.asarray(embeddings, dtype="float32")

# --- Initialize FAISS ---
# IndexFlatIP ranks by inner product; on unit-length vectors that is the
# cosine similarity, hence the row-wise L2 normalization before adding.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0  # keep all-zero rows as zeros instead of NaN
embeddings = embeddings / norms
index.add(embeddings)  # Add normalized embeddings to FAISS index

# Save FAISS index (optional)
faiss.write_index(index, "books_index.faiss")
print("FAISS index saved.")

# Load Qwen-2.5 tokenizer and model
# Both objects are module-level globals read by truncate_context() and
# generate_response() below.
# NOTE(review): the model id has no "-Instruct" suffix, so this is presumably
# the base checkpoint; confirm that is intended for the instruction-style
# prompt built in generate_response().
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")


def truncate_context(query, context, max_input_tokens, tokenizer=None):
    """Build a query/context prompt that fits within a token budget.

    The context is split on newlines and chunks are added in order, joined by
    single spaces, for as long as the complete prompt stays within
    ``max_input_tokens``. The first chunk that does not fit stops the process.

    Args:
        query: The user question placed on the ``Query:`` line.
        context: Newline-separated context text.
        max_input_tokens: Maximum number of tokens the returned prompt may
            occupy.
        tokenizer: Callable returning a mapping with an ``"input_ids"``
            sequence for a string. Defaults to the module-level
            ``qwen_tokenizer``.

    Returns:
        ``"Query: <query>\\nContext: <chunks>\\nAnswer:"``. When not even the
        first chunk fits, the context part is empty
        (``"Query: <query>\\nContext:\\nAnswer:"``).
    """
    if tokenizer is None:
        tokenizer = qwen_tokenizer

    prefix = f"Query: {query}\nContext:"
    suffix = "\nAnswer:"
    kept_chunks = []

    for chunk in context.split("\n"):
        # Measure exactly the text that would be returned. The tokenizer is
        # called without truncation so an overflow is visible as a length
        # above the budget rather than being silently capped to it.
        candidate = f"{prefix} {' '.join(kept_chunks + [chunk])}{suffix}"
        if len(tokenizer(candidate)["input_ids"]) > max_input_tokens:
            break
        kept_chunks.append(chunk)

    if not kept_chunks:
        return prefix + suffix
    return f"{prefix} {' '.join(kept_chunks)}{suffix}"


def generate_response(query, context, max_new_tokens=150):
    """Generate an answer to ``query`` from ``context`` with the Qwen model.

    Uses the module-level ``qwen_tokenizer`` and ``qwen_model``.

    Args:
        query: The user question inserted into the prompt.
        context: Retrieved text (book descriptions) inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded generated continuation only; the prompt is not echoed
        back. Sampling is enabled, so repeated calls can return different
        text.

    Note:
        The prompt is truncated on tokenization to
        ``min(2048, model_max_length - max_new_tokens)`` tokens, so a very
        long ``context`` can push the end of the prompt out of the input.
    """
    max_input_tokens = min(2048, qwen_tokenizer.model_max_length - max_new_tokens)

    # Create a more structured prompt. It is assembled from explicit pieces so
    # the whitespace sent to the model does not depend on editor handling of
    # trailing spaces inside a triple-quoted string.
    prompt = (
        "You are an expert in book summaries. Based on the following context, "
        "answer the query by extracting relevant details from the provided "
        "book descriptions. Focus on specific information about the plot, "
        "characters, or themes mentioned in the context. \n"
        "    \n"
        f"    Query: {query}\n"
        f"    Context: {context}\n"
        "    \n"
        "    Detailed Answer:"
    )

    inputs = qwen_tokenizer(
        prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens
    ).to(qwen_model.device)

    outputs = qwen_model.generate(
        inputs["input_ids"],
        # pad_token_id equals eos_token_id below, so the mask cannot be
        # inferred reliably and is passed explicitly.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=qwen_tokenizer.eos_token_id,
        # temperature / top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
    )

    # generate() returns prompt tokens followed by new tokens; decode only the
    # newly generated part.
    prompt_length = inputs["input_ids"].shape[1]
    return qwen_tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )


def get_relevant_docs(query, k=1, min_score=0.0):
    """Retrieve the books most similar to ``query`` from the FAISS index.

    Uses the module-level ``embedding_model``, ``index`` and ``dataset``.

    Args:
        query: Free-text search query.
        k: Maximum number of hits to request from the index.
        min_score: Minimum similarity a hit must reach to be returned. The
            index is an inner-product index over unit-length vectors, so
            scores are cosine similarities and higher means more similar.
            Raise this value to keep only closer matches.

    Returns:
        A ``(results, scores)`` tuple: ``results`` is a DataFrame of the
        matching rows of ``dataset["train"]`` (indexed by their row position
        in the split, best match first) and ``scores`` is the NumPy array of
        their similarity scores in the same order. Both are empty when no hit
        reaches ``min_score``.
    """
    query_embedding = np.asarray(embedding_model.encode([query]), dtype="float32")
    # Normalize the query like the indexed vectors so the inner product is a
    # cosine similarity regardless of how the model scales its output.
    norms = np.linalg.norm(query_embedding, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    query_embedding = query_embedding / norms

    scores, indices = index.search(query_embedding, k)
    scores, indices = scores[0], indices[0]

    # FAISS pads with id -1 when fewer than k vectors are available; drop
    # those together with hits below the similarity threshold.
    keep = (indices >= 0) & (scores >= min_score)
    scores, indices = scores[keep], indices[keep]

    # Materialize only the selected rows instead of converting the whole
    # split to a DataFrame on every query.
    results = dataset["train"].select(indices.tolist()).to_pandas()
    results.index = indices
    return results, scores


# Demo query: retrieve the best-matching books and print their titles.
try:
    query = "Find me a book about a widow coping with loss and finding new meaning in life.?"
    print(f"Query: {query}")

    # Retrieve relevant documents
    relevant_docs, scores = get_relevant_docs(query)

    if relevant_docs.empty:
        print("No relevant books found.")

    # Iterating a DataFrame directly yields its column labels, not its rows,
    # so rows are walked with iterrows() and paired with their scores.
    for score, (_, row) in zip(scores, relevant_docs.iterrows()):
        print(f"Title: {row.get('title', 'No title')}")
        print(f"Score: {score:.3f}\n")

# Top-level boundary of the demo cell: report the failure instead of
# aborting with a traceback.
except Exception as e:
    print(f"Error: {e}")

NameError: name 'load_dataset' is not defined