In [2]:
!pip install sentence-transformers transformers faiss-cpu pandas datasets langchain langchain-community

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl.metadata (2.9 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0

In [18]:
import os
import re
import time
import unicodedata

import faiss
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


In [4]:
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate

In [5]:
# --- Load the Dataset from Hugging Face ---
# Hub identifier of the book dataset used throughout the notebook.
DATASET_ID = "matoupines/book-dataset"

dataset = load_dataset(DATASET_ID)
# Work with the train split as a Pandas DataFrame from here on.
train_split = dataset['train']
train_data = train_split.to_pandas()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


book-data.csv:   0%|          | 0.00/572k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/497 [00:00<?, ? examples/s]

In [6]:
# --- Validate the loaded dataset ---
# `dataset` / `train_data` are created by the load cell directly above; loading the
# same dataset a second time here was redundant. Instead, fail fast if the columns
# the cleaning and document-building cells rely on are not present.
required_columns = {"title", "authors", "description"}
missing_columns = required_columns - set(train_data.columns)
if missing_columns:
    raise KeyError(f"Dataset is missing expected columns: {sorted(missing_columns)}")

train_data.shape

In [7]:
# --- Data Cleaning and Formatting ---
MAX_DESC_LENGTH = 250  # Default maximum number of characters kept before truncation


def clean_text(text, max_length=None):
    """Normalise a text field to compact ASCII and truncate it if it is too long.

    Args:
        text: Raw cell value; anything that is not missing is converted with ``str``.
        max_length: Maximum number of characters to keep before appending ``"..."``.
            Defaults to ``MAX_DESC_LENGTH`` when ``None``.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing (``None``/NaN).
    """
    if pd.isna(text):
        return ""
    limit = MAX_DESC_LENGTH if max_length is None else max_length

    # Decompose accented letters (and compatibility characters such as the
    # non-breaking space) first, so that dropping non-ASCII code points keeps the
    # base letter ("café" -> "cafe") instead of deleting it or gluing words together.
    text = unicodedata.normalize("NFKD", str(text))
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove remaining non-ASCII characters
    # Collapse whitespace and strip *after* the removal above: deleting a leading or
    # trailing non-ASCII character can expose whitespace that an earlier strip misses.
    text = re.sub(r'\s+', ' ', text).strip()
    if len(text) > limit:
        text = text[:limit] + "..."  # Truncate long values and mark the cut
    return text

def format_authors(authors):
    """Return the authors field as a string, double-quoted when it lists several names.

    A missing value (``None``/NaN) becomes an empty string. A value containing a
    comma is treated as a multi-author list and wrapped in double quotes; any other
    value is returned unchanged (after conversion to ``str``).
    """
    if pd.isna(authors):
        return ""
    names = str(authors)
    lists_several_authors = ',' in names
    return f'"{names}"' if lists_several_authors else names

# Apply cleaning functions to relevant fields.
# The cleaned frame is rebuilt from the raw train split instead of being mutated in
# place, so re-running this cell is idempotent. With in-place updates a second run
# would pass already-quoted multi-author strings through format_authors again and
# wrap them in another pair of quotes.
train_data = dataset['train'].to_pandas().assign(
    title=lambda frame: frame['title'].apply(clean_text),
    description=lambda frame: frame['description'].apply(clean_text),
    authors=lambda frame: frame['authors'].apply(format_authors),
)

In [8]:
# --- Prepare data for LangChain ---
# Build the text for each book from three labelled segments.
title_segment = "Title: " + train_data["title"]
authors_segment = " Authors: " + train_data["authors"]
description_segment = " Description: " + train_data["description"]
train_data["page_content"] = title_segment + authors_segment + description_segment

# Store a small dict with the identifying fields next to each row's text.
# NOTE(review): the loader cell only names `page_content` as the content column;
# confirm how DataFrameLoader treats this extra `metadata` column.
train_data["metadata"] = [
    {"title": title, "authors": authors}
    for title, authors in zip(train_data["title"], train_data["authors"])
]

In [9]:
# --- LangChain Components ---

# 1. Document Loader: turn each DataFrame row into a LangChain document whose text
#    comes from the `page_content` column.
loader = DataFrameLoader(train_data, page_content_column="page_content")
documents = loader.load()

# 2. Text Splitter: split documents into chunks using the settings below.
splitter_settings = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
texts = text_splitter.split_documents(documents)

# 3. Embeddings
# NOTE(review): the model name suggests it was trained for dot-product similarity;
# confirm that the FAISS index's default distance metric is appropriate for it.
embedding_model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

# 4. Vector Store: embed every chunk and index it with FAISS.
db = FAISS.from_documents(texts, embedding_model)

# 5. Retriever: return the top 3 documents for each query.
retrieval_settings = {"k": 3}
retriever = db.as_retriever(search_kwargs=retrieval_settings)

  embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
# 6. LLM
# Tokenizer and model are loaded from the same checkpoint.
QWEN_CHECKPOINT = "Qwen/Qwen2.5-1.5B"
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_CHECKPOINT)
qwen_model = AutoModelForCausalLM.from_pretrained(QWEN_CHECKPOINT)

# Generation settings forwarded to the text-generation pipeline.
generation_settings = {
    "max_new_tokens": 150,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "repetition_penalty": 1.2,
    "no_repeat_ngram_size": 3,
}

pipe = pipeline(
    "text-generation",
    model=qwen_model,
    tokenizer=qwen_tokenizer,
    **generation_settings,
)

# Wrap the transformers pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=pipe)

tokenizer_config.json:   0%|          | 0.00/7.23k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Device set to use cpu


In [11]:
# Create a prompt template.
# The template has two placeholders: {context} and {question}. It ends with the
# "Detailed Answer:" cue, which the query cells use to locate the model's answer.
prompt_template = (
    "You are a helpful assistant that recommends books based on user queries.\n"
    "Use the following context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Detailed Answer:"
)

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [12]:
# 7. Chain
# Retrieval-augmented QA chain wiring together the retriever, the LLM and the
# custom prompt; source documents are returned alongside the answer.
prompt_kwargs = {"prompt": PROMPT}
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=prompt_kwargs,
    return_source_documents=True,
)

In [20]:
# --- Main Execution ---
# Run one recommendation query through the QA chain, print the answer, the source
# documents the retriever supplied, and the elapsed time.
try:
    query = "Recommend a thriller where the main character has to confront a killer from their past."
    print(f"Query: {query}")

    # perf_counter is a monotonic clock intended for measuring elapsed intervals.
    start_time = time.perf_counter()
    # Use invoke(); calling the chain object directly is deprecated in LangChain.
    result = qa_chain.invoke({"query": query})
    end_time = time.perf_counter()

    # Keep everything after the first "Detailed Answer:" cue. Splitting once and
    # taking the last piece avoids an IndexError when the cue is absent (the whole
    # text is printed) and avoids cutting the answer short if the model repeats it.
    generated_text = result['result']
    print(generated_text.split('Detailed Answer:', 1)[-1])

    # Optionally, print source documents
    print("\nSource Documents:")
    for doc in result["source_documents"]:
        print(f"  Title: {doc.metadata['title']}, Authors: {doc.metadata['authors']}")

    response_time = end_time - start_time  # Calculate response time
    print(f"Model Response Time: {response_time:.2f} seconds")

except Exception as e:
    print(f"Error: {e}")

Query: Recommend a thriller where the main character has to confront a killer from their past.
 

Based on the provided information about "Little Brother" by Cory Doctorowl, it appears that this book doesn't contain any explicit content related to violence or death scenes specifically for thrillers with heavy themes of redemption and justice. However, if we were looking for something more intense than typical "thrillers," I would recommend "Crossfire" written by James Patterson in 1987, which features a highly dangerous and unpredictable protagonist who must navigate through both personal conflicts and supernatural threats while trying to find closure after being wrongly accused of murder against her ex-husband.

However, given your request for a thrilling story involving a major character confronting a previous crime scene (which might be too specific), here’s another option:

**Thriller Title:**

Source Documents:
  Title: The Walking Dead: The heart's desire (#19-24), Authors: Rober

In [21]:
# --- Main Execution ---
# Run a second recommendation query through the QA chain and print the answer and
# the elapsed time.
try:
    query = "I want a book with a circus setting, preferably with romance."
    print(f"Query: {query}")

    # perf_counter is a monotonic clock intended for measuring elapsed intervals.
    start_time = time.perf_counter()
    # Use invoke(); calling the chain object directly is deprecated in LangChain.
    result = qa_chain.invoke({"query": query})
    end_time = time.perf_counter()

    # Keep everything after the first "Detailed Answer:" cue. Splitting once and
    # taking the last piece avoids an IndexError when the cue is absent (the whole
    # text is printed) and avoids cutting the answer short if the model repeats it.
    generated_text = result['result']
    print(generated_text.split('Detailed Answer:', 1)[-1])

    response_time = end_time - start_time  # Calculate response time
    print(f"Model Response Time: {response_time:.2f} seconds")

except Exception as e:
    print(f"Error: {e}")

Query: I want a book with a circus setting, preferably with romance.
 The title "Drama" fits well within your preferences. It sounds like a perfect fit considering your desire for a romantic drama set against a circus backdrop where characters navigate their personal dramas through different relationships throughout a professional experience. This genre allows readers to immerse themselves into multiple perspectives and emotions, making it ideal for those looking for compelling plotlines involving love and passion amidst chaos and transformation. Additionally, featuring a cast of talented performers ensures that each character feels distinct and integral to the overall narrative arc, enhancing both entertainment value and thematic depth.
Model Response Time: 32.98 seconds
