In [None]:
import os

from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_ollama.llms import OllamaLLM

# 1. Create DB

In [None]:
# Locations of the input text and of the on-disk Chroma store.
# Both are relative paths that climb two directories up from the current working directory.
file_path = os.path.join(os.pardir, os.pardir, "data", "books", "odyssey.txt")
persistent_dir = os.path.join(os.pardir, os.pardir, "db", "chroma_db")

In [None]:
# Embedding model; the resulting `embeddings` object is what gets handed to Chroma
# both when the store is built and when it is reopened for querying.
model_embeddings = "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=model_embeddings,
)

In [None]:
# Build the Chroma vector store only when its directory is not on disk yet;
# otherwise leave the existing store untouched.
if os.path.exists(persistent_dir):
    print("Vector store already exists. No need to initialize.")

else:
    print("Persistent directory does not exist. Initializing vector store...")

    # Fail early, with a readable message, when the source text is missing
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"The file {file_path} does not exist. Please check the path."
        )

    # Load the whole text file as LangChain documents
    loader = TextLoader(file_path, encoding="utf8")
    documents = loader.load()

    # Chunking strategies
    # -------------------
    # Only option 1 is active. Options 2-4 are kept as commented-out alternatives;
    # their splitter classes are not imported in this notebook's import cell, so
    # add the matching import before enabling one of them.

    # 1. Character-based splitting (active)
    # Cuts the text on newlines and targets chunks of about 1000 characters,
    # with a 100-character overlap between neighbouring chunks.
    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=1000,
        chunk_overlap=100,
    )
    docs = text_splitter.split_documents(documents)

    # 2. Sentence-transformers token splitting
    # NOTE(review): despite the class name, this splitter is understood to size chunks
    # by the tokens of a sentence-transformers model rather than by sentence
    # boundaries - confirm against the LangChain docs before relying on it.
    # text_splitter = SentenceTransformersTokenTextSplitter(chunk_size=1000)
    # docs = text_splitter.split_documents(documents)

    # 3. Token-based splitting
    # Sizes chunks in tokens (words or subwords) instead of characters.
    # Useful for transformer models with strict token limits.
    # text_splitter = TokenTextSplitter(chunk_overlap=0, chunk_size=512)
    # docs = text_splitter.split_documents(documents)

    # 4. Recursive character-based splitting
    # Tries natural boundaries (paragraphs, sentences) first while staying within
    # the character limit, trading off coherence against chunk size.
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    # docs = text_splitter.split_documents(documents)

    # Report how many chunks the splitter produced
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")

    # Embed the chunks and write the store to `persistent_dir`
    print("\n--- Creating vector store ---")
    db = Chroma.from_documents(
        docs,
        embeddings,
        persist_directory=persistent_dir,
    )
    print("\n--- Finished creating vector store ---")

# 2. Retrieve documents

In [None]:
# Open the persisted vector store, passing the same `embeddings` object
# that this notebook uses when it builds the store.
db = Chroma(persist_directory=persistent_dir, embedding_function=embeddings)

# Question to look up in the book
query = "Who is Odysseus' wife?"

# Retrieval strategies
# --------------------
# Exactly one (search_type, search_kwargs) pair should be active at a time.

# 1. Similarity search
# Returns the top k chunks ranked by vector similarity to the query.
# search_type = "similarity"
# search_kwargs = {"k": 3}

# 2. Max Marginal Relevance (MMR)
# Trades relevance to the query against diversity among the returned chunks.
# 'fetch_k' is the number of candidates fetched by similarity before re-ranking.
# 'lambda_mult' controls diversity: 1 for minimum diversity, 0 for maximum.
# Use this to avoid near-duplicate chunks while staying on topic.
# search_type = "mmr"
# search_kwargs = {"k": 3, "fetch_k": 20, "lambda_mult": 0.5}

# 3. Similarity score threshold (active)
# Returns at most k chunks and drops any whose relevance score is below
# 'score_threshold', so the result list may be shorter than k or even empty.
search_type = "similarity_score_threshold"
search_kwargs = dict(k=3, score_threshold=0.3)

# Run the retrieval for the query
retriever = db.as_retriever(search_type=search_type, search_kwargs=search_kwargs)
relevant_docs = retriever.invoke(query)

# Print each hit, followed by its source when metadata is present
print("\n--- Relevant Documents ---")
for position, hit in enumerate(relevant_docs, start=1):
    print(f"Document {position}:\n{hit.page_content}\n")
    if not hit.metadata:
        continue
    origin = hit.metadata.get("source", "Unknown")
    print(f"Source: {origin}\n")

# 3. One-off question

In [None]:
# Model that generates the final answer, accessed through Ollama
model_llm = "deepseek-r1:1.5b"
llm = OllamaLLM(model=model_llm)

In [None]:
# This cell creates its own store handle and retriever instead of reusing
# the ones created in the retrieval section.
db = Chroma(persist_directory=persistent_dir, embedding_function=embeddings)

# Question to answer
query = "Who is Odysseus' wife?"

# Fetch up to 3 chunks whose relevance score reaches the 0.3 threshold
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.3},
)
relevant_docs = retriever.invoke(query)

# Assemble the prompt: question, retrieved passages, then the answering rules
retrieved_text = "\n\n".join(doc.page_content for doc in relevant_docs)
combined_input = (
    f"Here are some documents that might help answer the question: {query}"
    f"\n\nRelevant Documents:\n{retrieved_text}"
    "\n\nPlease provide an answer based only on the provided documents. "
    "If the answer is not found in the documents, respond with 'I'm not sure'."
)

# System + human messages passed to the model
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content=combined_input),
]

# Ask the model
result = llm.invoke(messages)

# Show the answer under a small header
print("\n--- Generated Response ---", "Answer:", result, sep="\n")