In [None]:
import hashlib
import os
import glob
import getpass
from typing import List, Tuple
from dotenv import load_dotenv
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from rank_bm25 import BM25Okapi

In [None]:
# Input folder holding the markdown sources (adjust to your layout)
folder_path = "docs/data"
# Output folder for the persisted vector stores; created up front if absent
vector_db_folder = "vector_db"
os.makedirs(vector_db_folder, exist_ok=True)

# Only the first five .md files, in alphabetical order, are loaded
md_pattern = os.path.join(folder_path, "*.md")
md_files = sorted(glob.glob(md_pattern))[:5]

# Map each file's base name to its full text
md_dict = {}
for md_path in md_files:
    with open(md_path, "r", encoding="utf-8") as handle:
        md_dict[os.path.basename(md_path)] = handle.read()

print(f"Loaded {len(md_dict)} markdown files.")

Loaded 5 markdown files.


In [None]:
class ContextualRetrieval:
    """
    Contextual Retrieval over markdown documents.

    Each document is split on markdown headers, every chunk is prefixed with a
    short LLM-generated description of where it sits in the document, and the
    resulting chunks are embedded into one FAISS index per source file under
    the module-level ``vector_db_folder``.
    """

    def __init__(self):
        """
        Set up the header-based splitter, the embedding model and the chat model.

        The chat model's API key is read from the ``OPENAI_API_KEY`` environment variable.
        """
        self.text_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[
                ("#", "Header 1"),
                ("##", "Header 2"),
                ("###", "Header 3")
            ]
        )
        self.embeddings = OpenAIEmbeddings()
        self.llm = ChatOpenAI(
            model="gpt-4o-mini",
            temperature=0,
            max_tokens=None,
            timeout=None,
            max_retries=2,
            api_key=os.getenv("OPENAI_API_KEY")
        )

    def process_documents(self, md_dict: dict) -> None:
        """
        Build and persist one FAISS index per markdown document.

        Each index is saved under ``vector_db_folder`` in a sub-folder named after the
        file without its extension. Documents that yield no chunks are skipped.

        :param md_dict: Mapping of file name (e.g. ``"guide.md"``) to the file's markdown text.
        """
        for file_name, document in md_dict.items():
            _, contextualized_chunks = self._process_single_document(document)
            if not contextualized_chunks:
                # An index cannot be built from zero documents; skip instead of aborting the batch.
                print(f"Skipping {file_name}: no content to index.")
                continue
            vector_store = self.create_vectorstores(contextualized_chunks)
            # splitext strips only the trailing extension; str.replace would also
            # remove any ".md" that happens to occur inside the name.
            store_name = os.path.splitext(file_name)[0]
            vector_store.save_local(os.path.join(vector_db_folder, store_name))
        print("Documents processed and stored in vector_db folder.")

    def _process_single_document(self, document: str) -> Tuple[List[Document], List[Document]]:
        """
        Split one document into chunks and build their contextualized counterparts.

        :param document: Full markdown text of the document.
        :return: Tuple of (raw chunks, contextualized chunks).
        """
        chunks = self.text_splitter.split_text(document)  # Ensure `document` is a string
        contextualized_chunks = self._generate_contextualized_chunks(document, chunks)
        return chunks, contextualized_chunks

    def _generate_contextualized_chunks(self, document: str, chunks: List[Document]) -> List[Document]:
        """
        Prefix every chunk with generated context, keeping the chunk's metadata.

        :param document: Full markdown text the chunks were cut from.
        :param chunks: Chunks produced by the text splitter.
        :return: New documents whose content is the context, a blank line, then the chunk text.
        """
        contextualized_chunks = []
        for chunk in chunks:
            # One chat-model call per chunk
            context = self._generate_context(document, chunk.page_content)
            contextualized_content = f"{context}\n\n{chunk.page_content}"
            contextualized_chunks.append(Document(page_content=contextualized_content, metadata=chunk.metadata))
        return contextualized_chunks

    def create_vectorstores(self, chunks: List[Document]) -> FAISS:
        """
        Embed the given chunks into a new FAISS vector store.

        :param chunks: Documents to embed and index.
        :return: The FAISS store built from ``chunks``.
        """
        return FAISS.from_documents(chunks, self.embeddings)

    def _generate_context(self, document: str, chunk: str) -> str:
        """
        Ask the chat model for a short context situating ``chunk`` within ``document``.

        The full document text is included in every prompt.

        :param document: Full markdown text of the document.
        :param chunk: Text of the chunk to situate.
        :return: The model's reply text.
        """
        prompt = ChatPromptTemplate.from_template("""
        ### Instructions:
        1. **Understand the Document’s Context**:
        - The purpose is to generate a short, precise context for the given chunk to improve search retrieval.

        2. **Generate Context Without Redundant Phrasing**:
        - Do **not** use phrases like "This chunk discusses" or "This section provides."
        - Directly state what the chunk is about in a clear and concise manner.

        3. **Handle Tables Properly (if applicable)**:
        - If the chunk contains a table, summarize its purpose and what kind of data it represents.

        4. **Keep It Brief & Informative**:
        - The context should be **3 to 6 sentences max**.
        - It should provide just enough information to situate the chunk within the document while remaining succinct.

        **Answer only with the succinct context and nothing else.**

        <document>
        {document}
        </document>

        Here is the chunk we want to situate within the whole document:
        <chunk>
        {chunk}
        </chunk>

        ### **Context:**
        """)

        messages = prompt.format_messages(document=document, chunk=chunk)
        response = self.llm.invoke(messages)
        return response.content

    def query_multiple_vectorstores(self, query: str, top_k: int = 3) -> List[str]:
        """
        Query multiple vector stores in the vector_db folder and retrieve the most relevant results.

        Stores are visited in sorted name order so the result order is reproducible.
        Results are concatenated store by store; they are not re-ranked across stores.

        :param query: The query string.
        :param top_k: Number of top results to retrieve from each vector store.
        :return: List of retrieved responses.
        """
        results = []

        # os.listdir returns entries in arbitrary order; sort for deterministic output
        for store_name in sorted(os.listdir(vector_db_folder)):
            vector_store_path = os.path.join(vector_db_folder, store_name)
            if not os.path.isdir(vector_store_path):
                continue

            # NOTE(security): allow_dangerous_deserialization=True permits loading pickled
            # data, which can execute arbitrary code. Only load indexes you created yourself.
            vector_store = FAISS.load_local(
                vector_store_path, self.embeddings, allow_dangerous_deserialization=True
            )

            retrieved_docs = vector_store.similarity_search(query, k=top_k)
            results.extend(doc.page_content for doc in retrieved_docs)

        return results


    def generate_answer(self, query: str, relevant_chunks: List[str]) -> str:
        """
        Ask the chat model to answer ``query`` from the retrieved chunk texts.

        :param query: The question to answer.
        :param relevant_chunks: Chunk texts to offer as supporting information;
            they are joined with blank lines in the prompt.
        :return: The model's reply text.
        """
        prompt = ChatPromptTemplate.from_template("""
        Based on the following information, please provide a concise and accurate answer to the question.
        If the information is not sufficient to answer the question, say so.

        Question: {query}

        Relevant information:
        {chunks}

        Answer:
        """)
        messages = prompt.format_messages(query=query, chunks="\n\n".join(relevant_chunks))
        response = self.llm.invoke(messages)
        return response.content


In [None]:
# Instantiate the pipeline and index every loaded document. process_documents
# makes one chat-model call per chunk and writes one FAISS index per file
# into vector_db_folder.
retrieval_system = ContextualRetrieval()
retrieval_system.process_documents(md_dict)

In [None]:
# Question to ask the retrieval system. The literal below contains only a
# newline, so fill in a real question before running the cells that follow.
query = """
"""

In [None]:
# Similarity-search every saved index; top_k is left at its default of 3, so
# up to three chunk texts come back per index.
docs = retrieval_system.query_multiple_vectorstores(query)

In [None]:
# Ask the chat model to answer the query from the retrieved chunk texts.
# The answer is stored in a variable; this cell does not display it.
contextualized_vector_answer = retrieval_system.generate_answer(query, docs)
