In [20]:
import chromadb
from typing import List
from dotenv import load_dotenv
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.schema import Document, TextNode
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, Settings
from llama_index.core.node_parser import MarkdownNodeParser, SentenceSplitter
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

In [2]:
load_dotenv()

True

In [3]:
class HybridMarkdownSentenceParser:
    def __init__(self, chunk_size: int = 512, chunk_overlap: int = 50):
        self.markdown_parser = MarkdownNodeParser()
        self.sentence_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    def get_nodes_from_documents(self, documents: List[Document]) -> List[TextNode]:
        # First, parse the documents using the Markdown parser
        markdown_nodes = self.markdown_parser.get_nodes_from_documents(documents)
        final_nodes = []

        for node in markdown_nodes:
            # If the node's text exceeds the chunk size, split it further
            if len(node.text.split()) > self.sentence_splitter.chunk_size:
                # Create a temporary Document for sentence splitting
                temp_doc = Document(text=node.text)
                # Split the text into smaller chunks
                split_nodes = self.sentence_splitter.get_nodes_from_documents([temp_doc])
                # Preserve metadata from the original node
                for split_node in split_nodes:
                    split_node.metadata.update(node.metadata)
                final_nodes.extend(split_nodes)
            else:
                final_nodes.append(node)

        return final_nodes

In [4]:
# Load your Markdown documents
documents = SimpleDirectoryReader("data").load_data()

# Initialize the hybrid parser
hybrid_parser = HybridMarkdownSentenceParser(chunk_size=512, chunk_overlap=50)

# Generate nodes using the hybrid parser
nodes = hybrid_parser.get_nodes_from_documents(documents)

In [5]:
chroma_client = chromadb.PersistentClient()
chroma_collection = chroma_client.get_or_create_collection("recipes_for_science")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes, storage_context=storage_context)

In [17]:
llm = HuggingFaceInferenceAPI(model_name="Qwen/Qwen2.5-Coder-32B-Instruct")
query_engine = index.as_chat_engine(llm=llm)

NameError: name 'HuggingFaceInferenceAPI' is not defined

In [14]:
response = query_engine.query("In what year did Herschel observe sunspots?")

In [15]:
print(response.response)

Herschel observed sunspots at some point during his investigations, but the specific year in which he made this observation is not provided in the context information.
