In [57]:
# Install necessary packages
!pip install -q --progress-bar off llama-index-core llama-index-readers-docling \
llama-index-node-parser-docling llama-index-embeddings-huggingface \
llama-index-llms-huggingface-api llama-index-vector-stores-milvus \
llama-index-readers-file python-dotenv

In [58]:
# Import Modules & Define Helper Functions
import os
from pathlib import Path
from tempfile import mkdtemp
from warnings import filterwarnings
from dotenv import load_dotenv

# Load environment variables (python-dotenv)
load_dotenv()

# Keep cell output readable: ignore one warning category per noisy module
for _category, _module in (
    (UserWarning, "pydantic"),
    (FutureWarning, "easyocr"),
):
    filterwarnings(action="ignore", category=_category, module=_module)

# Turn tokenizer parallelism off to prevent tokenizer parallelism issues
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Function to get environment variables
def _get_env_from_colab_or_os(key):
    """Look up ``key`` in the process environment.

    Note that, despite the function's name, only ``os.environ`` is consulted.

    Args:
        key: Name of the environment variable to read.

    Returns:
        The variable's value as a string, or ``None`` when it is not set.
    """
    return os.environ.get(key)


In [59]:
# Define Main Parameters
import os
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Read the HuggingFace token from the environment and stop right away
# when it is missing, before any model object is constructed
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise EnvironmentError("HF_TOKEN environment variable is not set!")

# Document to ingest and the question to ask about it
SOURCE = "https://arxiv.org/pdf/2408.09869"  # Docling Technical Report
QUERY = "Which are the main AI models in Docling?"

# Embedding model, and a database file path inside a fresh temporary directory
EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")

# Generation model, authenticated with the token read above
GEN_MODEL = HuggingFaceInferenceAPI(
    model_name="mistralai/Mixtral-8x7B-Instruct-v0.1",
    token=hf_token,
)

# Embedding size, measured by embedding a short probe string
embed_dim = len(EMBED_MODEL.get_text_embedding("hi"))


In [60]:
# Define RAG Pipeline with DoclingReader
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.milvus import MilvusVectorStore

# Define the DoclingReader and node parser
reader = DoclingReader()
node_parser = MarkdownNodeParser()

# Configure the vector store.
# Reuse the database path defined in the parameters cell (MILVUS_URI) instead
# of creating a second temporary directory with a duplicated path expression.
vector_store = MilvusVectorStore(
    uri=MILVUS_URI,  # Temporary database
    dim=embed_dim,
    overwrite=True,
)

# Create the index: load SOURCE with the reader, apply the node parser as the
# only transformation, and store embeddings from EMBED_MODEL in the vector store
index = VectorStoreIndex.from_documents(
    documents=reader.load_data(SOURCE),
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=EMBED_MODEL,
)

KeyboardInterrupt: 

In [None]:
# Build a query engine on top of the index, answering with GEN_MODEL
query_engine = index.as_query_engine(llm=GEN_MODEL)

# Run the question through the pipeline
result = query_engine.query(QUERY)

# Show the question and answer, followed by every source node that was used
print(f"Q: {QUERY}\nA: {result.response.strip()}\n\nSources:")
for node in result.source_nodes:
    print(f"Text: {node.text}\nMetadata: {node.metadata}\n")
