In [3]:
from llama_index.core import Document, SimpleDirectoryReader

document = Document.example()

In [2]:
document

Document(id_='c5478e16-c170-44b9-892e-99a6aafbc762', embedding=None, metadata={'filename': 'README.md', 'category': 'codebase'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='\nContext\nLLMs are a phenomenal piece of technology for knowledge generation and reasoning.\nThey are pre-trained on large amounts of publicly available data.\nHow do we best augment LLMs with our own private data?\nWe need a comprehensive toolkit to help perform this data augmentation for LLMs.\n\nProposed Solution\nThat\'s where LlamaIndex comes in. LlamaIndex is a "data framework" to help\nyou build LLM  apps. It provides the following tools:\n\nOffers data connectors to ingest your existing data sources and data formats\n(APIs, PDFs, docs, SQL, etc.)\nProvides ways to structure your data (indices, graphs) so that this data can be\neasily used with LLMs.

In [21]:
reader = SimpleDirectoryReader(input_files=["./files/private_airplane_acs_6.pdf"])
documents = reader.load_data()

In [22]:
len(documents)

87

In [23]:
documents

[Document(id_='0be4ee6e-c40f-4907-bacc-5914b7bca759', embedding=None, metadata={'page_label': 'i', 'file_name': 'private_airplane_acs_6.pdf', 'file_path': 'files/private_airplane_acs_6.pdf', 'file_type': 'application/pdf', 'file_size': 716798, 'creation_date': '2025-07-17', 'last_modified_date': '2025-07-17'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Private Pilot for Airplane Category \nAirman Certification Standards\nFAA-S-ACS-6C\nFlight Standards Service\nWashington, DC 20591\nNovember 2023', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{conte

In [26]:
documents[24].metadata

{'page_label': '17',
 'file_name': 'private_airplane_acs_6.pdf',
 'file_path': 'files/private_airplane_acs_6.pdf',
 'file_type': 'application/pdf',
 'file_size': 716798,
 'creation_date': '2025-07-17',
 'last_modified_date': '2025-07-17'}

In [16]:
documents.excluded_llm_metadata_keys = []

In [27]:
from llama_index.core.schema import MetadataMode

print(documents[24].get_content(metadata_mode=MetadataMode.LLM))

page_label: 17
file_path: files/private_airplane_acs_6.pdf

Private Pilot for Airplane Category ACS (FAA-S-ACS-6C)
17
Area of Operation III.  Airport and Seaplane Base Operations
PA.III.B.K2 Traffic pattern selection for the current conditions.
PA.III.B.K3 Right-of-way rules.
PA.III.B.K4 Use of automated weather and airport information.
Risk 
Management: The applicant is able to identify, assess, and mitigate risk associated with:
PA.III.B.R1 Collision hazards.
PA.III.B.R2 Distractions, task prioritization, loss of situational awareness, or disorientation.
PA.III.B.R3 Windshear and wake turbulence.
Skills: The applicant exhibits the skill to:
PA.III.B.S1 Identify and interpret airport/seaplane base runways, taxiways, markings, signs, and lighting.
PA.III.B.S2 Comply with recommended traffic pattern procedures.
PA.III.B.S3 Correct for wind drift to maintain the proper ground track.
PA.III.B.S4 Maintain orientation with the runway/landing area in use.
PA.III.B.S5 Maintain traffic pattern

In [18]:
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "finance",
        "author": "LlamaIndex",
    },
    excluded_llm_metadata_keys=["file_name"],
    metadata_seperator="::",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)

print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "The Embedding model sees this: \n",
    document.get_content(metadata_mode=MetadataMode.EMBED),
)

The LLM sees this: 
 Metadata: category=>finance::author=>LlamaIndex
-----
Content: This is a super-customized document
The Embedding model sees this: 
 Metadata: file_name=>super_secret_document.txt::category=>finance::author=>LlamaIndex
-----
Content: This is a super-customized document


In [43]:
import os
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Document, get_response_synthesizer
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.vector_stores import MetadataFilter, FilterOperator, MetadataFilters
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

# Ensure you have your OpenAI API key set up
# os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"

# Set default LLM and embedding model for the whole pipeline
Settings.llm = Ollama(model="deepseek-r1:7b", temperature=0.2, seed=334, request_timeout=90.0)
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text:v1.5")

In [35]:
# --- Step 1: Prepare your documents with appropriate metadata ---
# Let's simulate your directory structure and file naming
# data_dir = "files/poh"
data_dir = "files"

# Custom Document loading to add 'aircraft_model' metadata
def load_documents_with_metadata(directory_path):
    reader = SimpleDirectoryReader(directory_path, recursive=True)
    loaded_docs = reader.load_data()
    
    docs_with_metadata = []
    for doc in loaded_docs:
        file_name = os.path.basename(doc.metadata.get('file_path', ''))

        # Extract aircraft model from filename (e.g., "poh_162_1.pdf" -> "162")
        aircraft_model = None
        document_type = None
        if "poh_" in file_name:
            aircraft_model = "Cessna 162"
            document_type = "Pilot Operating Handbook"
        elif "phak" in file_name:
            document_type = "Pilot Handbook of Aeronautical Knowledge"
        elif "acs" in file_name:
            document_type = "Airman Certification Standards"
        elif "afh" in file_name:
            document_type = "Airplane Flying Handbook"

        # Add metadata to the document
        new_metadata = doc.metadata.copy()
        if aircraft_model:
            new_metadata['aircraft_model'] = aircraft_model

        new_metadata['document_type'] = document_type
        
        docs_with_metadata.append(Document(text=doc.text, metadata=new_metadata))
    return docs_with_metadata

documents = load_documents_with_metadata(data_dir)
print(f"Loaded {len(documents)} documents with metadata.")

for doc in documents:
    print(f"  - {doc.metadata.get('file_name', 'N/A')}: Model={doc.metadata.get('aircraft_model', 'N/A')}, Type={doc.metadata.get('document_type', 'N/A')}")

Loaded 1225 documents with metadata.
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_1.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_10.pdf: Model=N/A, Type=Airplane Flying Handbook
  - afh_10

In [36]:
# --- Step 2: Create a VectorStoreIndex for all documents ---
# We'll use a single index for all documents, but filter by metadata at query time.
# This is more efficient than creating separate indexes for each POH.

# Create nodes from documents (standard practice)
node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=20)
nodes = node_parser.get_nodes_from_documents(documents)

# Create the main vector index from all nodes
all_documents_index = VectorStoreIndex(nodes)
print("\nVectorStoreIndex created for all documents.")


VectorStoreIndex created for all documents.


In [45]:
### --- Step 3: Implement Routing Query Engines ---
# We'll create specialized query engines for each document type/model,
# and then use a RouterQueryEngine to select the correct one.

# 3.1 Create a generic query engine that can filter by metadata
def get_filtered_query_engine(index, filters=None):
    # This retriever will apply filters before fetching nodes
    retriever = index.as_retriever(
        similarity_top_k=5, # You can adjust top_k
        filters=filters # Pass the filters here
    )
    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=get_response_synthesizer(),
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    )
    return query_engine

# 3.2 Define individual QueryEngineTools for each document type/model
# We'll create "dummy" filter objects that the Router can use for description,
# but the actual filtering will happen dynamically.

# Query engine for Cessna 162 POH
poh_162_filters = MetadataFilters(
    filters=[MetadataFilter(key="aircraft_model", value="Cessna 162", operator=FilterOperator.EQ)]
)
poh_162_qe_tool = QueryEngineTool.from_defaults(
    query_engine=get_filtered_query_engine(all_documents_index, poh_162_filters),
    description=(
        "Useful for questions specifically about the Cessna 162 aircraft, "
        "including its engine, performance, and operational procedures. "
        "This uses the Pilot Operating Handbook (POH) for Cessna 162."
    ),
    name="poh_cessna_162_tool",
)

# Query engine for general aviation regulations
phak_filters = MetadataFilters(
    filters=[MetadataFilter(key="document_type", value="Pilot Handbook of Aeronautical Knowledge", operator=FilterOperator.EQ)]
)
phak_qe_tool = QueryEngineTool.from_defaults(
    query_engine=get_filtered_query_engine(all_documents_index, phak_filters),
    description=(
        "Useful for general aviation regulations, VFR flight rules, "
        "air traffic control procedures, and other non-aircraft-specific aviation topics."
    ),
    name="phak_tool",
)

afh_filters = MetadataFilters(
    filters=[MetadataFilter(key="document_type", value="Airplane Flying Handbook", operator=FilterOperator.EQ)]
)
afh_qe_tool = QueryEngineTool.from_defaults(
    query_engine=get_filtered_query_engine(all_documents_index, afh_filters),
    description=(
        "Useful for techniques for flying aircraft of all type."
    ),
    name="afh_tool"
)

acs_filters = MetadataFilters(
    filters=[MetadataFilter(key="document_type", value="Airman Certification Standards", operator=FilterOperator.EQ)]
)
acs_qe_tool = QueryEngineTool.from_defaults(
    query_engine=get_filtered_query_engine(all_documents_index, acs_filters),
    description=(
        "The standards and expectations of Private Pilots. "
        "Useful to understand what requirements they must achieve to pass the final test (Checkride) "
        "and expectations of a Private Pilots Licesnse (PPL)"
    ),
    name="acs_tool"
)

# Default/General query engine (if no specific tool is chosen)
# This one doesn't have specific metadata filters at the tool level, it searches everything.
general_qe_tool = QueryEngineTool.from_defaults(
    query_engine=all_documents_index.as_query_engine(
        similarity_top_k=5,
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    ),
    description=(
        "Use if the other tools are not suitable."
    ),
    name="general_aviation_tool",
)

# 3.3 Create the RouterQueryEngine
# The LLMSingleSelector will use the descriptions of the tools to decide which one to use.
router_query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[
        poh_162_qe_tool,
        phak_qe_tool,
        afh_qe_tool,
        acs_qe_tool,
        general_qe_tool, # Include a general tool as a fallback
    ],
    verbose=True # Set to True to see which tool the router selects
)

print("\nRouterQueryEngine configured.")


RouterQueryEngine configured.


In [46]:
### --- Step 4: Querying with Chain Reasoning ---

print("\n--- Querying Examples ---")

queries = [
    "What kind of engine does the Cessna 162 use?",
    "Describe the electrical system of the Cessna 172.",
    "What are the rules for VFR flight?",
    "According to the Airman Certification Standards for the Private Pilots license what skills when maneuvering during slow flight do I need to exhibit and what are their codes?"
]

for i, query in enumerate(queries):
    print(f"\n--- Query {i+1}: {query} ---")
    response = router_query_engine.query(query)
    print(f"Response: {response.response}")
    print(f"Selected Tool: {response.metadata.get('tool_name', 'N/A')}")
    print("\nSource Document(s) and Page Number(s):")
    
    # Iterate through the source_nodes to get file name and page label
    # Each node in response.source_nodes is a NodeWithScore object
    for node_with_score in response.source_nodes:
        # Access the underlying TextNode (or other Node type)
        node = node_with_score.node
        
        # Get metadata from the node
        file_name = node.metadata.get('file_name', 'N/A')
        page_label = node.metadata.get('page_label', 'N/A') # 'page_label' is the common key for page number
        
        print(f"- File: {file_name}, Page: {page_label}")
        # Optionally, you can print a snippet of the content to verify
        # print(f"  Content Snippet: {node.get_content()[:150]}...")


--- Querying Examples ---

--- Query 1: What kind of engine does the Cessna 162 use? ---
[1;3;38;5;200mSelecting query engine 0: This choice provides information about the Cessna 162's engine, performance, and operational procedures, which are directly relevant to answering the question..
[0mResponse: <think>
Okay, so I need to figure out what kind of engine the Cessna 162 uses based on the provided context. Let me start by reading through the given information carefully.

Looking at page_label 25 first, it mentions that the Cessna 162 has a four-cylinder horizontally opposed engine with wet sump lubrication. It's described as normally aspirated, direct drive, air-cooled, and carbureted. The model number is T eledyne Continental O-200-D, rated at 100 horsepower at 2750 RPM.

Then on page_label 14, there's a section about starting the engine which includes details like fuel primer control, throttle settings, and mixture control. This seems to be more procedural than describing the en