In [1]:
### PART 1: SYSTEM PROVISIONING AND ENVIRONMENTAL CONFIGURATION ###

# Install necessary libraries for Hybrid Orchestration
!pip install -q llama-index llama-parse llama-index-llms-google-genai llama-index-embeddings-google-genai
!pip install -q langchain-google-genai langchain-community

import os
import nest_asyncio
from google.colab import drive
from google.colab import userdata

# 1.1. Persistence and Async Configuration
# nest_asyncio patches the notebook's already-running event loop so that
# LlamaIndex's async code can be driven from inside a Colab cell.
nest_asyncio.apply()
drive.mount('/content/drive')

# 1.2. API Credentialing
# Keys are read from Colab secrets first; if that fails for any reason the user
# is prompted instead. getpass is used for the prompt so the key is not echoed
# into the cell output (which is saved together with the notebook).
from getpass import getpass


def _load_api_key(secret_name, prompt_text):
    """Populate ``os.environ[secret_name]`` from Colab secrets or a hidden prompt.

    Args:
        secret_name: Name of both the Colab secret and the environment variable.
        prompt_text: Prompt shown when the secret cannot be read or is empty.

    Returns:
        The key that was stored, with surrounding whitespace removed.

    Raises:
        ValueError: If no non-empty key could be obtained.
    """
    try:
        key = userdata.get(secret_name)
    except Exception as err:  # best-effort lookup; fall back to the prompt
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate, and the reason for the fallback is reported.
        print(
            f"Could not read Colab secret {secret_name!r} "
            f"({type(err).__name__}); falling back to manual entry."
        )
        key = None

    if not key or not key.strip():
        key = getpass(prompt_text)

    # Stray whitespace/newlines from copy-paste make the remote API reject the
    # key, so normalise before exporting it.
    key = key.strip()
    if not key:
        raise ValueError(f"No value provided for {secret_name}.")

    os.environ[secret_name] = key
    return key


_load_api_key("GOOGLE_API_KEY", "Please enter your Google API Key: ")
_load_api_key("LLAMA_CLOUD_API_KEY", "Please enter your LlamaParse API Key: ")

### PART 2: ARCHITECTURAL COMPONENT INITIALIZATION ###

from llama_index.core import Settings, SimpleDirectoryReader, StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.core.node_parser import HierarchicalNodeParser, get_leaf_nodes
from llama_index.core.extractors import QuestionsAnsweredExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.retrievers import AutoMergingRetriever
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.google_genai import GoogleGenAIEmbedding
from llama_parse import LlamaParse

# 2.1. Model Alignment (Source [3])
# Both models are registered on the global LlamaIndex Settings object so every
# component created later picks them up by default. The embedding model used
# to build the index must also be the one used at query time, so it is
# configured exactly once, here.
gemini_generator = GoogleGenAI(model="models/gemini-1.5-pro", temperature=0.1)
gemini_embedder = GoogleGenAIEmbedding(model_name="models/embedding-001")

Settings.llm = gemini_generator
Settings.embed_model = gemini_embedder

### PART 3: STRUCTURAL INGESTION AND METADATA INJECTION ###

# 3.1. Taxonomic Metadata Injection Hook
# This function extracts inter-document hierarchy (Grade Level) from the file path.
def get_grade_metadata(file_path):
    """Derive taxonomy metadata for a document from its location on disk.

    The name of the directory that directly contains the file is taken as the
    grade level, e.g. ``/content/drive/MyDrive/History/Grade_10/textbook.pdf``
    yields ``{"grade_level": "Grade_10", "file_name": "textbook.pdf"}``.

    Args:
        file_path: Path to the document, as a string or ``os.PathLike``.

    Returns:
        A dict with the keys ``"grade_level"`` and ``"file_name"``.
        ``"grade_level"`` is an empty string when the path has no named
        parent directory (for example a bare file name).
    """
    # Function-scope import keeps this block self-contained.
    from pathlib import PurePath

    # pathlib handles the platform's separator and PathLike inputs, and never
    # raises IndexError on short paths the way manual "/"-splitting does.
    location = PurePath(file_path)
    return {
        "grade_level": location.parent.name,
        "file_name": location.name,
    }

# 3.2. Structural Parsing Configuration (Source [2])
# Instruction text handed to LlamaParse; kept in its own constant so the
# parser construction below stays short.
_TEXTBOOK_PARSING_INSTRUCTION = """
    Parse this history textbook.
    1. Retain all Chapter and Section headings as Markdown headers (#, ##).
    2. Keep chronological tables intact.
    """

# Markdown output keeps the intra-document hierarchy (chapters, tables)
# visible to the downstream node parser.
parser = LlamaParse(
    parsing_instruction=_TEXTBOOK_PARSING_INSTRUCTION,
    result_type="markdown",
)

# Route every PDF encountered by the directory reader through LlamaParse.
file_extractor = {".pdf": parser}

### PART 4: HIERARCHICAL INDEXING (SMALL-TO-BIG) ###

PERSIST_DIR = "/content/drive/MyDrive/Hybrid_RAG_Storage"
SOURCE_DIR = "/content/drive/MyDrive/History"  # Adjust to your path


def _has_persisted_index(persist_dir):
    """Return True when ``persist_dir`` is a directory with at least one entry."""
    # A bare existence check would treat an empty folder (e.g. one created by
    # hand in Drive) as a saved index and then fail while loading it.
    return os.path.isdir(persist_dir) and bool(os.listdir(persist_dir))


def build_hybrid_index(input_dir=None, persist_dir=None, chunk_sizes=(2048, 512, 128)):
    """Load the persisted hierarchical index, or build and persist it.

    When ``persist_dir`` already holds data, the index is loaded from it.
    Otherwise documents under ``input_dir`` are read, split into a node
    hierarchy, enriched with generated questions, stored and persisted.

    Args:
        input_dir: Directory scanned recursively for documents. Defaults to
            ``SOURCE_DIR``.
        persist_dir: Directory the index is loaded from / persisted to.
            Defaults to ``PERSIST_DIR``.
        chunk_sizes: Chunk sizes for the node hierarchy, largest first.

    Returns:
        The loaded or newly built index.

    Raises:
        ValueError: If ingestion yields no documents or no leaf nodes, so that
            an empty index is never written to ``persist_dir``.
    """
    input_dir = SOURCE_DIR if input_dir is None else input_dir
    persist_dir = PERSIST_DIR if persist_dir is None else persist_dir

    if _has_persisted_index(persist_dir):
        print("Loading existing knowledge graph from persistent storage...")
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        return load_index_from_storage(storage_context)

    print("Commencing high-fidelity structural indexing...")

    # Ingest documents with Metadata Injection
    reader = SimpleDirectoryReader(
        input_dir=input_dir,
        recursive=True,
        file_extractor=file_extractor,
        file_metadata=get_grade_metadata,
    )
    documents = reader.load_data()
    if not documents:
        # Persisting now would store an empty index that every later run
        # would silently reload instead of re-ingesting.
        raise ValueError(f"No documents could be loaded from {input_dir!r}.")

    # 4.1. The Chunk Size Dilemma Resolution (Source [2])
    # Default hierarchy: 2048 (Context) -> 512 (Intermediate) -> 128 (Retrieval Precision)
    node_parser = HierarchicalNodeParser.from_defaults(
        chunk_sizes=list(chunk_sizes)
    )

    # 4.2. Content-Based Metadata Extraction
    # Generates questions that each node's content can answer.
    extractor = QuestionsAnsweredExtractor(
        questions=2,
        llm=Settings.llm,
        metadata_mode="embed",
    )

    # 4.3. Ingestion Pipeline
    # Runs the parser and then the extractor. Warning: This increases ingestion time/cost.
    pipeline = IngestionPipeline(
        transformations=[node_parser, extractor]
    )

    # Generate the node tree
    nodes = pipeline.run(documents=documents)
    leaf_nodes = get_leaf_nodes(nodes)  # Only leaves are embedded for search
    if not leaf_nodes:
        raise ValueError("Ingestion produced no leaf nodes; nothing to index.")

    print("\n--- Sample Leaf Node Metadata ---")
    for i, leaf_node in enumerate(leaf_nodes[:3]):  # Print metadata for first 3 leaf nodes
        print(f"Leaf Node {i} Metadata: {leaf_node.metadata}")
    print("-----------------------------------\n")

    # 4.4. Dual-Layer Storage (Source [3])
    # Every node (parents included) goes into the docstore; only the leaves
    # are passed to the vector index.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    index = VectorStoreIndex(
        leaf_nodes,
        storage_context=storage_context,
    )

    # Persist to "Non-Parametric Memory"
    index.storage_context.persist(persist_dir=persist_dir)
    return index

# Initialize the Index
# Executed as soon as the cell runs: either builds and persists a fresh index
# or reloads a previously persisted one from PERSIST_DIR.
index = build_hybrid_index()

### PART 5: HYBRID ORCHESTRATION (LANGCHAIN WRAPPER) ###

from langchain_community.retrievers import LlamaIndexRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# 5.1. Constructing the Auto-Merging Retriever
# The base retriever searches the leaf embeddings; AutoMergingRetriever is
# given the storage context so it can look up parent nodes for the hits.
base_retriever = index.as_retriever(similarity_top_k=5)
auto_merging_retriever = AutoMergingRetriever(
    base_retriever,
    index.storage_context,
    verbose=True # Set to True to see the "merging" happen in logs
)

# 5.2. Wrapping in LangChain Interface (Source [1, 2])
# langchain_community's LlamaIndexRetriever expects an `index` object and calls
# `index.query(...)`; it has no `retriever` field, so it cannot drive the
# auto-merging retriever. A small RunnableLambda adapter is used instead: it is
# LCEL-compatible and renders the retrieved nodes as plain text for the prompt.
from langchain_core.runnables import RunnableLambda


def _retrieve_context(question):
    """Run the auto-merging retriever and format the hits as prompt context.

    Args:
        question: The user question used as the retrieval query.

    Returns:
        One string containing each retrieved node's text, prefixed with its
        grade level and file name and separated by horizontal rules.
    """
    sections = []
    for hit in auto_merging_retriever.retrieve(question):
        metadata = hit.node.metadata or {}
        # Surface the taxonomy tags so the generator can tell grade levels apart.
        header = (
            f"[grade_level: {metadata.get('grade_level', 'unknown')} | "
            f"file_name: {metadata.get('file_name', 'unknown')}]"
        )
        sections.append(f"{header}\n{hit.node.get_content()}")
    return "\n\n---\n\n".join(sections)


langchain_retriever = RunnableLambda(_retrieve_context)

# 5.3. Generative Orchestration via LCEL
# LangChain owns the prompt template and the Gemini call; retrieval is plugged
# in through `langchain_retriever`.
llm = ChatGoogleGenerativeAI(temperature=0.1, model="gemini-1.5-pro")

# Prompt text, assembled from adjacent literals (same content as a single
# triple-quoted block, including the leading and trailing newlines).
template = (
    "\n"
    "You are a history teacher answering high school level history multiple choice questions with five choices. "
    "Using the provided context—which may include grade-specific metadata—answer the question, select the correct choice and output it. "
    "Prioritize chronological accuracy and distinguish between grade-level perspectives if evident.\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = ChatPromptTemplate.from_template(template)

# Define the Chain
# The incoming query is fanned out: once through the retriever to produce
# `context`, once unchanged as `question`.
chain_inputs = {"context": langchain_retriever, "question": RunnablePassthrough()}
answer_parser = StrOutputParser()
rag_chain = chain_inputs | prompt | llm | answer_parser

### PART 6: EXECUTION ###

query = "Compare the treatment of the Industrial Revolution in Grade 10 vs Grade 12."
print("Executing Query: " + query + "\n")

# A single invoke drives the whole chain: retrieval fills the prompt's context,
# then the model generates and the output parser returns plain text.
response = rag_chain.invoke(query)
print("\nGenerated Analysis:\n", response, sep="\n")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.6/102.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

ClientError: 400 INVALID_ARGUMENT. {'error': {'code': 400, 'message': 'API key not valid. Please pass a valid API key.', 'status': 'INVALID_ARGUMENT', 'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'API_KEY_INVALID', 'domain': 'googleapis.com', 'metadata': {'service': 'generativelanguage.googleapis.com'}}, {'@type': 'type.googleapis.com/google.rpc.LocalizedMessage', 'locale': 'en-US', 'message': 'API key not valid. Please pass a valid API key.'}]}}