In [None]:
!pip -q install -U llama-index llama-index-llms-openai llama-index-embeddings-openai nest_asyncio

import os
import asyncio
import nest_asyncio
nest_asyncio.apply()

from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter OPENAI_API_KEY: ")

In [None]:
from llama_index.core import Document, VectorStoreIndex, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Register the chat model and the embedding model on the global `Settings`
# object. This must happen before the index is built below.
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.2)
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Tiny in-memory knowledge base: four one-or-two-sentence passages about
# RAG reliability, evaluation, and tool-using agents.
texts = [
    "Reliable RAG systems separate retrieval, synthesis, and verification. Common failures include hallucination and shallow retrieval.",
    "RAG evaluation focuses on faithfulness, answer relevancy, and retrieval quality.",
    "Tool-using agents require constrained tools, validation, and self-review loops.",
    "A robust workflow follows retrieve, answer, evaluate, and revise steps."
]

# Wrap each passage in a Document and build a vector index over them.
docs = [Document(text=t) for t in texts]
index = VectorStoreIndex.from_documents(docs)
# NOTE(review): similarity_top_k=4 matches the number of passages above, so a
# query presumably returns the whole corpus — revisit if the corpus grows.
query_engine = index.as_query_engine(similarity_top_k=4)

In [None]:
from llama_index.core.evaluation import FaithfulnessEvaluator, RelevancyEvaluator

# Build both evaluators on the LLM registered in `Settings.llm`; they are
# called by `score_answer` below.
faith_eval = FaithfulnessEvaluator(llm=Settings.llm)
rel_eval = RelevancyEvaluator(llm=Settings.llm)

def retrieve_evidence(q: str) -> str:
    """Retrieve the knowledge-base passages most relevant to a question.

    Use this before writing an answer so the answer can be grounded in, and
    cite, the returned evidence.

    Args:
        q: The natural-language question or topic to search for.

    Returns:
        The retrieved passages joined by newlines. Each passage is prefixed
        with a 1-based citation marker such as ``[1]`` and truncated to 300
        characters. An empty string when nothing is retrieved.
    """
    # Only the source passages are needed here, so go straight to the
    # retriever. Running the full query engine would also trigger an LLM
    # synthesis call whose answer would be thrown away.
    hits = query_engine.retriever.retrieve(q)
    return "\n".join(
        f"[{rank}] {hit.node.get_content()[:300]}"
        for rank, hit in enumerate(hits or [], start=1)
    )

def score_answer(q: str, a: str) -> str:
    """Score a draft answer for faithfulness and relevancy against retrieved evidence.

    The passages retrieved for the question are used as the reference
    contexts for both evaluators.

    Args:
        q: The question the answer is responding to.
        a: The draft answer text to evaluate.

    Returns:
        A two-line report: a ``Faithfulness: <score>`` line followed by a
        ``Relevancy: <score>`` line, where each score is whatever the
        corresponding evaluator reported.
    """
    # Fetch contexts from the retriever directly; the synthesized answer the
    # query engine would produce is not needed for scoring.
    hits = query_engine.retriever.retrieve(q)
    contexts = [hit.node.get_content() for hit in hits or []]

    # Distinct names for the two results (the retrieval result and the
    # relevancy result previously shared one variable).
    faithfulness = faith_eval.evaluate(query=q, response=a, contexts=contexts)
    relevancy = rel_eval.evaluate(query=q, response=a, contexts=contexts)
    return f"Faithfulness: {faithfulness.score}\nRelevancy: {relevancy.score}"

In [None]:
from llama_index.core.agent.workflow import ReActAgent
from llama_index.core.workflow import Context

# ReAct agent that is given the two plain functions above as its tools and
# the globally configured LLM. The system prompt asks for a retrieve ->
# answer -> evaluate -> (single) revise sequence.
agent = ReActAgent(
    tools=[retrieve_evidence, score_answer],
    llm=Settings.llm,
    system_prompt="""
Always retrieve evidence first.
Produce a structured answer.
Evaluate the answer and revise once if scores are low.
""",
    verbose=True
)

# Context object bound to the agent; it is passed to `agent.run(...)` in
# `run_brief`. Presumably it carries agent state between runs — TODO confirm.
ctx = Context(agent)

In [3]:
async def run_brief(topic: str):
    """Run the agent on a workflow-design prompt for ``topic`` and return its answer.

    While the agent runs, the ``delta`` text of each streamed event is echoed
    to stdout without a trailing newline.

    Args:
        topic: Subject appended to the fixed design prompt.

    Returns:
        The agent's final result converted to ``str``.
    """
    prompt = f"Design a reliable RAG + tool-using agent workflow and how to evaluate it. Topic: {topic}"
    run = agent.run(prompt, ctx=ctx)

    # Events that carry no `delta` attribute contribute an empty string.
    async for event in run.stream_events():
        chunk = getattr(event, "delta", "")
        print(chunk, end="")

    final = await run
    return str(final)

topic = "RAG agent reliability and evaluation"

# Drive the coroutine to completion on the current event loop. Inside a
# notebook this works because `nest_asyncio.apply()` was called earlier.
loop = asyncio.get_event_loop()
result = loop.run_until_complete(run_brief(topic))

# Banner, blank line, then the final answer.
print("\n\nFINAL OUTPUT\n", result, sep="\n")


--- STREAM (agent reasoning + tool use) ---

```
Thought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: retrieve_evidence
Action Input: {"question":"Design a reliable RAG + tool-using agent workflow and how to evaluate it."}
```
Thought: I have gathered relevant evidence regarding the design of a reliable RAG + tool-using agent workflow and how to evaluate it. Now, I can formulate a comprehensive answer.

Answer: To design a reliable RAG (Retrieval-Augmented Generation) + tool-using agent workflow, follow these steps:

1. **Clarify Task and Success Criteria**: Clearly define the task at hand and establish what success looks like for the agent.

2. **Retrieve Evidence**: Implement a robust retrieval mechanism to gather relevant information from a knowledge base or external sources.

3. **Produce an Answer**: Generate an answer using the retrieved evidence, ensuring to include citations or "evidence blocks" to support the claim