## 1. Install the llama stack client

In [192]:
%pip install llama_stack==0.2.20 llama-stack-client==0.2.20

Collecting llama_stack==0.2.20
  Using cached llama_stack-0.2.20-py3-none-any.whl.metadata (15 kB)
Using cached llama_stack-0.2.20-py3-none-any.whl (3.7 MB)
Installing collected packages: llama_stack
  Attempting uninstall: llama_stack
    Found existing installation: llama_stack 0.2.12
    Uninstalling llama_stack-0.2.12:
      Successfully uninstalled llama_stack-0.2.12
Successfully installed llama_stack-0.2.20
Note: you may need to restart the kernel to use updated packages.


## 2. List available models

In [2]:
from llama_stack_client import LlamaStackClient

client = LlamaStackClient(base_url="https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/")
client.models.list()

INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/models "HTTP/1.1 200 OK"


[Model(identifier='Llama-4-Scout-17B-16E-W4A16', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='Llama-4-Scout-17B-16E-W4A16', model_type='llm'),
 Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]

## 3. Import and run the KubeFlow Pipeline
Import the "[docling-pipeline_compiled.yaml](./docling-pipeline_compiled.yaml)" KubeFlow Pipeline into your pipeline server, then run the pipeline to insert your PDF documents into the vector database.

When running the pipeline, you can customize the following parameters:

- `base_url`: Base URL to fetch PDF files from
- `pdf_filenames`: Comma-separated list of PDF filenames to download and convert
- `num_workers`: Number of parallel workers
- `vector_db_id`: Milvus vector database ID
- `service_url`: Milvus service URL
- `embed_model_id`: Embedding model to use
- `max_tokens`: Maximum tokens per chunk
- `use_gpu`: Enable/disable GPU acceleration

Compile the ingestion pipeline, by installing `pip install kfp==2.14.1 kfp-kubernetes==2.14.0`

Note: The compiled pipeline was generated by running `python docling-pipeline.py`.

## 4. Is Milvus registered with LlamaStack from ingestion pipeline?

In [4]:
from llama_stack_client import Agent, AgentEventLogger, LlamaStackClient

vector_db_id = "sreips_vector_id"
client = LlamaStackClient(base_url="https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/")

models = client.models.list()
model_id = next(m for m in models if m.model_type == "llm").identifier

print(models)
print(client.vector_dbs.list())

INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/models "HTTP/1.1 200 OK"


[Model(identifier='Llama-4-Scout-17B-16E-W4A16', metadata={}, api_model_type='llm', provider_id='vllm-inference', type='model', provider_resource_id='Llama-4-Scout-17B-16E-W4A16', model_type='llm'), Model(identifier='granite-embedding-125m', metadata={'embedding_dimension': 768.0}, api_model_type='embedding', provider_id='sentence-transformers', type='model', provider_resource_id='ibm-granite/granite-embedding-125m-english', model_type='embedding')]


INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/vector-dbs "HTTP/1.1 200 OK"


[VectorDBListResponseItem(embedding_dimension=768, embedding_model='granite-embedding-125m', identifier='sreips_vector_id', provider_id='milvus', type='vector_db', provider_resource_id='sreips_vector_id', vector_db_name=None)]


## 5. Register the RH KCS MCP Server

In [5]:
client.toolgroups.register(
    toolgroup_id="mcp::rh-kcs-mcp",
    provider_id="model-context-protocol",
    mcp_endpoint={"uri" : "https://rh-kcs-mcp-servers.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/sse"},
)

INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/toolgroups "HTTP/1.1 200 OK"


In [6]:
client.toolgroups.list()

INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/toolgroups "HTTP/1.1 200 OK"


[ToolGroup(identifier='builtin::websearch', provider_id='tavily-search', type='tool_group', args=None, mcp_endpoint=None, provider_resource_id='builtin::websearch'),
 ToolGroup(identifier='builtin::rag', provider_id='rag-runtime', type='tool_group', args=None, mcp_endpoint=None, provider_resource_id='builtin::rag'),
 ToolGroup(identifier='mcp::rh-kcs-mcp', provider_id='model-context-protocol', type='tool_group', args=None, mcp_endpoint=McpEndpoint(uri='https://rh-kcs-mcp-servers.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/sse'), provider_resource_id='mcp::rh-kcs-mcp')]

## 6. Execute both the agents in sequence
Prompt LLama 4 with a question in relation to the document inserted and execute the rag_agent

In [9]:
import uuid

rag_agent = Agent(
    client,
    model=model_id,
    instructions="You are a helpful assistant",
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        }, 
        "mcp::rh-kcs-mcp",
    ],
    max_infer_iters=10
)

prompt = "Use the given rag search and find what's the resolution of volume attachment failures in kubernetes?"
prompt = "crashloop backoff in openshift?"
print("prompt>", prompt)

session_id = rag_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

response = rag_agent.create_turn(
    messages=[{"role": "user", "content": prompt}],
    session_id=session_id,
    stream=True,

)

for log in AgentEventLogger().log(response):
    log.print()

# rag_output = []

# for log in AgentEventLogger().log(response):
#     if log.role != "inference" and log.role != "tool_execution":
#         rag_output.append(log)

# if rag_output:
#     print("FINAL INFERENCE RESPONSE:\n")
#     rag_results = "".join(str(x) for x in rag_output) 
#     print(rag_results)
# else:
#     print("No final inference response found.")

INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/tools?toolgroup_id=mcp%3A%3Arh-kcs-mcp "HTTP/1.1 200 OK"


prompt> crashloop backoff in openshift?


INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/agents/494a8324-cd5b-4c09-a022-2adf26c597f0/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/agents/494a8324-cd5b-4c09-a022-2adf26c597f0/session/84c9c8bf-1136-46d6-be92-424e66f3c49c/turn "HTTP/1.1 200 OK"


[33minference> [0m[33m[0m[97m[0m
[32mtool_execution> Tool:knowledge_search Args:{'query': 'crashloop backoff in openshift'}[0m
[32mtool_execution> Tool:knowledge_search Response:[TextContentItem(text='knowledge_search tool found 5 chunks:\nBEGIN of knowledge_search tool results.\n', type='text'), TextContentItem(text="Result 1\nContent: 1. Playbook: Troubleshooting Pod CrashLoopBackOff\nSummary This state indicates that a pod's container is starting, crashing, and then being restarted by OpenShift, in a continuous loop. The pod's status will show CrashLoopBackOff .\nMetadata: {'chunk_id': '3beb065e-decf-48b4-9bbe-8382aa062baf', 'document_id': 'SREIPS-Prod-troubleshooting-Knowledge-Base', 'file_name': 'SREIPS-Prod-troubleshooting-Knowledge-Base'}\n", type='text'), TextContentItem(text='Result 2\nContent: Diagnostic Playbook\n1. Check Pod Status: Identify the failing pod.\n```\nNone oc get pods -n <namespace>\n```\n2. Describe the Pod: This shows the reason for the current state

Prompt LLama 4 with a question in relation to the document inserted and execute the mcp_agent

In [13]:
import uuid
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput

mcp_agent = Agent(
    client,
    model=model_id,
    instructions="""You are a helpful assistant. Search for relevant Red Hat knowledge articles.

Format each result as:
Title: [article title]
Link: [full view_uri URL]

Show the complete URL for each article so users can easily access them.""",
    tools=[
        "mcp::rh-kcs-mcp",
    ],max_infer_iters=100
)

input_prompt = "Find relevant knowledge articles for 'volume attachment failures in kubernetes'"
input_prompt = "Find relevant knowledge articles for 'what's the resolution for pod crashloop backoff failures in kubernetes'?"
input_prompt = "crashloopbackoff failure in openshift?"

session_id = mcp_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

response = mcp_agent.create_turn(
        messages=[{"role": "user","content": input_prompt}],
        session_id=session_id,
        stream=True,
    )

import re, json

for log in AgentEventLogger().log(response):
    log.print()

# mcp_output = []

# for log in AgentEventLogger().log(response):
#     if log.role == "tool_execution" and "Tool:search_kcs Response:" in log.content:
#         # Extract the JSON array between the single quotes after 'TextContentItem(text='
#         match = re.search(r"TextContentItem\(text='(.*?)', type='text'\)", log.content)
#         if match:
#             json_str = match.group(1)
#             try:
#                 data = json.loads(json_str)
#                 for item in data:
#                     title = item.get("title")
#                     uri = item.get("view_uri")
#                     mcp_output.append((title, " - ", uri))
#                     # print(f"- {title}: {uri}")
#             except json.JSONDecodeError as e:
#                 print("Error decoding JSON:", e)  

# mcp_results = "\n".join(f"{t[0]}{t[1]}{t[2]}" for t in mcp_output)

# print(mcp_results)  

# mcp_output = []
# for log in AgentEventLogger().log(response):
#     if log.role != "inference" and log.role != "tool_execution":
#         mcp_output.append(log)

# if mcp_output:
#     mcp_results = "".join(str(x) for x in mcp_output)
#     print("MCP RESPONSE:\n")
#     print(mcp_results)
# else:
#     print("No MCP response found.")

INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/tools?toolgroup_id=mcp%3A%3Arh-kcs-mcp "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/agents/c8ca66bc-f6a4-4e16-aa3f-bedaa942e574/session "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://lsd-llama-milvus-service-llamastack.apps.cluster-mqt5h.mqt5h.sandbox2228.opentlc.com/v1/agents/c8ca66bc-f6a4-4e16-aa3f-bedaa942e574/session/0aa6056f-675d-4848-ba14-2730814e2179/turn "HTTP/1.1 200 OK"


[33minference> [0m[33m[0m[33m[0m[97m[0m
[32mtool_execution> Tool:search_kcs Args:{'query': 'crashloopbackoff failure in openshift', 'rows': 50.0, 'start': 0.0, 'session_id': ''}[0m
[32mtool_execution> Tool:search_kcs Response:[TextContentItem(text='[{"id":"7024400","title":"knative-openshift pod in CrashLoopBackOff  due to failure of Liveness Probe in RHOCP 4","score":181.3567,"view_uri":"https://access.redhat.com/solutions/7024400"},{"id":"7041005","title":"The Openshift Kube Apiserver pods were stuck in CrashLoopBackOff","score":108.50442,"view_uri":"https://access.redhat.com/solutions/7041005"},{"id":"7109992","title":"Fluentd Pods in CrashLoopBackOff State  in RHOCP 4.","score":103.55866,"view_uri":"https://access.redhat.com/solutions/7109992"},{"id":"7073701","title":"aws-efs-csi-driver-controller in CrashLoopBackOff","score":103.241325,"view_uri":"https://access.redhat.com/solutions/7073701"},{"id":"7012517","title":"OpenShift Update Service Operator V5.0.0 is in Crash

Query chunks from a vector database.

In [74]:
chunks = client.vector_io.query(query="volume attachment failures", vector_db_id=vector_db_id)
    
for chunk in chunks:
    print(chunk)


INFO:httpx:HTTP Request: POST https://lls-route-llamastack.apps.cluster-5tptd.5tptd.sandbox2399.opentlc.com/v1/vector-io/query "HTTP/1.1 200 OK"


('scores', [0.8968541026115417, 0.8732659220695496, 0.8625918626785278])


### Obsolete code below

In [200]:
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput

agent = ReActAgent(
            client=client,
            model=model_id,
            tools=[
                {
                    "name": "builtin::rag/knowledge_search",
                    "args": {"vector_db_ids": [vector_db_id]},
                }, 
                # "mcp::rh-kcs-mcp",
            ],
            response_format={
                "type": "json_schema",
                "json_schema": ReActOutput.model_json_schema(),
            },
        )
prompt = [
    "Use the rag/knowledge_search tool and find what's the resolution for volume attachment failures in OpenShift?",
    "Then, based on the answer obtained for the last prompt, enrich it with additional details using rh-kcs-mcp tool.",
]
session_id = agent.create_session(session_name=f"s{uuid.uuid4().hex}")
for p in prompt:
    response = agent.create_turn(
            messages=[{"role": "user","content": p,}],
            session_id=session_id,
        )

for log in AgentEventLogger().log(response):
    log.print()

INFO:llama_stack_client._base_client:Retrying request to /v1/tools in 0.458628 seconds
INFO:llama_stack_client._base_client:Retrying request to /v1/tools in 0.900825 seconds


APIConnectionError: Connection error.

In [None]:
from llama_stack_client.lib.agents.react.agent import ReActAgent
from llama_stack_client.lib.agents.react.tool_parser import ReActOutput

# --- Create ReAct agent with both tools ---

react_agent = ReActAgent(
    client=client,
    model=model_id,
    instructions=(
            "You MUST follow this exact two-step process:\n"
            "1) Call the tool `knowledge_search` to retrieve internal KB context (use the query exactly as asked).\n"
            "2) After you receive the knowledge_search results, call the tool `mcp::rh-kcs-mcp` to search the external KCS articles, "
            "so you can cross-check the internal context and add public references.\n"
            "Do not try to answer before both tool calls are completed. If the first tool returns nothing, still call the second tool.\n"
            "Finally, synthesize a single, short answer and list the sources from both tools."
        ),
    tools=[
        {
            "name": "builtin::rag/knowledge_search",
            "args": {"vector_db_ids": [vector_db_id]},
        },
        {
            "name": "mcp::rh-kcs-mcp",
            "args": {},
        },
    ],
    response_format={
            "type": "json_schema",
            "json_schema": ReActOutput.model_json_schema(),
        },
    )

input_prompt = [
        "Find the resolution for volume attachment failures in Kubernetes. ",
    ]

session_id = react_agent.create_session(session_name=f"s{uuid.uuid4().hex}")

# --- STREAMING TURN ---
for prompt in input_prompt:
    print("\n--- NEW PROMPT ---\n", prompt)
    response_stream = react_agent.create_turn(
            messages=[{"role": "user", "content": prompt}],
            session_id=session_id,
        )

# --- LOG + CAPTURE FINAL MESSAGE ---

final_output = None
logger = AgentEventLogger()

for event in logger.log(response_stream):
    event.print()
if event.role == "assistant" and event.content:
    final_output = event.content

# --- PRINT CLEAN FINAL MESSAGE ---

print("\n============================")
print("FINAL ASSISTANT RESPONSE (RAG → MCP Combined Output):")
print("============================")
print(final_output or "(No final message returned)")


INFO:httpx:HTTP Request: POST https://lls-route-llamastack.apps.cluster-5tptd.5tptd.sandbox2399.opentlc.com/v1/agents "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://lls-route-llamastack.apps.cluster-5tptd.5tptd.sandbox2399.opentlc.com/v1/tools?toolgroup_id=builtin%3A%3Arag%2Fknowledge_search "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://lls-route-llamastack.apps.cluster-5tptd.5tptd.sandbox2399.opentlc.com/v1/tools?toolgroup_id=mcp%3A%3Arh-kcs-mcp "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://lls-route-llamastack.apps.cluster-5tptd.5tptd.sandbox2399.opentlc.com/v1/agents/994ad605-db10-48e0-8780-a32a99c4ea51/session "HTTP/1.1 200 OK"



--- NEW PROMPT ---
 Find the resolution for volume attachment failures in Kubernetes. 


INFO:httpx:HTTP Request: POST https://lls-route-llamastack.apps.cluster-5tptd.5tptd.sandbox2399.opentlc.com/v1/agents/994ad605-db10-48e0-8780-a32a99c4ea51/session/53ebde5c-5588-477f-a917-735b346622e8/turn "HTTP/1.1 200 OK"


[33minference> [0m[33m{[0m[33m 
[0m[33m   [0m[33m "[0m[33mt[0m[33mhou[0m[33mght[0m[33m":[0m[33m "[0m[33mFirst[0m[33m,[0m[33m I[0m[33m will[0m[33m use[0m[33m the[0m[33m knowledge[0m[33m_search[0m[33m tool[0m[33m to[0m[33m find[0m[33m information[0m[33m about[0m[33m volume[0m[33m attachment[0m[33m failures[0m[33m in[0m[33m Kubernetes[0m[33m."
[0m[33m   [0m[33m ,[0m[33m "[0m[33maction[0m[33m":[0m[33m [0m[33m [0m[33m	[0m[33m{
[0m[33m       [0m[33m "[0m[33mtool[0m[33m_name[0m[33m":[0m[33m [0m[33m "[0m[33mknowledge[0m[33m_search[0m[33m",
[0m[33m       [0m[33m "[0m[33mtool[0m[33m_params[0m[33m":[0m[33m [0m[33m	[0m[33m[[0m[33m 
[0m[33m           [0m[33m {[0m[33m 
[0m[33m               [0m[33m "[0m[33mname[0m[33m":[0m[33m [0m[33m "[0m[33mquery[0m[33m",
[0m[33m               [0m[33m "[0m[33mvalue[0m[33m":[0m[33m [0m[33m "[0m[33mKubernetes[0m[33m v