## 1. Install the Llama Stack client

In [None]:
# Install the Llama Stack Python client (version 0.3.0 or newer); %pip installs into the kernel's environment.
%pip install "llama-stack-client>=0.3.0"

## 2. List available models and vector stores

In [None]:
from llama_stack_client import LlamaStackClient

# Create a client for the Llama Stack server at this base URL.
client = LlamaStackClient(base_url="http://lsd-milvus-service:8321")

# Ask the server which models and which vector stores it currently exposes.
models = client.models.list()
vector_stores = client.vector_stores.list()

# Print both listings, one per line, separated by a "---" divider line.
print(models, "---", vector_stores, sep="\n")

## 3. Set LLM and Vector Store to use

In [None]:
from llama_stack_client import LlamaStackClient

# Create a client for the Llama Stack server at this base URL.
client = LlamaStackClient(base_url="http://lsd-milvus-service:8321")

# Identifier of the model that the query cell passes to `responses.create`.
model_id = "vllm-inference/qwen3-14b-awq"
# Name used by the query cell to pick the matching vector store.
vector_store_name = "pdf-vector-store"

# Echo the chosen settings, one per line.
print(model_id, vector_store_name, sep="\n")

## 4. Prompt the LLM and retrieve relevant context via RAG
Prompt the LLM with a question about the inserted documents and see it return an accurate answer grounded in their content.

In [None]:
# Create a client for the Llama Stack server at this base URL.
client = LlamaStackClient(base_url="http://lsd-milvus-service:8321")

# Find the vector store whose name matches `vector_store_name`;
# `next(..., None)` yields None when no store has that name.
vector_stores = client.vector_stores.list()
vector_store = next(
    (s for s in vector_stores.data if s.name == vector_store_name), None
)

# Fail with an actionable message instead of an opaque
# "'NoneType' object has no attribute 'id'" further down.
if vector_store is None:
    available_names = [s.name for s in vector_stores.data]
    raise ValueError(
        f"Vector store {vector_store_name!r} not found; "
        f"available vector stores: {available_names}"
    )

# Send the question to the model, giving it the file_search tool scoped to
# the selected vector store so it can retrieve context for its answer.
resp = client.responses.create(
    model=model_id,
    instructions="""
        You are a helpful assistant with access to data via the file_search tool.

        When asked questions, use available tools to find the answer. Follow these rules:

        1. Use tools immediately without asking for confirmation
        2. Chain tool calls as needed - use results from one call as inputs to the next
        3. Do not narrate your process, explain failures, or describe what you're trying - just do it
        4. Only provide output when you have the final answer
    """,
    tools=[
        {"type": "file_search", "vector_store_ids": [vector_store.id]},
    ],
    stream=False,  # wait for the complete response rather than streaming it
    input="Who wrote the paper on Row and Column Access Control Support in IBM DB2 for i?",
)

# Print the text of the model's response.
print(resp.output_text)

### Congratulations! You've successfully inserted your PDF documents via a Kubeflow Pipeline, and queried your RAG application using Llama Stack! 🎉🥳