## 1. Install the llama stack client

In [None]:
# Install the pinned Llama Stack package into the notebook kernel's environment.
# NOTE(review): the cells below import `llama_stack_client`; presumably it is
# pulled in as a dependency of `llama_stack` — confirm.
%pip install llama_stack==0.3.0

## 2. List available models

In [None]:
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack service on port 8321.
client = LlamaStackClient(base_url="http://redbank-lsd-service:8321")
# Last expression of the cell, so the notebook displays the returned model list.
client.models.list()

## 3. Set LLM and Embedding model

In [None]:
from llama_stack_client import LlamaStackClient

# Re-create the client so this cell can be run on its own.
client = LlamaStackClient(base_url="http://redbank-lsd-service:8321")

models = client.models.list()

# LLM used by the later cells (read as a global by `run_query`).
model_id = "vllm-inference/qwen2-5"

# Pick the first registered embedding model. Fail with a clear message instead
# of a bare StopIteration when the server has none registered.
em = next((m for m in models if m.model_type == "embedding"), None)
if em is None:
    raise RuntimeError(
        "No model with model_type == 'embedding' is registered on the "
        "Llama Stack server."
    )
embedding_model_id = em.identifier
embedding_dimension = em.metadata["embedding_dimension"]

print(model_id)
print(embedding_model_id)

## 4. Test the LLM with RAG and MCP tools

In [None]:
def run_query(prompt: str) -> str:
    """Stream an LLM answer to ``prompt`` with the MCP and RAG tools attached.

    Looks up the vector store named "redbank-kb-vector-store", then creates a
    streaming response with two tools: the MCP server and a ``file_search``
    tool over that vector store. Text deltas are printed as they arrive.

    Reads the module-level ``model_id`` defined in an earlier cell.

    Args:
        prompt: The question passed as the response input.

    Returns:
        The complete answer text, assembled from the streamed text deltas.

    Raises:
        RuntimeError: If the expected vector store does not exist.
    """
    from llama_stack_client import LlamaStackClient

    vector_store_name = "redbank-kb-vector-store"
    client = LlamaStackClient(base_url="http://redbank-lsd-service:8321")

    vector_stores = client.vector_stores.list()
    vector_store = next(
        (s for s in vector_stores.data if s.name == vector_store_name), None
    )
    # Fail early with a clear message rather than an AttributeError on `.id`.
    if vector_store is None:
        raise RuntimeError(
            f"Vector store '{vector_store_name}' was not found on the "
            "Llama Stack server."
        )
    print(f"Vector store ID: {vector_store.id}")

    resp = client.responses.create(
        model=model_id,
        instructions="""
            You are a helpful assistant with access to financial data through MCP tools.

            IMPORTANT: All transaction data is from 2025. Use dates like '2025-01-01'.

            When asked questions, use available tools to find the answer. Follow these rules:

            1. Use tools immediately without asking for confirmation
            2. If you need additional information, search for it using whatever details are provided
            3. Chain tool calls as needed - use results from one call as inputs to the next
            4. If one approach doesn't work, try alternative methods silently
            5. Do not narrate your process, explain failures, or describe what you're trying - just do it
            6. Only provide output when you have the final answer
            7. If you truly cannot find the information after multiple attempts, simply state what you were unable to find
            8. Use the rag tool to answer all questions related to knowledge base or FAQs when the question is not about transactions or user-specific data.

            Just execute tool calls until you have an answer, then provide it.
        """,
        tools=[
            {
                "type": "mcp",
                "server_label": "dmcp",
                "server_description": "MCP Server.",
                "server_url": "http://redbank-mcp-server:8000/mcp",
                # Tool calls run without an approval round-trip.
                "require_approval": "never",
            },
            {"type": "file_search", "vector_store_ids": [vector_store.id]},
        ],
        input=prompt,
        stream=True,
    )

    # Echo each text delta as it arrives and keep it for the return value.
    chunks = []
    for event in resp:
        if event.type == "response.output_text.delta":
            print(event.delta, end="", flush=True)
            chunks.append(event.delta)
        elif event.type == "response.completed":
            print("\n\n--- Stream complete ---")

    return "".join(chunks)

In [None]:
# Example query about transaction and user data, intended to exercise the MCP tools.
run_query("How much was David's client lunch, and what is his email?")

In [None]:
# Example knowledge-base question, intended to exercise the RAG (file_search) tool.
run_query("Who founded Red Bank Financial?")

## 5. Test your Speech-To-Text model (Whisper)

In [None]:
def whisper_transcribe(audio_file_path: str) -> str:
    """Transcribe an audio file with the Whisper speech-to-text service.

    Sends the file to the OpenAI-compatible transcription endpoint, prints the
    transcription and returns it.

    Args:
        audio_file_path: Path of the audio file to upload.

    Returns:
        The transcribed text.
    """
    from openai import OpenAI

    whisper_url = "http://whisper-large-v3-turbo-quantized-predictor:80/v1"

    # The client requires an api_key argument; a placeholder value is passed.
    client = OpenAI(base_url=whisper_url, api_key="fake")

    with open(audio_file_path, "rb") as audio_file:
        transcript = client.audio.transcriptions.create(
            model="whisper-large-v3-turbo-quantized", file=audio_file, language="en"
        )

    # U+1F4DD (memo emoji) written as an escape so it survives re-encoding.
    print(f"\U0001F4DD Transcription: {transcript.text}")
    return transcript.text

In [None]:
# Transcribe a sample recording. The path is relative, so the file must be in
# the notebook's working directory.
whisper_transcribe("who-founded-red-bank-financial.mp3")

## 6. Test your Text-To-Speech model (Kokoro)

In [None]:
def kokoro_speak(input_text: str):
    """Synthesize ``input_text`` with the Kokoro text-to-speech service.

    Streams the generated audio into ``response.wav`` in the current working
    directory, then shows an inline audio player in the notebook.

    Args:
        input_text: The text to convert to speech.
    """
    import httpx
    import openai
    from IPython.display import Audio, display

    output_path = "response.wav"

    # TLS verification is disabled; this has no effect on the plain-http URL
    # below. The context manager closes the connection pool when done.
    with httpx.Client(verify=False, timeout=120) as unverified_client:
        client = openai.OpenAI(
            # NOTE(review): `<namespace>` looks like a placeholder that must be
            # replaced with the namespace the Kokoro service runs in — confirm.
            base_url="http://kokoro-fastapi.<namespace>.svc.cluster.local:8880/v1",
            api_key="not-needed",
            http_client=unverified_client,
        )

        # Stream Kokoro audio output directly to a WAV file.
        # The OpenAI SDK names the output-format parameter `response_format`.
        with client.audio.speech.with_streaming_response.create(
            model="kokoro",
            voice="af_bella",
            input=input_text,
            response_format="wav",
        ) as response:
            response.stream_to_file(output_path)

    display(Audio(output_path))

    # U+2705 (check mark emoji) written as an escape so it survives re-encoding.
    print("\u2705 Audio saved to response.wav")

In [None]:
# Synthesize a sample sentence; kokoro_speak writes response.wav and displays a player.
kokoro_speak("Red Bank Financial was founded by Red Hatters")