## 1. Create Llama Stack client, list available models and vector databases

In [None]:
%pip install -r requirements.txt

In [None]:
from llama_stack_client import LlamaStackClient

# Connect to the Llama Stack server used throughout this notebook.
client = LlamaStackClient(base_url="http://lsd-milvus-service:8321")

models = client.models.list()
print(f"Models information: {models}\n")

# Pick the first registered model whose custom metadata marks it as an LLM.
# next() is given a default of None so that a server without any LLM produces
# an actionable error below instead of a bare StopIteration.
inference_llm = next(
    (
        m
        for m in models
        if m.custom_metadata and m.custom_metadata.get("model_type") == "llm"
    ),
    None,
)
if inference_llm is None:
    raise RuntimeError(
        "No model with custom_metadata['model_type'] == 'llm' is registered "
        "on the Llama Stack server."
    )

# Print the identifier itself (the value later passed as `model=`), not the
# whole model object, so the output matches its label.
print(f"Identifier for Inference model in usage: {inference_llm.id}\n")

# Check what vector databases exist
print("=== Available Vector Stores ===")
client.vector_stores.list()

## 2. Prompt the LLM and retrieve relevant context via RAG
Prompt the LLM with questions about the ingested documents and check that it returns accurate answers.

In [None]:
client = LlamaStackClient(base_url="http://lsd-milvus-service:8321")

# Name of the vector store the questions below are answered from.
VECTOR_STORE_NAME = "csv-vector-store"

vector_stores = client.vector_stores.list()
vector_store = next(
    (s for s in vector_stores.data if s.name == VECTOR_STORE_NAME), None
)
# Fail early with a clear message: without this guard a missing store only
# surfaces later as "'NoneType' object has no attribute 'id'".
if vector_store is None:
    available = [s.name for s in vector_stores.data]
    raise RuntimeError(
        f"Vector store '{VECTOR_STORE_NAME}' was not found. "
        f"Available vector stores: {available}"
    )

user_prompts = [
    "What is gender, home country and age of Dulce Abril and Philip Gent?",
    "What is customer id, company, city, country, phone number, email, subscription date, subscribed website of of Sheryl Baxter?",
    "What products were sold according to sample sales data?",
    "What is the economics condition at Ireland in 2025?",  # Dummy question the model will answer with 'I don’t know'
]

# The system instructions and the tool specification are identical for every
# prompt, so they are built once instead of on each loop iteration.
SYSTEM_INSTRUCTIONS = """
            /no_think
            You are a helpful assistant with access to data via the file_search tool.

            When asked questions, use available tools to find the answer. Follow these rules:
            1. Use tools immediately without asking for confirmation
            2. Chain tool calls as needed
            3. Do not narrate your process
            4. Only provide the final answer
            5. If the answer is not found in the context, respond with 'I don’t know'
        """
file_search_tools = [{"type": "file_search", "vector_store_ids": [vector_store.id]}]

# Responses are kept (in prompt order) because later cells read both the
# answer text and the file-search results from them.
responses = []

for prompt in user_prompts:
    resp = client.responses.create(
        model=inference_llm.id,
        instructions=SYSTEM_INSTRUCTIONS,
        tools=file_search_tools,
        stream=False,
        input=prompt,
    )
    responses.append(resp)
    print(f"\nQ: {prompt}")
    print(f"A: {resp.output_text.strip()}")

## 3. Preparation for evaluating RAG models using [RAGAS](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/?h=metrics)

- We will use two key metrics to show the performance of the RAG server:
    1. [Faithfulness](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/faithfulness/) - measures how factually consistent a response is with the retrieved context. It ranges from 0 to 1, with higher scores indicating better consistency.
    2. [Response Relevancy](https://docs.ragas.io/en/stable/concepts/metrics/available_metrics/answer_relevance/) - measures how relevant a response is to the user input. Higher scores indicate better alignment with the user input, while lower scores are given if the response is incomplete or includes redundant information.

- Create an API key in [Groq Cloud](https://console.groq.com/home) and provide it in the next cell.

In [None]:
import os
from getpass import getpass

# Keep a key that is already present in the environment instead of overwriting
# it with a placeholder. Otherwise prompt for it, so the secret is typed at run
# time rather than written into (and saved with) the notebook.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("Enter your GROQ_API_KEY: ")

In [None]:
from typing import List


# Extract retrieved contexts from Responses API output
def extract_retrieved_contexts(response) -> List[str]:
    """
    Collect the text of every file-search result in a Responses API output.

    Args:
        response: Response object from client.responses.create(); its
            ``output`` attribute is iterated.

    Returns:
        The non-empty ``text`` values of the results attached to output items
        whose ``type`` is "file_search_call", in order of appearance.
    """
    return [
        hit.text
        for item in response.output
        # Only file_search_call items are considered.
        if getattr(item, "type", None) == "file_search_call"
        # A missing, None or empty ``results`` contributes nothing.
        for hit in (getattr(item, "results", None) or ())
        # Results without text (or with empty text) are skipped.
        if getattr(hit, "text", None)
    ]

In [None]:
from ragas.dataset_schema import EvaluationDataset

# Reference answers, aligned by position with the first entries of user_prompts.
references = [
    """
Dulce Abril is 32 years old female from USA and Philip Gent is 36 years old man from France.
""",
    """
Sheryl Baxter's customer ID is DD37Cf93aecA6Dc, her company is Rasmussen Group, her city is East Leonard, her country is Chile, her phone numbers are 229.077.5154 and 397.884.0519x718, her email is zunigavanessa@smith.info, her subscription date is 2020-08-24, and her subscribed website is http://www.stephenson.com/.
""",
]

# Constructing a Ragas EvaluationDataset
# zip() stops at the shortest input, so only prompts that have a reference
# answer end up in the dataset.
samples = [
    {
        "user_input": question,
        "response": answer.output_text,
        "reference": expected,
        "retrieved_contexts": extract_retrieved_contexts(answer),
    }
    for question, answer, expected in zip(user_prompts, responses, references)
]

ragas_eval_dataset = EvaluationDataset.from_list(samples)
ragas_eval_dataset.to_pandas()

## 4. Prerequisites for RAG evaluation

In [None]:
from ragas.metrics import (
    Faithfulness,
    ResponseRelevancy,
)
from ragas.dataset_schema import SingleTurnSample
from langchain_groq import ChatGroq
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_huggingface import HuggingFaceEmbeddings

# Groq-hosted chat model, wrapped so Ragas can use it as the evaluator LLM.
llm = ChatGroq(
    model="meta-llama/llama-4-maverick-17b-128e-instruct",
    temperature=0,
)
evaluator_llm = LangchainLLMWrapper(llm)

# Using HuggingFace embeddings as a free alternative
embeddings_model = HuggingFaceEmbeddings(
    model_name="ibm-granite/granite-embedding-125m-english"
)
evaluator_embeddings = LangchainEmbeddingsWrapper(embeddings_model)


# The two evaluated samples, one per prompt.
first_sample = samples[0]
second_sample = samples[1]

# Fields of the first prompt's sample
reference_for_first_prompt = first_sample["reference"]
user_input_for_first_prompt = first_sample["user_input"]
response_for_first_prompt = first_sample["response"]
retrieved_contexts_for_first_prompt = first_sample["retrieved_contexts"]

# Fields of the second prompt's sample
reference_for_second_prompt = second_sample["reference"]
user_input_for_second_prompt = second_sample["user_input"]
response_for_second_prompt = second_sample["response"]
retrieved_contexts_for_second_prompt = second_sample["retrieved_contexts"]

# Each reference split into its whitespace-trimmed lines
reference_list_for_first_prompt = list(
    map(str.strip, reference_for_first_prompt.strip().split("\n"))
)
reference_list_for_second_prompt = list(
    map(str.strip, reference_for_second_prompt.strip().split("\n"))
)

print(
    f"Retrieved contexts for the first prompt: {retrieved_contexts_for_first_prompt}\n"
)
print(
    f"Retrieved contexts for the second prompt: {retrieved_contexts_for_second_prompt}\n"
)

## 5. Evaluate Faithfulness Score for both prompts

In [None]:
first_prompt_turn = SingleTurnSample(
    user_input=user_input_for_first_prompt,
    response=response_for_first_prompt,
    retrieved_contexts=retrieved_contexts_for_first_prompt,
)
faithfulness_scorer = Faithfulness(llm=evaluator_llm)
faithfulness_score_for_first_prompt = await faithfulness_scorer.single_turn_ascore(
    first_prompt_turn
)

second_prompt_turn = SingleTurnSample(
    user_input=user_input_for_second_prompt,
    response=response_for_second_prompt,
    retrieved_contexts=retrieved_contexts_for_second_prompt,
)
faithfulness_score_for_second_prompt = await faithfulness_scorer.single_turn_ascore(
    second_prompt_turn
)

print(
    f"Faithfulness score for prompt '{user_prompts[0]}': {faithfulness_score_for_first_prompt}"
)

print(
    f"Faithfulness score for prompt '{user_prompts[1]}': {faithfulness_score_for_second_prompt}"
)

## 6. Evaluate Response Relevancy for both prompts

In [None]:
first_prompt_turn = SingleTurnSample(
    user_input=user_input_for_first_prompt,
    response=response_for_first_prompt,
    retrieved_contexts=retrieved_contexts_for_first_prompt,
)
response_relevancy_scorer = ResponseRelevancy(
    llm=evaluator_llm, embeddings=evaluator_embeddings
)
response_relevancy_score_for_first_prompt = (
    await response_relevancy_scorer.single_turn_ascore(first_prompt_turn)
)

second_prompt_turn = SingleTurnSample(
    user_input=user_input_for_second_prompt,
    response=response_for_second_prompt,
    retrieved_contexts=retrieved_contexts_for_second_prompt,
)
response_relevancy_score_for_second_prompt = (
    await response_relevancy_scorer.single_turn_ascore(second_prompt_turn)
)

print(
    f"Response Relevancy score for prompt '{user_prompts[0]}': {response_relevancy_score_for_first_prompt}"
)

print(
    f"Response Relevancy score for prompt '{user_prompts[1]}': {response_relevancy_score_for_second_prompt}"
)