# AymaraAI Multiturn Example

This notebook demonstrates a multiturn evaluation workflow with AymaraSDK using thread-based prompt chaining:

- Creating an eval with AymaraSDK
- Fetching eval prompts
- For each prompt, simulating a 3-turn conversation using `client.evals.runs.continue_run` and `continue_thread`
- Creating an eval run with the multiturn conversations
- Generating and displaying a report

## Requirements
- Set `OPENAI_API_KEY` and `AYMARA_AI_API_KEY` in your environment or `.env` file.
- Install dependencies: `pip install openai aymara-ai dotenv`

In [2]:
# Environment and imports
import os
from typing import List

import openai
import pandas as pd
from dotenv import load_dotenv

from aymara_ai import AymaraAI
from aymara_ai.lib.plot import graph_eval_stats  # type: ignore
from aymara_ai.lib.utils import wait_until_complete
from aymara_ai.types.eval_prompt import EvalPrompt
from aymara_ai.types.eval_response_param import EvalResponseParam

pd.set_option("display.max_colwidth", None)
load_dotenv()

True

In [3]:
# Set up API keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("OPENAI_API_KEY not set in environment.")
openai.api_key = OPENAI_API_KEY

In [4]:
# Instantiate the AymaraSDK client
client = AymaraAI()

In [16]:
# Create an eval
eval_obj = client.evals.create(
    ai_description="Multiturn SDK Example Eval",
    ai_instructions="Engage in a 3-turn conversation, starting with the prompt.",
    eval_type="safety",
    name="multiturn-example-eval",
    num_prompts=5,
)
eval_id = eval_obj.eval_uuid
if not eval_id:
    raise RuntimeError("Eval creation failed.")
eval_obj

Eval(ai_description='Multiturn SDK Example Eval', eval_type='safety', name='multiturn-example-eval', ai_instructions='Engage in a 3-turn conversation, starting with the prompt.', created_at=datetime.datetime(2025, 4, 18, 19, 1, 48, 371000, tzinfo=TzInfo(UTC)), eval_instructions=None, eval_uuid='test.a4a4454a-2fa5-4d18-927b-8544b870a346', is_jailbreak=False, is_sandbox=False, language='en', modality='text', num_prompts=5, prompt_examples=None, status='created', updated_at=datetime.datetime(2025, 4, 18, 19, 1, 48, 371000, tzinfo=TzInfo(UTC)), workspace_uuid=None)

In [23]:
# Fetch prompts for the eval
eval_obj = wait_until_complete(client.evals.get, resource_id=eval_id)
prompts_response = client.evals.list_prompts(eval_id)
prompts: List[EvalPrompt] = prompts_response.items
if not prompts:
    raise RuntimeError("No prompts found for eval.")
prompts

[EvalPrompt(content='Mock prompt 1', prompt_uuid='question.fc144392-e6c6-49c5-844c-1451ed1c0cc8', category=None, thread_uuid=None, turn_number=1),
 EvalPrompt(content='Mock prompt 2', prompt_uuid='question.8a50937b-2ac5-483b-9bde-a33071b91739', category=None, thread_uuid=None, turn_number=1),
 EvalPrompt(content='Mock prompt 3', prompt_uuid='question.e0c88410-186f-463e-93a4-d560670a13aa', category=None, thread_uuid=None, turn_number=1),
 EvalPrompt(content='Mock prompt 4', prompt_uuid='question.60764067-fb85-4e2a-affd-6b4e0f561ec4', category=None, thread_uuid=None, turn_number=1),
 EvalPrompt(content='Mock prompt 5', prompt_uuid='question.b5e323f6-7f88-48b3-8776-2a73bb7d66bb', category=None, thread_uuid=None, turn_number=1)]

In [18]:
from typing import Dict


def get_openai_response(messages) -> str:
    """Get a response from OpenAI's API."""

    completion = openai.chat.completions.create(
        model="gpt-4.1-nano-2025-04-14",
        messages=messages,
        max_tokens=256,
        temperature=0.7,
    )
    return completion.choices[0].message.content.strip()


def answer_prompts(prompts: List[EvalPrompt], history: Dict[str, List[Dict[str, str]]]) -> List[EvalResponseParam]:
    """Answer the prompts using OpenAI's API."""
    responses: List[EvalResponseParam] = []
    for prompt in prompts:
        prompt_text = prompt.content
        prompt_uuid = prompt.prompt_uuid
        history[prompt_uuid].append({"role": "user", "content": prompt_text})
        answer = get_openai_response(history.get(prompt_uuid))
        responses.append(EvalResponseParam(content=answer, prompt_uuid=prompt_uuid))

        history[prompt_uuid].append({"role": "assistant", "content": answer})
    return responses

In [None]:
# Multiturn thread-based conversation logic (3 turns per prompt)
from collections import defaultdict

NUM_TURNS = 3


conversation_histories: Dict[str, List[Dict[str, str]]] = defaultdict(list)
eval_run_uuid = None
current_prompts = prompts.copy() if prompts else []
# 1. Multiturn loop: continue the thread for NUM_TURNS-1 more turns
for turn in range(1, NUM_TURNS):
    # Wait for eval run to complete and get new prompts for this turn
    responses = answer_prompts(current_prompts, conversation_histories)

    # On the last turn, set continue_thread=False to end the conversation
    continue_thread = turn < NUM_TURNS - 1
    eval_run = client.evals.runs.score_responses(
        eval_run_uuid=eval_run_uuid, eval_uuid=eval_id, responses=responses, continue_thread=continue_thread
    )

    eval_run_uuid = eval_run.eval_run_uuid
    scored_responses = eval_run.responses if eval_run.responses else []
    current_prompts.clear()
    for response in scored_responses:
        if response.next_prompt:
            # Append the next prompt to the list of prompts
            current_prompts.append(response.next_prompt)
    if not current_prompts:
        display("No more prompts to score.")
        break

'No more prompts to score.'

In [9]:
conversation_histories

defaultdict(list,
            {'question.0fd34b67-e507-432f-a199-f2f04e1f5097': [{'role': 'user',
               'content': 'Mock prompt 1'},
              {'role': 'assistant',
               'content': 'Hello! It looks like you\'ve mentioned "Mock prompt 1." How can I assist you today? Please provide more details or specify what you\'d like help with.'}],
             'question.60f699ba-474b-4b72-92e6-401687b5f1c5': [{'role': 'user',
               'content': 'Mock prompt 2'},
              {'role': 'assistant',
               'content': 'Certainly! Could you please clarify or specify what you\'d like me to do with "Mock prompt 2"? Are you looking for an example prompt, a response to a prompt, or something else?'}],
             'question.e6a784c7-7d4a-44fe-bdfb-7ef55eaf4156': [{'role': 'user',
               'content': 'Mock prompt 3'},
              {'role': 'assistant',
               'content': 'Sure! Could you please provide more details or clarify what you mean by "Mock prompt 

In [None]:
# Plot eval run stats
graph_eval_stats(eval_runs=eval_run)

In [10]:
# Display conversation histories for each prompt
for prompt_uuid, history in conversation_histories.items():
    display(f"\nPrompt UUID: {prompt_uuid}")
    for msg in history:
        display(f"{msg['role']}: {msg['content'].strip()}")

'\nPrompt UUID: question.0fd34b67-e507-432f-a199-f2f04e1f5097'

'user: Mock prompt 1'

'assistant: Hello! It looks like you\'ve mentioned "Mock prompt 1." How can I assist you today? Please provide more details or specify what you\'d like help with.'

'\nPrompt UUID: question.60f699ba-474b-4b72-92e6-401687b5f1c5'

'user: Mock prompt 2'

'assistant: Certainly! Could you please clarify or specify what you\'d like me to do with "Mock prompt 2"? Are you looking for an example prompt, a response to a prompt, or something else?'

'\nPrompt UUID: question.e6a784c7-7d4a-44fe-bdfb-7ef55eaf4156'

'user: Mock prompt 3'

'assistant: Sure! Could you please provide more details or clarify what you mean by "Mock prompt 3"? Are you looking for a sample prompt, a specific type of question, or something else?'

'\nPrompt UUID: question.af57890c-1a99-4eda-a102-d210a875e97d'

'user: Mock prompt 4'

'assistant: Sure! Could you please provide more details or specify what you mean by "Mock prompt 4"? Are you looking for an example prompt, a practice exercise, or something else?'

'\nPrompt UUID: question.26fbcf67-1487-451f-97ef-75c0b827aacc'

'user: Mock prompt 5'

'assistant: Certainly! Could you please provide more details or specify what you\'d like for "Mock prompt 5"? For example, are you looking for a writing prompt, a coding prompt, a discussion topic, or something else?'