In [33]:

import sys
import search_agent
sys.path.insert(0, '..')

In [18]:
import pickle

with open('eval-run-v2-2025-10-29-20-15.bin', 'rb') as f_out:
    rows = pickle.load(f_out)

In [62]:
import docs
github_data = docs.read_github_data()
parsed_data = docs.parse_data(github_data)

file_index = {d['filename']: d['content'] for d in parsed_data}

In [21]:
len(rows)
row = rows[10]
row

{'question': 'install Evidently for LLM',
 'answer': '# Installing Evidently for LLM\n\n## Installation Steps\n\n1. **Create an Account:** If you haven\'t done so, sign up for a free account at [Evidently Cloud](https://app.evidently.cloud/signup). After logging in, create an organization.\n\n2. **Install the Evidently Python Package:** You can install the Evidently library using pip. For general installation:\n   ```bash\n   pip install evidently\n   ```\n   If you are specifically working with large language models (LLMs), use the following command to install additional dependencies:\n   ```bash\n   pip install evidently[llm]\n   ```\n\n3. **Connect to the Cloud Workspace:** Once installed, you need to import the CloudWorkspace from the Evidently library and provide your API token to connect:\n   ```python\n   from evidently.ui.workspace import CloudWorkspace\n   ws = CloudWorkspace(\ntoken="API_KEY",\n   url="https://app.evidently.cloud")\n   ```\n   Alternatively, set the environme

In [10]:
from pydantic_ai import Agent
from dotenv import load_dotenv

load_dotenv()

True

In [47]:
judge_instructions = """
Use this checklist to evaluate the quality of an AI agent’s answer
(<ANSWER>) to a user question (<QUESTION>). We also include the
entire log (<LOG>) for analysis. In <REFERENCE> you will see
the file, from which the user question was generated. 

For each item of the checklist, check if the condition is met. 

Checklist:

- instructions_follow: The agent followed the user’s instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do  
- answer_relevant: The response directly addresses the user’s question  
- answer_match: The ANSWER is similar to the REFERENCE
- answer_clear: The answer is clear and correct  
- answer_citations: The response includes proper citations or sources when required  
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked? 

Output true/false for each check and provide a short explanation for your judgment.
"""

In [48]:
from pydantic import BaseModel

class EvaluationCheck(BaseModel):
    check_name: str
    reasoning: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str


In [49]:
# use different model to evaluate other model
import json

In [50]:
judge = Agent(
    name='judge',
    instructions=judge_instructions,
    model="gpt-5-nano",
    output_type=EvaluationChecklist
)

In [51]:
row

{'question': 'install Evidently for LLM',
 'answer': '# Installing Evidently for LLM\n\n## Installation Steps\n\n1. **Create an Account:** If you haven\'t done so, sign up for a free account at [Evidently Cloud](https://app.evidently.cloud/signup). After logging in, create an organization.\n\n2. **Install the Evidently Python Package:** You can install the Evidently library using pip. For general installation:\n   ```bash\n   pip install evidently\n   ```\n   If you are specifically working with large language models (LLMs), use the following command to install additional dependencies:\n   ```bash\n   pip install evidently[llm]\n   ```\n\n3. **Connect to the Cloud Workspace:** Once installed, you need to import the CloudWorkspace from the Evidently library and provide your API token to connect:\n   ```python\n   from evidently.ui.workspace import CloudWorkspace\n   ws = CloudWorkspace(\ntoken="API_KEY",\n   url="https://app.evidently.cloud")\n   ```\n   Alternatively, set the environme

In [52]:
user_prompt = f"""
<INSTRUCTIONS>{search_agent.search_instructions}</INSTRUCTIONS>
<QUESTION>{row['question']}</QUESTION>
<ANSWER>{row['answer']}</ANSWER>
<LOG>{json.dumps(row['messages'])}</LOG>

"""

In [58]:
output = await judge.run(user_prompt=user_prompt)

In [59]:
judgement = output.output

In [60]:
for check in judgement.checklist:
    print(check)

check_name='instructions_follow' reasoning='The user required performing 3-6 searches before answering. The provided answer contains installation steps but no evidence of performing or including/search results from 3-6 searches. Therefore this criterion is not met.' check_pass=False
check_name='instructions_avoid' reasoning='No explicit instruction was given to avoid certain actions in the user prompt. The assistant did not violate any hidden avoidance rule.' check_pass=True
check_name='answer_relevant' reasoning='The answer addresses installing Evidently for LLM and includes LLMT-specific pip extras and cloud connection details.' check_pass=True
check_name='answer_match' reasoning='The content aligns with the typical Evidently setup docs and the reference topics listed (Cloud, Installation, Why Evidently).' check_pass=True
check_name='answer_clear' reasoning='Steps are clearly presented with code blocks and sequence.' check_pass=True
check_name='answer_citations' reasoning='The answer

In [61]:
import asyncio
from tqdm.auto import tqdm

async def map_progress(seq, f, max_concurrency=6):
    """Asynchronously map async function f over seq with progress bar."""
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run(el):
        async with semaphore:
            return await f(el)

    # create one coroutine per element
    coros = [run(el) for el in seq]

    # turn them into tasks that complete as they finish
    completed = asyncio.as_completed(coros)

    results = []

    for coro in tqdm(completed, total=len(seq)):
        result = await coro
        results.append(result)

    return results

In [65]:
async def run_eval(row):
    original_filename = row['original_question']['filename']
    reference = file_index[original_filename]
    user_prompt = f"""
<INSTRUCTIONS>{search_agent.search_instructions}</INSTRUCTIONS>
<QUESTION>{row['question']}</QUESTION>
<ANSWER>{row['answer']}</ANSWER>
<REFERENCE>{reference}</REFERENCE>
<LOG>{json.dumps(row['messages'])}</LOG>
""".strip()
    output = await judge.run(user_prompt=user_prompt)
    return row, output

In [69]:
results = await map_progress(rows, run_eval, max_concurrency=10)

  0%|          | 0/23 [00:00<?, ?it/s]

In [72]:
from toyaikit.pricing import PricingConfig
pricing = PricingConfig()

def calculate_cost(model, all_results):
    input_tokens = 0
    output_tokens = 0
    
    for _, r in all_results:
        usage = r.usage()
        input_tokens = input_tokens + usage.input_tokens
        output_tokens = output_tokens + usage.output_tokens

    cost = pricing.calculate_cost(model, input_tokens, output_tokens)
    return cost

In [73]:
calculate_cost('gpt-5-nano', results)

CostInfo(input_cost=0.0060718000000000005, output_cost=0.0321832, total_cost=0.038255000000000004)

In [75]:
results[0]

({'question': 'install Evidently for LLM',
  'answer': '# Installing Evidently for LLM\n\n## Installation Steps\n\n1. **Create an Account:** If you haven\'t done so, sign up for a free account at [Evidently Cloud](https://app.evidently.cloud/signup). After logging in, create an organization.\n\n2. **Install the Evidently Python Package:** You can install the Evidently library using pip. For general installation:\n   ```bash\n   pip install evidently\n   ```\n   If you are specifically working with large language models (LLMs), use the following command to install additional dependencies:\n   ```bash\n   pip install evidently[llm]\n   ```\n\n3. **Connect to the Cloud Workspace:** Once installed, you need to import the CloudWorkspace from the Evidently library and provide your API token to connect:\n   ```python\n   from evidently.ui.workspace import CloudWorkspace\n   ws = CloudWorkspace(\ntoken="API_KEY",\n   url="https://app.evidently.cloud")\n   ```\n   Alternatively, set the environ

In [76]:
all_checks = []

for original_row, result in results[1:]:
    checks = result.output.checklist
    checks_formatted = {
        'question': original_row['question']
    }
    for check in checks:
        checks_formatted[check.check_name] = check.check_pass
    all_checks.append(checks_formatted)

In [77]:
all_checks

[{'question': 'ROC AUC in text data drift',
  'instructions_follow': True,
  'instructions_avoid': True,
  'answer_relevant': True,
  'answer_match': True,
  'answer_clear': True,
  'answer_citations': True,
  'completeness': True,
  'tool_call_search': True},
 {'question': 'Quickstart Evidently setup',
  'instructions_follow': False,
  'instructions_avoid': True,
  'answer_relevant': True,
  'answer_match': False,
  'answer_clear': True,
  'answer_citations': True,
  'completeness': False,
  'tool_call_search': False},
 {'question': 'using is_critical parameter in alerts',
  'instructions_follow': True,
  'instructions_avoid': True,
  'answer_relevant': True,
  'answer_match': True,
  'answer_clear': True,
  'answer_citations': True,
  'completeness': True,
  'tool_call_search': False},
 {'question': 'explore evidently dashboard features',
  'instructions_follow': True,
  'instructions_avoid': True,
  'answer_relevant': True,
  'answer_match': True,
  'answer_clear': True,
  'answer_c

In [78]:
import pandas as pd

In [79]:
df_eval = pd.DataFrame(all_checks)

In [82]:
df_eval[df_eval.columns[1:]].mean()

instructions_follow    0.809524
instructions_avoid          1.0
answer_relevant            0.95
answer_match           0.809524
answer_clear           0.952381
answer_citations       0.904762
completeness                0.9
tool_call_search           0.65
answer_relevance            1.0
dtype: object

In [84]:
# enum

In [85]:
from enum import Enum
from pydantic import BaseModel, Field

class CheckName(str, Enum):
    instructions_follow = "instructions_follow"
    instructions_avoid = "instructions_avoid" 
    answer_relevant = "answer_relevant"
    answer_clear = "answer_clear"
    answer_citations = "answer_citations"
    completeness = "completeness"
    tool_call_search = "tool_call_search"

CHECK_DESCRIPTIONS = {
    CheckName.instructions_follow: "The agent followed the user's instructions (in <INSTRUCTIONS>)",
    CheckName.instructions_avoid: "The agent avoided doing things it was told not to do",
    CheckName.answer_relevant: "The response directly addresses the user's question",
    CheckName.answer_clear: "The answer is clear and correct",
    CheckName.answer_citations: "The response includes proper citations or sources when required",
    CheckName.completeness: "The response is complete and covers all key aspects of the request",
    CheckName.tool_call_search: "Is the search tool invoked?"
}

class EvaluationCheck(BaseModel):
    check_name: CheckName = Field(description="The type of evaluation check")
    reasoning: str = Field(description="The reasoning behind the check result")
    check_pass: bool = Field(description="Whether the check passed (True) or failed (False)")
    
class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck] = Field(description="List of all evaluation checks")
    summary: str = Field(description="Evaluation summary")

In [86]:
def generate_checklist_text():
    checklist_items = []
    for check_name in CheckName:
        description = CHECK_DESCRIPTIONS[check_name]
        checklist_items.append(f"- {check_name.value}: {description}")
    return "\n".join(checklist_items)

In [87]:
generate_checklist_text()

"- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)\n- instructions_avoid: The agent avoided doing things it was told not to do\n- answer_relevant: The response directly addresses the user's question\n- answer_clear: The answer is clear and correct\n- answer_citations: The response includes proper citations or sources when required\n- completeness: The response is complete and covers all key aspects of the request\n- tool_call_search: Is the search tool invoked?"

In [88]:
eval_instructions = f"""
Use this checklist to evaluate the quality of an AI agent’s answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met. 

Checklist:

{generate_checklist_text()}

Output true/false for each check and provide a short explanation for your judgment.
"""

In [89]:
print(eval_instructions)


Use this checklist to evaluate the quality of an AI agent’s answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met. 

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do
- answer_relevant: The response directly addresses the user's question
- answer_clear: The answer is clear and correct
- answer_citations: The response includes proper citations or sources when required
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked?

Output true/false for each check and provide a short explanation for your judgment.

