In [1]:
import docs
import search_tools

In [2]:
from dataclasses import dataclass
from pydantic import BaseModel

@dataclass
class AgentConfig:
    """Tunable settings for the documentation search agent.

    ``chunk_size``, ``chunk_step`` and ``top_k`` are passed, in that order, to
    ``search_tools.prepare_search_tools``; ``model`` is the model name handed
    to the search ``Agent``.
    """
    # NOTE(review): the unit of chunk_size / chunk_step (characters vs. tokens)
    # is not visible in this notebook — confirm against search_tools.
    chunk_size: int = 2000
    chunk_step: int = 1000
    # presumably the number of hits returned per search — TODO confirm
    top_k: int = 5

    model: str = "gpt-4o-mini"

In [3]:
# System prompt for the search agent. It describes the two tools (search and
# read_file), the 3-6 search budget, the code-verification rules and the
# expected output format.
# NOTE(review): the prompt uses two different fallback phrases ("Unable to verify
# with read_file." and "Unable to verify this code.") — confirm which is intended.
# NOTE(review): the recorded trace for 'llm as a judge' shows queries about legal
# systems and courtrooms, so the relevance rule below may need strengthening.
search_instructions = """
You are a search assistant for the Evidently documentation.

Evidently is an open-source Python library and cloud platform for evaluating, testing, and monitoring data and AI systems.
It provides evaluation metrics, testing APIs, and visual reports for model and data quality.

Your task is to help users find accurate, relevant information about Evidently's features, usage, and integrations.

You have access to the following tools:

- search — Use this to explore the topic and retrieve relevant snippets or documentation.
- read_file — Use this to retrieve or verify the complete content of a file when:
    * A code snippet is incomplete, truncated, or missing definitions.
    * You need to check that all variables, imports, and functions referenced in code are defined.
    * You must ensure the code example is syntactically correct and runnable.

If `read_file` cannot be used or the file content is unavailable, clearly state:
> "Unable to verify with read_file."

Search Strategy

- For every user query:
    * Perform at least 3 and at most 6 distinct searches to gather enough context.
    * Each search must use a different phrasing or keyword variation of the user's question.
    * Make sure that the search requests are relevant to evidently, testing, evaluating and monitoring AI systems.
    * No need to include "Evidently" in the search text.

- After collecting search results:
    1. Synthesize the information into a concise, accurate answer.
    2. If your answer includes code, always validate it with `read_file` before finalizing.
    3. If a code snippet or reference is incomplete, explicitly mention it.

Important:
- The 6-search limit applies only to `search` calls.
- You may call `read_file` at any time, even after the search limit is reached.
- `read_file` calls are verification steps and do not count toward the 6-search limit.

Code Verification and Completeness Rules

- All variables, functions, and imports in your final code examples must be defined or imported.
- Never shorten, simplify, or truncate code examples. Always present the full, verified version.
- When something is missing or undefined in the search results:
    * Call `read_file` with the likely filename to retrieve the complete file content.
    * Replace any partial code with the full verified version.
- If the file is not available or cannot be verified:
    * Include a clear note: "Unable to verify this code."
- Do not reformat, rename variables, or omit lines from the verified code.

Output Format

- Write your answer clearly and accurately.
- Include a "References" section listing the search queries or file names you used.
- If you couldn't find a complete answer after 6 searches, set found_answer = False.
"""


In [4]:
# One cited source. SearchResultHandler renders it as a Markdown link
# of the form "[title](filename)".
class Reference(BaseModel):
    title: str
    filename: str

# One section of the generated article: a heading, its body text and the
# references that belong to this section.
class Section(BaseModel):
    heading: str
    content: str
    references: list[Reference]

# Structured output of the search agent (used as its output_type).
# found_answer: the prompt tells the model to set this to False when no
# complete answer was found after 6 searches.
class SearchResultArticle(BaseModel):
    found_answer: bool
    title: str
    sections: list[Section]
    references: list[Reference]


In [5]:
# Build the search tooling from the settings held in the default AgentConfig.
config = AgentConfig()
tools = search_tools.prepare_search_tools(config.chunk_size, config.chunk_step, config.top_k)

In [6]:
from dotenv import load_dotenv
# Load variables from a local .env file into the environment; the recorded
# output `True` is the call's return value.
# presumably this supplies the API key the agents need — TODO confirm
load_dotenv()


True

In [14]:
from agents import Agent, function_tool

In [16]:
from agents import Agent, function_tool

# Wrap the prepared search helpers so the agent can call them as tools.
agent_tools = [function_tool(tool_fn) for tool_fn in (tools.search, tools.read_file)]

# Documentation search agent; its final answer must be a SearchResultArticle.
search_agent = Agent(
    name='search',
    instructions=search_instructions,
    model=config.model,
    tools=agent_tools,
    output_type=SearchResultArticle,
)


In [17]:
from agents import Runner

# NOTE(review): `input` shadows the Python builtin of the same name. The name is
# kept because the streaming cell that follows reads this same variable.
input = 'llm as a judge'
# Run the search agent once (non-streamed) on the sample query.
result = await Runner.run(search_agent, input=input)

In [19]:
from openai.types.responses import ResponseTextDeltaEvent

result = Runner.run_streamed(
    search_agent,
    input=input,
)

async for event in result.stream_events():
    if event.type == "run_item_stream_event":
        if event.item.type == "tool_call_item":
            tool_call = event.item.raw_item
            f_name = tool_call.name
            args = tool_call.arguments
            print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
    
    if event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
        print(event.data.delta, end='', flush=True)

TOOL CALL (search): search({"query":"using AI or LLMs as judges in legal systems"})
TOOL CALL (search): search({"query":"AI judges ethical implications"})
TOOL CALL (search): search({"query":"legal decisions by artificial intelligence"})
TOOL CALL (search): search({"query":"AI in the courtroom"})
TOOL CALL (search): search({"query":"effectiveness of AI in judicial roles"})
TOOL CALL (search): read_file({"filename":"examples/LLM_judge.mdx"})
TOOL CALL (search): read_file({"filename":"metrics/customize_llm_judge.mdx"})
TOOL CALL (search): read_file({"filename":"quickstart_llm.mdx"})
{"found_answer":true,"title":"Using LLMs as Judges","sections":[{"heading":"Overview","content":"Using Language Models (LLMs) as judges in various evaluations, such as for legal contexts or compliance checks, can help in assessing and interpreting complex data. LLMs can be employed to evaluate outputs based on predefined criteria, comparing new responses against reference inputs (reference-based evaluation) o

In [27]:
from agents.exceptions import MaxTurnsExceeded

# NOTE(review): JSONParserHandler (and StreamingJSONParser, used by run_stream)
# is not imported in any visible cell, so this fails with NameError on a fresh
# kernel. Presumably both come from the `jaxn` package — confirm and add the
# import to the imports cell.
class SearchResultHandler(JSONParserHandler):
    """Render a streamed ``SearchResultArticle`` JSON document as Markdown on stdout.

    The callbacks print the article title, section headings, section content
    (chunk by chunk, as it arrives) and the reference lists.
    """

    def on_field_start(self, path: str, field_name: str):
        """Print a "References" heading when a ``references`` field begins.

        The heading level is the number of "/" separators in ``path`` plus two,
        so more deeply nested reference lists get a smaller heading.
        """
        if field_name == "references":
            level = path.count("/") + 2
            print(f"\n{'#' * level} References\n")

    def on_field_end(self, path, field_name, value, parsed_value=None):
        """Print fields that are rendered only once their value is complete."""
        if field_name == "title" and path == "":
            # Reference items also have a `title` field; only a title with an
            # empty path (presumably the document root) is the article title.
            print(f"# {value}")

        elif field_name == "heading":
            print(f"\n\n## {value}\n")
        elif field_name == "content":
            # The content itself was already printed chunk by chunk; just
            # terminate it with blank lines.
            print("\n")

    def on_value_chunk(self, path, field_name, chunk):
        """Print section content incrementally as chunks arrive."""
        if field_name == "content":
            print(chunk, end="", flush=True)

    def on_array_item_end(self, path, field_name, item=None):
        """Print one reference as a Markdown list item ``- [title](filename)``.

        Items that are missing (``None``, the declared default) or are not
        dicts are ignored instead of raising ``AttributeError`` on ``.get``.
        """
        if field_name != "references" or not isinstance(item, dict):
            return
        title = item.get("title", "")
        filename = item.get("filename", "")
        print(f"- [{title}]({filename})")

# NOTE(review): this instance is not referenced by the visible run_stream calls,
# which each construct their own SearchResultHandler().
handler = SearchResultHandler()

In [26]:
async def run_stream(agent, input, handler, max_turns=3, finish_retries=2):
    """Stream an agent run, printing tool calls and parsing the streamed JSON answer.

    Args:
        agent: Agent passed to ``Runner.run_streamed``.
        input: Input for the run; a query string or a list of messages.
        handler: Handler given to ``StreamingJSONParser``; it receives the
            parser callbacks while the structured answer streams in.
        max_turns: Turn budget forwarded to ``Runner.run_streamed``.
        finish_retries: How many times to re-run with a "finish the writeup"
            message once the turn budget is exhausted. The retry uses
            ``max_turns=1``; without a bound, a model that keeps calling tools
            would trigger another retry every time.

    Returns:
        The streaming result object of the run that completed.

    Raises:
        MaxTurnsExceeded: If the turn budget is still exceeded after
            ``finish_retries`` forced-finish attempts.
    """
    try:
        result = Runner.run_streamed(
            agent,
            input=input,
            max_turns=max_turns
        )

        parser = StreamingJSONParser(handler)

        async for event in result.stream_events():
            if event.type == "run_item_stream_event":
                if event.item.type == "tool_call_item":
                    tool_call = event.item.raw_item
                    f_name = tool_call.name
                    args = tool_call.arguments
                    print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
            elif event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
                # Text deltas carry the JSON answer; the parser drives the handler.
                parser.parse_incremental(event.data.delta)

        return result
    except MaxTurnsExceeded:
        print('too many turns')
        if finish_retries <= 0:
            # Forced-finish attempts are used up: surface the error to the caller.
            raise
        finish_prompt = 'System message: The number of searches has exceeded the limit. Proceed to finishing the writeup'
        finish_message = [{'role': 'user', 'content': finish_prompt}]
        # Continue from the transcript so far, asking the model to wrap up.
        messages = result.to_input_list() + finish_message
        final_result = await run_stream(
            agent,
            input=messages,
            handler=handler,
            max_turns=1,
            finish_retries=finish_retries - 1,
        )
        return final_result


In [28]:
result = await run_stream(search_agent, 'llm as a judge', SearchResultHandler())

TOOL CALL (search): search({"query":"using LLM as judge in evaluations"})
TOOL CALL (search): search({"query":"LLM for decision-making systems"})
TOOL CALL (search): search({"query":"role of LLM in bias evaluation"})
TOOL CALL (search): search({"query":"evaluating AI decisions using LLMs"})
TOOL CALL (search): search({"query":"how LLMs can assist in legal judgments"})
TOOL CALL (search): search({"query":"LLMs in compliance and ethical decision-making"})
TOOL CALL (search): read_file({"filename":"examples/LLM_judge.mdx"})
TOOL CALL (search): read_file({"filename":"examples/LLM_jury.mdx"})
too many turns
# Using LLMs as Judges in Evaluations


## Overview of LLMs as Judges

Large Language Models (LLMs) can serve as evaluators in various contexts, helping to assess responses or outputs based on defined criteria. Their use can be classified into two primary types:

1. **Reference-based Evaluation**: Newly generated responses are compared against a reference set of responses (the "ground tr

In [29]:
## Creating a guardrail agent

In [30]:
# System prompt for the guardrail agent: decide whether a user query is about
# Evidently and its documentation, answering with a short reasoning and a fail flag.
guardrail_instructions = """
Make sure the user queries are related to the Evidently framework and its documentation.

Evidently is an open-source Python library and cloud platform for evaluating, testing, and monitoring data,
AI and LLM systems. It provides evaluation metrics, testing APIs, and visual reports for model and data quality.

Examples of relevant topics:

- Create a custom LLM judge
- Customize data drift detection
- llm evaluations

Output 'fail=True' if the query is not about Evidently documentation or related topics.
Keep reasoning short (up to 10 words)
""".strip()

In [31]:
# Structured verdict of the guardrail agent.
# reasoning: short justification (the prompt asks for up to 10 words).
# fail: True when the query is not about Evidently documentation or related topics.
class EvidentlyDocsGuardrail(BaseModel):
    reasoning: str
    fail: bool

In [32]:
# Agent that classifies a query as on-topic or off-topic; it has no tools and
# answers with an EvidentlyDocsGuardrail verdict.
guardrail_agent = Agent(
    name="guardrail",
    model='gpt-4o-mini',
    output_type=EvidentlyDocsGuardrail,
    instructions=guardrail_instructions,
)

In [33]:
# Sanity check: an on-topic query should pass the guardrail agent.
result = await Runner.run(guardrail_agent, input='llm as a judge')

In [34]:
# Display the structured verdict (reasoning + fail flag) for the on-topic query.
result.final_output

EvidentlyDocsGuardrail(reasoning='Relevant to LLM evaluations in Evidently.', fail=False)

In [42]:
result = await Runner.run(guardrail_agent, 'whats sqrt(pi)')

In [43]:
# Display the structured verdict (reasoning + fail flag) for the off-topic query.
result.final_output

EvidentlyDocsGuardrail(reasoning='Not related to Evidently documentation.', fail=True)

In [35]:
# Implementing the Input Guardrail Function

from agents import GuardrailFunctionOutput, input_guardrail


In [None]:
@input_guardrail
async def guardrail(ctx, agent, messages):
    """Input guardrail: let the guardrail agent judge ``messages``.

    The tripwire fires when the agent's verdict has ``fail`` set; the verdict's
    short reasoning is attached as ``output_info``.
    """
    verdict = (await Runner.run(guardrail_agent, input=messages)).final_output
    return GuardrailFunctionOutput(
        tripwire_triggered=verdict.fail,
        output_info=verdict.reasoning,
    )

In [36]:
@input_guardrail
async def guardrail(ctx, agent, messages):
    """Input guardrail that only screens the opening message of a conversation.

    Args:
        ctx: Run context supplied by the SDK (unused).
        agent: Agent the guardrail is attached to (unused).
        messages: The run input; a string or a list of messages.

    Returns:
        GuardrailFunctionOutput whose tripwire is never triggered for a
        multi-message input, and otherwise mirrors the guardrail agent's
        ``fail`` verdict with its reasoning as ``output_info``.
    """
    # A list with more than one item is a continued conversation (for example
    # the transcript re-submitted after the turn limit); do not re-screen it.
    if isinstance(messages, list) and len(messages) > 1:
        return GuardrailFunctionOutput(
            output_info='no need to trigger for continued conversations',
            tripwire_triggered=False
        )
    result = await Runner.run(guardrail_agent, input=messages)
    decision = result.final_output
    return GuardrailFunctionOutput(
        output_info=decision.reasoning,
        tripwire_triggered=decision.fail
    )

In [None]:
# There are two kinds of guardrails: input guardrails and output guardrails.

In [37]:
@input_guardrail
async def documentation_guardrail(ctx, agent, input):
    """Input guardrail that trips when the query is not about Evidently docs.

    Args:
        ctx: Run context supplied by the SDK (unused).
        agent: Agent the guardrail is attached to (unused).
        input: The run input; a string or a list of messages.

    Returns:
        GuardrailFunctionOutput carrying the guardrail agent's reasoning and
        its ``fail`` verdict as the tripwire flag. Multi-message inputs are
        passed through without screening.
    """
    # run_stream re-submits the whole transcript plus a synthetic "System
    # message" turn after the turn limit. Re-classifying that transcript is
    # wasteful and may trip on the synthetic turn, so continued conversations
    # are skipped, matching the sibling `guardrail` function.
    if isinstance(input, list) and len(input) > 1:
        return GuardrailFunctionOutput(
            output_info='no need to trigger for continued conversations',
            tripwire_triggered=False,
        )

    result = await Runner.run(guardrail_agent, input)
    final_output = result.final_output

    return GuardrailFunctionOutput(
        output_info=final_output.reasoning,
        tripwire_triggered=final_output.fail,
    )

In [38]:
# Rebuild the search agent, now screened by the documentation input guardrail.
search_agent = Agent(
    name='search',
    model=config.model,
    instructions=search_instructions,
    tools=agent_tools,
    output_type=SearchResultArticle,
    input_guardrails=[documentation_guardrail],
)

In [44]:
from agents.exceptions import InputGuardrailTripwireTriggered

try:
    result = await Runner.run(search_agent, 'whats sqrt(pi)')
except InputGuardrailTripwireTriggered as e:
    output = e.guardrail_result.output
    if output.tripwire_triggered: 
        print(output.output_info)

Not related to Evidently documentation.


In [45]:
async def run_stream(agent, input, handler, max_turns=3, finish_retries=2):
    """Stream an agent run with turn-limit recovery and guardrail reporting.

    Args:
        agent: Agent passed to ``Runner.run_streamed``.
        input: Input for the run; a query string or a list of messages.
        handler: Handler given to ``StreamingJSONParser``; it receives the
            parser callbacks while the structured answer streams in.
        max_turns: Turn budget forwarded to ``Runner.run_streamed``.
        finish_retries: How many times to re-run with a "finish the writeup"
            message once the turn budget is exhausted. The retry uses
            ``max_turns=1``; without a bound, a model that keeps calling tools
            would trigger another retry every time.

    Returns:
        The streaming result object of the run that completed, or — when an
        input guardrail tripped — the ``run_data`` attached to the exception.

    Raises:
        MaxTurnsExceeded: If the turn budget is still exceeded after
            ``finish_retries`` forced-finish attempts.
    """
    try:
        result = Runner.run_streamed(
            agent,
            input=input,
            max_turns=max_turns
        )

        parser = StreamingJSONParser(handler)

        async for event in result.stream_events():
            if event.type == "run_item_stream_event":
                if event.item.type == "tool_call_item":
                    tool_call = event.item.raw_item
                    f_name = tool_call.name
                    args = tool_call.arguments
                    print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
            elif event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
                # Text deltas carry the JSON answer; the parser drives the handler.
                parser.parse_incremental(event.data.delta)

        return result
    except MaxTurnsExceeded:
        print('too many turns')
        if finish_retries <= 0:
            # Forced-finish attempts are used up: surface the error to the caller.
            raise
        finish_prompt = 'System message: The number of searches has exceeded the limit. Proceed to finishing the writeup'
        finish_message = [{'role': 'user', 'content': finish_prompt}]
        # Continue from the transcript so far, asking the model to wrap up.
        messages = result.to_input_list() + finish_message
        final_result = await run_stream(
            agent,
            input=messages,
            handler=handler,
            max_turns=1,
            finish_retries=finish_retries - 1,
        )
        return final_result
    except InputGuardrailTripwireTriggered as e:
        run_data = e.run_data
        # Report the reasoning of every input guardrail that actually tripped.
        for guardrail_result in run_data.input_guardrail_results:
            guardrail_output = guardrail_result.output
            if guardrail_output.tripwire_triggered:
                print(guardrail_output.output_info)
        return run_data


In [47]:
result = await run_stream(search_agent, 'how much is sqrt(pi)', SearchResultHandler())

TOOL CALL (search): search({"query":"sqrt(pi)"})
Query not related to Evidently documentation.
