In [1]:
import docs
import search_tools

In [2]:
from dataclasses import dataclass
from pydantic import BaseModel

@dataclass
class AgentConfig:
    """Settings used to build the search tools and the search agent."""

    # Chunking parameters handed to the search-tool factory.
    # presumably window length and stride between chunks — TODO confirm units
    chunk_size: int = 2_000
    chunk_step: int = 1_000

    # presumably the number of hits returned per search — TODO confirm
    top_k: int = 5

    # Name of the model the agent runs on.
    model: str = "gpt-4o-mini"

In [3]:
# System prompt for the documentation search agent.
# It introduces the two tools (`search`, `read_file`), asks for 3 to 6 distinct
# `search` calls per query, requires code in answers to be checked with
# `read_file`, and tells the model to set `found_answer = False` when six
# searches were not enough. Surrounding whitespace is removed by `.strip()`.
# NOTE(review): the two "Unable to verify ..." fallback phrases in the prompt
# are worded differently — confirm whether that is intentional.
search_instructions = """
You are a search assistant for the Evidently documentation.

Evidently is an open-source Python library and cloud platform for evaluating, testing, and monitoring data and AI systems.
It provides evaluation metrics, testing APIs, and visual reports for model and data quality.

Your task is to help users find accurate, relevant information about Evidently's features, usage, and integrations.

You have access to the following tools:

- search — Use this to explore the topic and retrieve relevant snippets or documentation.
- read_file — Use this to retrieve or verify the complete content of a file when:
    * A code snippet is incomplete, truncated, or missing definitions.
    * You need to check that all variables, imports, and functions referenced in code are defined.
    * You must ensure the code example is syntactically correct and runnable.

If `read_file` cannot be used or the file content is unavailable, clearly state:
> "Unable to verify with read_file."

Search Strategy

- For every user query:
    * Perform at least 3 and at most 6 distinct searches to gather enough context.
    * Each search must use a different phrasing or keyword variation of the user's question.
    * Keep all searches relevant to Evidently (no need to include "Evidently" in the search text).

- After collecting search results:
    1. Synthesize the information into a concise, accurate answer.
    2. If your answer includes code, always validate it with `read_file` before finalizing.
    3. If a code snippet or reference is incomplete, explicitly mention it.

Important:
- The 6-search limit applies only to `search` calls.
- You may call `read_file` at any time, even after the search limit is reached.
- `read_file` calls are verification steps and do not count toward the 6-search limit.

Code Verification and Completeness Rules

- All variables, functions, and imports in your final code examples must be defined or imported.
- Never shorten, simplify, or truncate code examples. Always present the full, verified version.
- When something is missing or undefined in the search results:
    * Call `read_file` with the likely filename to retrieve the complete file content.
    * Replace any partial code with the full verified version.
- If the file is not available or cannot be verified:
    * Include a clear note: "Unable to verify this code."
- Do not reformat, rename variables, or omit lines from the verified code.

Output Format

- Write your answer clearly and accurately.
- Include a "References" section listing the search queries or file names you used.
- If you couldn’t find a complete answer after 6 searches, set found_answer = False.
""".strip()


# One cited source, made of a display title and a filename.
# SearchResultHandler renders each entry as a markdown list item `- [title](filename)`.
class Reference(BaseModel):
    title: str
    filename: str

# One section of the generated article: a heading, the body text, and the
# references that belong to this section.
class Section(BaseModel):
    heading: str
    content: str
    references: list[Reference]


# Structured final answer of the search agent (used as the agent's `output_type`).
# `found_answer` is the flag the system prompt tells the model to set to False
# when it could not find a complete answer; `references` here is the
# article-level list, separate from each section's own list.
class SearchResultArticle(BaseModel):
    found_answer: bool
    title: str
    sections: list[Section]
    references: list[Reference]


In [4]:
# Build the search tools from the default configuration values.
config = AgentConfig()
tools = search_tools.prepare_search_tools(
    config.chunk_size, config.chunk_step, config.top_k
)

In [5]:
from agents import Agent, function_tool

In [6]:
# Wrap the two tool callables with `function_tool` so they can be passed to an Agent.
agent_tools = [
    function_tool(tool_fn)
    for tool_fn in (tools.search, tools.read_file)
]

In [10]:
# Search agent with tools and structured output; no input guardrails are attached here.
search_agent = Agent(
    name="search",
    model=config.model,
    instructions=search_instructions,
    tools=agent_tools,
    output_type=SearchResultArticle,
)

In [11]:
from agents import Runner

In [12]:
from openai.types.responses import ResponseTextDeltaEvent

In [13]:
from jaxn import StreamingJSONParser, JSONParserHandler

In [14]:
class SearchResultHandler(JSONParserHandler):
    """Render a streamed ``SearchResultArticle`` JSON document as markdown on stdout.

    Each callback prints one piece of the article, so output appears while the
    JSON is still being parsed.
    """

    def on_field_start(self, path: str, field_name: str):
        """Print a "References" heading when a ``references`` field opens.

        The heading level is the number of "/" characters in ``path`` plus two,
        so an empty path yields "##" and each extra "/" adds one "#".
        """
        if field_name == "references":
            level = path.count("/") + 2
            print(f"\n{'#' * level} References\n")

    def on_field_end(self, path, field_name, value, parsed_value=None):
        """Print the article title and section headings once their value is complete.

        Only a ``title`` with an empty path is printed as the document title;
        ``title`` fields at any other path are ignored here.
        """
        if field_name == "title" and path == "":
            print(f"# {value}")
        elif field_name == "heading":
            print(f"\n\n## {value}\n")
        elif field_name == "content":
            # The content itself was already written chunk by chunk; just close it.
            print("\n")

    def on_value_chunk(self, path, field_name, chunk):
        """Write ``content`` text as it arrives, without adding newlines."""
        if field_name == "content":
            print(chunk, end="", flush=True)

    def on_array_item_end(self, path, field_name, item=None):
        """Print one finished ``references`` entry as a markdown link.

        Does nothing when ``item`` is None (the declared default), instead of
        failing on ``None.get``.
        """
        if field_name != "references" or item is None:
            return

        title = item.get("title", "")
        filename = item.get("filename", "")
        print(f"- [{title}]({filename})")

# Module-level handler instance.
# NOTE(review): the run_stream calls in this notebook each pass a fresh
# SearchResultHandler() rather than this object — confirm it is still needed.
handler = SearchResultHandler()

In [15]:
from agents.exceptions import MaxTurnsExceeded

In [16]:
# Query passed to the search agent; an alternative query is kept commented out.
# NOTE(review): this assignment shadows Python's built-in `input()` for the rest
# of the kernel session.
# input = 'data drift'
input = 'llm as a judge'


In [60]:
async def run_stream(agent, input, handler, max_turns=3):
    """Run ``agent`` in streaming mode and render its output through ``handler``.

    Tool calls are printed as they are issued, and text deltas are fed to a
    ``StreamingJSONParser`` that drives ``handler``.

    If the run exceeds ``max_turns``, the conversation so far is replayed once,
    with an extra message asking the model to finish the write-up, limited to
    a single turn. That wrap-up is attempted only once, so a model that keeps
    calling tools cannot cause an unbounded chain of retries.

    Args:
        agent: The agent to run.
        input: The user query string, or a list of input messages.
        handler: Handler that receives the streaming JSON parser callbacks.
        max_turns: Turn limit for the first attempt.

    Returns:
        The streaming run result of the last attempt. When an input guardrail
        trips, the guardrail's ``output_info`` is printed and the result of the
        interrupted run is returned.

    Raises:
        MaxTurnsExceeded: If the single-turn wrap-up attempt also runs out of
            turns.
    """
    # Imported locally so this function works even when the notebook cell that
    # imports this exception has not been executed yet.
    from agents.exceptions import InputGuardrailTripwireTriggered

    finish_prompt = 'System message: The number of searches has exceeded the limit. Proceed to finishing the writeup'

    current_input = input
    turns_allowed = max_turns

    # First pass is the normal run; the second pass is the one-off wrap-up.
    for is_wrap_up in (False, True):
        # Started outside the try block so `result` is always bound in the handlers.
        result = Runner.run_streamed(
            agent,
            input=current_input,
            max_turns=turns_allowed
        )

        # A fresh parser per attempt: each attempt streams its own JSON document.
        parser = StreamingJSONParser(handler)

        try:
            async for event in result.stream_events():
                if event.type == "run_item_stream_event":
                    if event.item.type == "tool_call_item":
                        tool_call = event.item.raw_item
                        f_name = tool_call.name
                        args = tool_call.arguments
                        print(f"TOOL CALL ({event.item.agent.name}): {f_name}({args})")
                elif event.type == "raw_response_event" and isinstance(event.data, ResponseTextDeltaEvent):
                    parser.parse_incremental(event.data.delta)

            return result
        except MaxTurnsExceeded:
            if is_wrap_up:
                # The wrap-up attempt failed as well; surface the error instead of looping.
                raise

            print('too many turns')
            finish_message = [{'role': 'user', 'content': finish_prompt}]
            current_input = result.to_input_list() + finish_message
            turns_allowed = 1
        except InputGuardrailTripwireTriggered as e:
            output = e.guardrail_result.output
            if output.tripwire_triggered:
                print(output.output_info)
            return result

In [19]:
result = await run_stream(
    search_agent,
    input,
    SearchResultHandler()
)

TOOL CALL (search): search({"query":"using llm as a judge in AI evaluation"})
TOOL CALL (search): search({"query":"LLM applications in legal context"})
TOOL CALL (search): search({"query":"AI evaluation ethics and judge role"})
TOOL CALL (search): search({"query":"role of large language models in decision making"})
TOOL CALL (search): read_file({"filename":"examples/LLM_judge.mdx"})
TOOL CALL (search): search({"query":"LLM as a judge insights and applications"})
too many turns
# Using LLMs as Judges in AI Evaluation


## Introduction

Large Language Models (LLMs) can be employed as evaluators or 'judges' in various applications, particularly in the context of evaluating responses, generating insights, and monitoring the quality of AI systems. Utilizing an LLM as a judge can enhance the evaluation process by providing consistent, objective metrics and assessments.


### References

- [LLM as a judge](examples/LLM_judge.mdx)


## Evaluation Techniques

1. **Reference-Based Evaluation**: 

In [35]:
# System prompt for the guardrail agent. It asks the model to decide whether a
# user question is on topic (Evidently, monitoring, observability, AI, ML, LLMs),
# to set `fail` to True when it is not, and to explain the decision in the
# `reasoning` field in at most 10 words.
guardrail_instructions = """
Make sure that the question the user asks is about the Evidently library,
monitoring, observability, AI, Machine Learning, or LLMs. If it's not, report it by
setting `fail` to True

Evidently is an open-source Python library and cloud platform for evaluating,
testing, and monitoring data, AI and LLM systems. It provides evaluation metrics,
testing APIs, and visual reports for model and data quality.

Examples of relevant topics:

- Create a custom LLM judge
- Customize data drift detection
- llm evaluations

Explain your decision in the reasoning field, but don't use more than 10 words
""".strip()

# Structured verdict of the guardrail agent (used as its `output_type`).
# Per the guardrail prompt, `fail` is True when the question is off topic and
# `reasoning` holds the short explanation.
class EvidentlyDocsGuardrail(BaseModel):
    reasoning: str
    fail: bool

In [36]:
# Tool-less agent that classifies a question and returns an EvidentlyDocsGuardrail verdict.
guardrail_agent = Agent(
    name="guardrail",
    model='gpt-4o-mini',
    instructions=guardrail_instructions,
    output_type=EvidentlyDocsGuardrail,
)

In [39]:
# Try the guardrail agent on its own with a question unrelated to Evidently.
result = await Runner.run(guardrail_agent, 'whats sqrt(pi)')

In [40]:
# Display the structured verdict returned by the guardrail agent.
result.final_output

EvidentlyDocsGuardrail(reasoning='Question is unrelated to Evidently or AI.', fail=True)

In [49]:
from agents import input_guardrail, GuardrailFunctionOutput
from agents.exceptions import InputGuardrailTripwireTriggered

In [46]:
@input_guardrail
async def documentation_guardrail(ctx, agent, input):
    """Input guardrail that asks ``guardrail_agent`` whether ``input`` is on topic.

    ``ctx`` and ``agent`` are accepted but not used here. The returned
    ``GuardrailFunctionOutput`` trips when the verdict's ``fail`` flag is set
    and carries the verdict's ``reasoning`` as ``output_info``.
    """
    verdict = (await Runner.run(guardrail_agent, input)).final_output

    return GuardrailFunctionOutput(
        tripwire_triggered=verdict.fail,
        output_info=verdict.reasoning,
    )

In [47]:
# Rebind `search_agent` to a version that runs `documentation_guardrail` on the input.
search_agent = Agent(
    name="search",
    model=config.model,
    instructions=search_instructions,
    tools=agent_tools,
    output_type=SearchResultArticle,
    input_guardrails=[documentation_guardrail],
)

In [59]:
try:
    result = await Runner.run(search_agent, 'whats sqrt(pi)')
except InputGuardrailTripwireTriggered as e:
    output = e.guardrail_result.output
    if output.tripwire_triggered: 
        print(output.output_info)

Not related to Evidently or ML topics.


In [62]:
result = await run_stream(
    search_agent,
    'whats sqrt(pi)',
    SearchResultHandler()
)

Not related to Evidently library or AI topics.


In [65]:
result = run_stream(
    search_agent,
    'whats sqrt(pi)',
    SearchResultHandler()
)