In [1]:
import asyncio

In [18]:
async def agent(steps: int = 10, delay: float = 1):
    """
    Simulate a long-running agent that prints one progress line per step.

    Parameters:
        steps: how many progress lines to print (default 10).
        delay: seconds to sleep after each progress line (default 1).

    Returns:
        The fixed placeholder result 'yahoo'.
    """
    print('[agent] start')
    for i in range(steps):
        print('[agent]', i)
        # Yield to the event loop: this is where concurrently scheduled tasks
        # get to run and where a pending cancellation is delivered.
        await asyncio.sleep(delay)
    print('[agent] stop')
    return 'yahoo'

In [23]:
async def guardrail(delay: float = 1.5):
    """
    Simulate a guardrail check that always passes.

    Parameters:
        delay: seconds the simulated check takes (default 1.5).

    Returns:
        True, meaning the check passed.
    """
    print('[guardrail pass] start')
    # Stand-in for the real check; awaiting lets other tasks run meanwhile.
    await asyncio.sleep(delay)
    print('[guardrail pass] check pass')
    print('[guardrail pass] stop')

    return True

In [9]:
# Sequential baseline: the guardrail runs to completion first, and the agent
# is only started if the guardrail returned a truthy value. The agent's
# return value is not kept here.
check_pass = await guardrail()
if check_pass:
    await agent()

[guardrail] start
[guardrail] check pass
[guardrail] stop
[agent] start
[agent] 0
[agent] 1
[agent] 2
[agent] 3
[agent] 4
[agent] 5
[agent] 6
[agent] 7
[agent] 8
[agent] 9
[agent] stop


In [10]:
# Concurrent variant: guardrail and agent are scheduled together and their
# output interleaves. gather() returns results in argument order, so r1 is
# the guardrail's result and r2 is the agent's.
r1, r2 = await asyncio.gather(
    guardrail(),
    agent()
)

[guardrail] start
[agent] start
[agent] 0
[agent] 1
[guardrail] check pass
[guardrail] stop
[agent] 2
[agent] 3
[agent] 4
[agent] 5
[agent] 6
[agent] 7
[agent] 8
[agent] 9
[agent] stop


In [26]:
from dataclasses import dataclass

@dataclass
class GuardrailFunctionOutput:
    """Result record of a guardrail check.

    In this notebook an instance is attached to GuardrailException as its
    `info` attribute.
    """
    # Free-form description of the check outcome (e.g. 'check fails').
    output_info: str
    # Flag set to True by the failing guardrail when it raises.
    tripwire_triggered: bool

In [28]:
class GuardrailException(Exception):
    """Raised by a guardrail to signal that its check failed.

    Attributes:
        info: the GuardrailFunctionOutput passed in by the raising guardrail.
    """

    def __init__(self, message: str, info: GuardrailFunctionOutput):
        # Keep the structured result on the exception so handlers can read it.
        self.info = info
        # The message becomes the exception's args / str() text.
        super().__init__(message)

async def guardrail_fail(delay: float = 2.5):
    """
    Simulate a guardrail check that always fails.

    Parameters:
        delay: seconds the simulated check takes before failing (default 2.5).

    Raises:
        GuardrailException: always, carrying a GuardrailFunctionOutput with
            tripwire_triggered=True in its `info` attribute.
    """
    print('[guardrail fail] start')
    # Stand-in for the real check; awaiting lets other tasks run meanwhile.
    await asyncio.sleep(delay)
    print('[guardrail fail] check fails')
    info = GuardrailFunctionOutput(
        output_info='check fails',
        tripwire_triggered=True
    )
    raise GuardrailException("check fails", info)

In [20]:
try:
    agent_task = asyncio.create_task(agent())
    guardrail_task = asyncio.create_task(guardrail())

    r1, r2 = await asyncio.gather(
        agent_task,
        guardrail_task
    )
    print(r1)
except GuardrailException as e:
    agent_task.cancel()

    try:
        await agent_task
    except asyncio.CancelledError:
        print('[main] agent cancelled')


[agent] start
[agent] 0
[guardrail] start
[agent] 1
[guardrail] check pass
[guardrail] stop
[agent] 2
[agent] 3
[agent] 4
[agent] 5
[agent] 6
[agent] 7
[agent] 8
[agent] 9
[agent] stop
yahoo


In [29]:
async def run_with_guardrails(agent_coroutine, guardrails):
    """
    Run `agent_coroutine` while multiple guardrails monitor it.

    The agent and every guardrail are scheduled concurrently. The agent's
    result is returned only after the agent and all guardrails have finished
    without raising.

    Parameters:
        agent_coroutine: an *awaitable*, e.g. agent()
        guardrails: an iterable of *awaitables*, e.g. [guard1(), guard2()]

    Returns:
        The result of the agent, if no guardrail triggers.

    Raises:
        GuardrailException from any guardrail (the first one to fire).
        Any other exception raised by the agent or by a guardrail.
        In every case, tasks that are still running are cancelled and
        awaited before the exception leaves this function.
    """

    # ensure_future accepts any awaitable (coroutine, Task or Future), which
    # is what the contract above promises; create_task only takes coroutines.
    agent_task = asyncio.ensure_future(agent_coroutine)
    guard_tasks = []

    try:
        # Built inside the try so that a failure while iterating `guardrails`
        # still cleans up the tasks created so far.
        for g in guardrails:
            guard_tasks.append(asyncio.ensure_future(g))

        # If any guardrail raises GuardrailException,
        # gather will throw and we drop into except.
        await asyncio.gather(agent_task, *guard_tasks)

        # Agent finished successfully.
        return agent_task.result()

    except GuardrailException as e:
        # At least one guardrail fired.
        print("[guardrail fired]", e.info)

        # Cancel the agent. cancel() returns False when the agent is already
        # done, in which case there is nothing to wait for or to report.
        if agent_task.cancel():
            # return_exceptions=True keeps an error raised by the agent while
            # it unwinds from replacing the guardrail's exception.
            await asyncio.gather(agent_task, return_exceptions=True)
            if agent_task.cancelled():
                print("[run_with_guardrails] agent cancelled")

        raise

    finally:
        # Whatever the outcome, do not leave tasks running in the background:
        # cancel everything that has not finished (remaining guardrails after
        # a trigger, or all siblings after an unrelated failure).
        leftovers = [t for t in (agent_task, *guard_tasks) if not t.done()]
        for t in leftovers:
            t.cancel()
        if leftovers:
            await asyncio.gather(*leftovers, return_exceptions=True)

In [30]:
result = await run_with_guardrails(
    agent(),
    [guardrail(), guardrail_fail()]
)

[agent] start
[agent] 0
[guardrail pass] start
[guardrail fail] start
[agent] 1
[guardrail pass] check pass
[guardrail pass] stop
[agent] 2
[guardrail fail] check fails
[guardrail fired] GuardrailFunctionOutput(output_info='check fails', tripwire_triggered=True)
[run_with_guardrails] agent cancelled


GuardrailException: check fails

In [31]:
import ver31

In [32]:
import search_agent

In [33]:
# NOTE(review): this rebinds the name `agent`, which the notebook defines
# further up as the demo coroutine function. Cells that call `agent()` will
# no longer get that coroutine if they are re-run after this one; consider a
# distinct name for the search agent object.
agent = search_agent.create_agent()

In [36]:
# Run the search agent under the same wrapper. ver31.run is not shown in this
# notebook; presumably it returns a coroutine that runs `agent` on the given
# prompt (TODO confirm against ver31).
result = await run_with_guardrails(
    ver31.run(agent, 'llm as a judge'),
    [guardrail()]  # only the always-passing guardrail is active for this run
)

[guardrail pass] start
[guardrail pass] check pass
[guardrail pass] stop
TOOL CALL (search): input_guardrail({"message":"llm as a judge"})
TOOL CALL (search): search({"query": "using llm as a judge in decision making"})
TOOL CALL (search): search({"query": "how llm can assist in legal judgments"})
TOOL CALL (search): search({"query": "legal applications of LLMs in judiciary"})
TOOL CALL (search): search({"query": "AI judges using LLM technology"})
TOOL CALL (search): search({"query": "impact of LLMs on judicial decisions"})
# Using LLMs as Judges



## Introduction

Evidently allows for the integration of Large Language Models (LLMs) as evaluators or judges in various contexts, including decision-making processes. It leverages LLMs to evaluate the quality of AI outputs, data integrity, and model performance using custom criteria or established benchmarks.

### References

- [LLM as a judge](examples/LLM_judge.mdx)
- [LLM Evaluation](quickstart_llm.mdx)


## How to Implement an LLM Judg