# Evaluation: Agent

Reference: LangSmith evaluation quickstart — https://docs.smith.langchain.com/evaluation/quickstart

In [1]:
from dotenv import load_dotenv
from os import environ as env
from langsmith import Client

In [2]:
# Load variables from the local '.env-leo' file before anything reads the
# environment; override=True is passed so file values take precedence over
# variables that are already set.
load_dotenv('.env-leo', override=True)
# Read the API keys from the process environment (None when a key is unset).
# NOTE(review): neither name is referenced again in the visible cells; the
# clients presumably pick the keys up from the environment — confirm.
langsmith_key = env.get('LANGSMITH_API_KEY')
openai_api_key = env.get('OPENAI_API_KEY')

In [4]:
from langchain.agents import AgentExecutor, create_openai_functions_agent
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.tools import tool
from langchain_openai import ChatOpenAI

# Prompt for the OpenAI-functions agent: a fixed system instruction, the
# user's question, then the agent scratchpad.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Spit some bars about the topic"),
        ("user", "{question}"),
        # The agent passes its intermediate function calls and tool results
        # as a list of messages. A MessagesPlaceholder inserts them as real
        # chat messages instead of formatting the list's repr into the
        # system text.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Minimal example tool handed to the agent.
# NOTE(review): per the LangChain `@tool` documentation the docstring below is
# used as the tool's description, i.e. it is runtime-visible text — edit it
# with care.
@tool
def get_encouragement(request: str) -> str:
    """Get some encouragement."""
    # `request` is not inspected; the reply is always the same fixed string.
    return "You can do it!"

# Tools made available to the agent executor.
tools = [get_encouragement]
# Chat model instance used by every agent that create_agent() builds.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.7)

# Since chains and agents can be stateful (they can have memory),
# create a constructor to pass in to the run_on_dataset method.
def create_agent():
    """Build a fresh ``AgentExecutor`` each time it is called.

    Returns:
        A new ``AgentExecutor`` wrapping an OpenAI-functions agent built from
        the module-level ``llm``, ``tools`` and ``prompt``.
    """
    # Use the shared ``tools`` list for both the agent and the executor so the
    # functions advertised to the model cannot drift from the ones the
    # executor is able to run.
    agent = create_openai_functions_agent(llm, tools=tools, prompt=prompt)
    return AgentExecutor(agent=agent, tools=tools)

In [6]:
# LangSmith client, constructed with no arguments.
# NOTE(review): presumably configured from the environment variables loaded
# from the .env file — confirm.
client = Client()
# Name of the dataset the agent is evaluated against.
dataset_name = "Rap Battle Dataset"

In [7]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def must_mention(run, example) -> EvaluationResult:
    """Check that the prediction contains every required phrase.

    Args:
        run: The evaluated run; its ``outputs`` mapping holds the prediction.
        example: The dataset example; ``outputs["must_mention"]`` lists the
            phrases that have to appear in the prediction.

    Returns:
        An ``EvaluationResult`` keyed ``"must_mention"`` whose score is True
        when every required phrase occurs in the prediction (vacuously True
        when no phrases are required).
    """
    # Either ``outputs`` may be None (e.g. a failed run or an example without
    # reference outputs); treat that as an empty mapping instead of raising.
    run_outputs = run.outputs or {}
    reference = example.outputs or {}
    # AgentExecutor returns its final answer under "output"; "text" is kept
    # as a fallback for chains that use that key.
    prediction = run_outputs.get("output") or run_outputs.get("text") or ""
    required = reference.get("must_mention") or []
    score = all(phrase in prediction for phrase in required)
    return EvaluationResult(key="must_mention", score=score)

# Evaluation configuration: one custom evaluator plus three criteria-based
# evaluators.
eval_config = RunEvalConfig(
    # Custom evaluator function decorated with @run_evaluator.
    custom_evaluators=[must_mention],
    # Prebuilt evaluators can be given either by name or as a configured
    # RunEvalConfig.<evaluator> instance.
    evaluators=[
        # Specified by name/enum. With no criterion given, the default
        # criterion is "helpfulness".
        "criteria",
        # Configured with a named criterion.
        RunEvalConfig.Criteria("harmfulness"),
        # Configured with a custom criterion given as a
        # {criterion name: question for the grader} mapping.
        RunEvalConfig.Criteria(
            {
                "cliche": "Are the lyrics cliche?"
                " Respond Y if they are, N if they're entirely unique."
            }
        ),
    ],
)
# Run the agent over the dataset and score the results with eval_config.
# The factory function (create_agent) is passed rather than an agent instance.
# NOTE(review): presumably so a fresh agent is built per run — confirm against
# the run_on_dataset documentation.
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=create_agent,
    evaluation=eval_config,
    verbose=True,
    # NOTE(review): the project name is fixed; re-running this cell may
    # require a new name — confirm.
    project_name="agent-test-1",
    # Any experiment metadata can be specified here
    project_metadata={"version": "1.0.0"},
)

View the evaluation results for project 'agent-test-1' at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65/compare?selectedSessions=47bf71e1-8aad-45e2-8e42-a39b0d263bff

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65
[------------------------------------------------->] 2/2

{'project_name': 'agent-test-1',
 'results': {'c0b1a426-dbb1-4bc7-98ae-1d7137988fe9': {'input': {'question': 'a rap battle between Atticus Finch and Cicero'},
   'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness: Is the submission helpful, insightful, and appropriate?"\n\nStep 1: Assessing Helpfulness\nThe submission is a creative interpretation of the given task, which is a rap battle between Atticus Finch and Cicero. It provides a clear and engaging narrative of how such a battle might unfold, highlighting the strengths and characteristics of both characters. Therefore, it can be considered helpful in understanding and visualizing the given scenario.\n\nStep 2: Assessing Insightfulness\nThe submission is insightful as it captures the essence of both characters - Atticus Finch\'s dedication to justice and Cicero\'s rhetorical prowess. It also insightfully sets up the context of a rap battle between these two figur