# Evaluation: LLM or Chat Model

https://docs.smith.langchain.com/evaluation/quickstart

In [3]:
from dotenv import load_dotenv
from os import environ as env
from langsmith import Client

In [4]:
# Read variables from the local ".env-leo" file into the process environment.
# override=True is passed so, per python-dotenv, file values take precedence
# over variables that are already set.
load_dotenv(dotenv_path='.env-leo', override=True)

# Each lookup yields None when the variable is not defined.
openai_api_key = env.get('OPENAI_API_KEY')
langsmith_key = env.get('LANGSMITH_API_KEY')

In [5]:
from langchain_openai import ChatOpenAI

# Chat model that is evaluated against the dataset further down.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-3.5-turbo",
)

In [6]:
# Name of the dataset the evaluation run is executed against.
dataset_name = "Rap Battle Dataset"

# LangSmith client built with no explicit arguments.
client = Client()

In [8]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def must_mention(run, example) -> EvaluationResult:
    """Score whether the prediction contains every required phrase.

    Args:
        run: Run under evaluation. The predicted text is read from
            ``run.outputs["generations"][0][0]["text"]``.
        example: Dataset example. Its ``outputs["must_mention"]`` entry, when
            present, is the collection of phrases to look for.

    Returns:
        An ``EvaluationResult`` with key ``"must_mention"`` and a boolean
        score: True when every required phrase is a substring of the
        prediction (vacuously True when nothing is required).
    """
    # The example, or its outputs, may be absent altogether -- not merely
    # lacking the "must_mention" key -- so fall back to an empty mapping.
    reference = getattr(example, "outputs", None) or {}
    required = reference.get("must_mention") or []

    try:
        prediction = run.outputs["generations"][0][0]["text"] or ""
    except (TypeError, KeyError, IndexError):
        # No usable generation (e.g. outputs is None or has another layout):
        # treat it as empty text so required phrases count as not mentioned
        # instead of the evaluator itself raising.
        prediction = ""

    score = all(phrase in prediction for phrase in required)
    return EvaluationResult(key="must_mention", score=score)

# Evaluation settings: prebuilt evaluators plus the hand-written
# must_mention check defined above.
eval_config = RunEvalConfig(
    # Prebuilt evaluators are given either by name or as a configured
    # RunEvalConfig.<evaluator> instance.
    evaluators=[
        # By name/enum: the default criterion in this case is "helpfulness".
        "criteria",
        # Configured with a named criterion.
        RunEvalConfig.Criteria("harmfulness"),
        # Configured with a custom criterion: {name: grading question}.
        RunEvalConfig.Criteria(
            {
                "cliche": (
                    "Are the lyrics cliche?"
                    " Respond Y if they are, N if they're entirely unique."
                ),
            }
        ),
    ],
    custom_evaluators=[must_mention],
)
# Run the model over the dataset and apply the evaluators in eval_config.
# NOTE(review): the recorded output for this cell shows the project name
# 'test-chatopenai-2024-03-24 02:25:36'. This literal produces no such
# timestamp suffix -- confirm whether one was intended here.
client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=llm,
    evaluation=eval_config,
    verbose=True,
    # Plain literal: the previous f-prefix had no placeholders to interpolate.
    project_name="test-chatopenai",
    project_metadata={"version": "1.0.0"},
)

View the evaluation results for project 'test-chatopenai-2024-03-24 02:25:36' at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65/compare?selectedSessions=a2d6a26a-f139-482d-85fe-cd3b488ac68e

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65
[------------------------------------------------->] 2/2

{'project_name': 'test-chatopenai-2024-03-24 02:25:36',
 'results': {'c0b1a426-dbb1-4bc7-98ae-1d7137988fe9': {'input': {'question': 'a rap battle between Atticus Finch and Cicero'},
   'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness: Is the submission helpful, insightful, and appropriate?"\n\n1. Helpfulness: The submission is helpful in the sense that it provides a creative and entertaining response to the given input. It imagines a rap battle between Atticus Finch and Cicero, two figures known for their eloquence and wisdom, and presents their arguments in a way that reflects their respective characters and values.\n\n2. Insightfulness: The submission is insightful as it captures the essence of both characters. Atticus Finch is portrayed as a man of honor and justice, while Cicero is depicted as a master of rhetoric and wit. The rap battle format allows for a unique exploration of these characters\' traits and p