# Evaluation: Runnable (LCEL 문법)

Reference: [LangSmith evaluation quickstart](https://docs.smith.langchain.com/evaluation/quickstart)

In [1]:
from dotenv import load_dotenv
from os import environ as env
from langsmith import Client

In [2]:
# Populate the process environment from the local '.env-leo' file;
# override=True lets values from the file replace variables already set.
load_dotenv(dotenv_path='.env-leo', override=True)

# Read the API keys back from the environment (None when a key is absent).
openai_api_key = env.get('OPENAI_API_KEY')
langsmith_key = env.get('LANGSMITH_API_KEY')

In [3]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Chat model under evaluation, configured with temperature 0.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Single human-message template; {question} is filled in from the input.
prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "Spit some bars about {question}."),
    ]
)

# LCEL pipeline: prompt -> chat model -> string output parser.
chain = prompt | llm | StrOutputParser()

In [4]:
# Name of the LangSmith dataset to evaluate against.
# NOTE(review): no cell shown here creates this dataset — presumably it
# already exists in the workspace; confirm before running.
dataset_name = "Rap Battle Dataset"

# LangSmith client built with default arguments.
client = Client()

In [6]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def must_mention(run, example) -> EvaluationResult:
    """Score whether the run's output contains every required phrase.

    Args:
        run: The evaluated run; its ``outputs`` mapping is read for the
            ``"output"`` key.
        example: The dataset example; its ``outputs`` mapping is read for
            the ``"must_mention"`` key (an iterable of phrases).

    Returns:
        An ``EvaluationResult`` with key ``"must_mention"`` whose score is
        ``True`` when each required phrase is a substring of the prediction
        (vacuously ``True`` when nothing is required), otherwise ``False``.
    """
    # ``outputs`` can be None (e.g. a run that failed, or an example without
    # reference outputs); fall back to an empty mapping instead of raising
    # AttributeError on ``.get``.
    run_outputs = run.outputs or {}
    reference_outputs = example.outputs or {}

    prediction = run_outputs.get("output") or ""
    required = reference_outputs.get("must_mention") or []
    score = all(phrase in prediction for phrase in required)
    return EvaluationResult(key="must_mention", score=score)

    
# Evaluation configuration: three criteria-based evaluators plus the custom
# ``must_mention`` evaluator.
eval_config = RunEvalConfig(
    # Prebuilt evaluators, given either by name or as a configured
    # RunEvalConfig.<evaluator> instance.
    evaluators=[
        # By name/enum only: the default criterion is "helpfulness".
        "criteria",
        # Criteria evaluator configured with a named criterion.
        RunEvalConfig.Criteria("harmfulness"),
        # Criteria evaluator configured with a custom {name: description} criterion.
        RunEvalConfig.Criteria(
            {
                "cliche": "Are the lyrics cliche? Respond Y if they are, N if they're entirely unique."
            }
        ),
    ],
    custom_evaluators=[must_mention],
)
# Run the chain against the dataset and score it with ``eval_config``.
# Kept as a bare expression so the notebook displays the returned results.
client.run_on_dataset(
    llm_or_chain_factory=chain,
    dataset_name=dataset_name,
    evaluation=eval_config,
    project_name="test-runnable",
    # Any experiment metadata can be specified here
    project_metadata={"version": "1.0.0"},
    verbose=True,
)

View the evaluation results for project 'test-runnable-2024-03-24 02:37:03' at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65/compare?selectedSessions=78e93c78-7044-451e-ba3a-a3a7712878a6

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65
[------------------------------------------------->] 2/2

{'project_name': 'test-runnable-2024-03-24 02:37:03',
 'results': {'c0b1a426-dbb1-4bc7-98ae-1d7137988fe9': {'input': {'question': 'a rap battle between Atticus Finch and Cicero'},
   'feedback': [EvaluationResult(key='helpfulness', score=1, value='Y', comment='The criterion for this task is "helpfulness: Is the submission helpful, insightful, and appropriate?"\n\n1. Helpfulness: The submission is a creative interpretation of the given task, which is a rap battle between Atticus Finch and Cicero. It provides a narrative of how such a battle might unfold, which can be considered helpful in understanding the task.\n\n2. Insightfulness: The submission demonstrates insight into the characters of Atticus Finch and Cicero, highlighting their respective strengths - Finch\'s moral compass and Cicero\'s rhetorical genius.\n\n3. Appropriateness: The submission is appropriate as it sticks to the task of creating a rap battle between the two characters. It does not include any offensive or inapprop