# Evaluation: Custom class

Reference: LangSmith evaluation quickstart — https://docs.smith.langchain.com/evaluation/quickstart

In [1]:
from dotenv import load_dotenv
from os import environ as env
from langsmith import Client

In [2]:
# Load settings from the project-specific env file; override=True lets values
# from the file replace ones already set in the process environment.
load_dotenv('.env-leo', override=True)

# Each lookup yields None when the variable is not set.
langsmith_key, openai_api_key = (
    env.get(var_name) for var_name in ('LANGSMITH_API_KEY', 'OPENAI_API_KEY')
)

In [31]:
class MyPredictor:
    """Toy predictor that returns a canned string with a running call count.

    The counter is stored on the class itself, so it is shared by every
    caller and keeps increasing for as long as the class object lives.
    """

    # Number of predictions made so far (class-level, shared).
    state = 0

    @staticmethod
    def predict(input_: dict) -> dict:
        """Return a placeholder prediction and bump the shared call counter.

        Declared as a static method because it takes no ``self``: this keeps
        ``MyPredictor.predict(x)`` working and also allows the call on an
        instance, which previously raised ``TypeError``.

        Args:
            input_: The example inputs. Not inspected.

        Returns:
            A dict with a single key, ``"output"``, holding
            ``"Bar Bar Bar <n>"`` where ``<n>`` is the updated counter.
        """
        MyPredictor.state += 1
        return {"output": f"Bar Bar Bar {MyPredictor.state}"}

def create_object(input):
    """Forward ``input`` to ``MyPredictor.predict`` and return its result."""
    prediction = MyPredictor.predict(input)
    return prediction

In [32]:
# Name of the dataset passed to the evaluation run.
dataset_name = "Rap Battle Dataset"
# LangSmith client constructed with its default arguments.
client = Client()

In [33]:
from langchain.smith import RunEvalConfig, run_on_dataset
from langsmith.evaluation import EvaluationResult, run_evaluator

@run_evaluator
def must_mention(run, example) -> EvaluationResult:
    """Score whether the prediction contains every required phrase.

    Args:
        run: The run being evaluated; ``run.outputs["output"]`` is read as
            the prediction text.
        example: The dataset example; ``example.outputs["must_mention"]`` is
            read as the collection of phrases that must appear.

    Returns:
        An ``EvaluationResult`` keyed ``"must_mention"`` whose score is True
        when every phrase occurs in the prediction (trivially True when no
        phrases are given) and False otherwise.
    """
    # ``outputs`` can be None (for instance on a run that failed or an example
    # without reference outputs), so fall back to an empty dict before
    # calling ``.get`` instead of raising AttributeError.
    run_outputs = run.outputs or {}
    reference_outputs = example.outputs or {}
    prediction = run_outputs.get("output") or ""
    # Named differently from the function to avoid shadowing it locally.
    required_phrases = reference_outputs.get("must_mention") or []
    score = all(phrase in prediction for phrase in required_phrases)
    return EvaluationResult(key="must_mention", score=score)

eval_config = RunEvalConfig(
    # Prebuilt evaluators, given either by name or as a configured
    # RunEvalConfig.<evaluator> instance.
    evaluators=[
        # Bare name: the default criterion, "helpfulness", is used.
        "criteria",
        # A single named criterion.
        RunEvalConfig.Criteria("harmfulness"),
        # A custom criterion: name mapped to the question put to the grader.
        RunEvalConfig.Criteria(
            {
                "cliche": (
                    "Are the lyrics cliche? Respond Y if they are, "
                    "N if they're entirely unique."
                ),
            }
        ),
    ],
    # Evaluators written by hand in this notebook.
    custom_evaluators=[must_mention],
)
client.run_on_dataset(
    dataset_name=dataset_name,
    # The callable used to produce a prediction from an example's inputs.
    llm_or_chain_factory=create_object,
    evaluation=eval_config,
    project_name="custom-class-test-1",
    project_metadata={"version": "1.0.0"},
    verbose=True,
)

View the evaluation results for project 'custom-class-test-1' at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65/compare?selectedSessions=750523e9-ac7b-4144-a499-01bde7252a6b

View all tests for Dataset Rap Battle Dataset at:
https://smith.langchain.com/o/08d5427c-65f6-54e0-ba99-91184efa5aca/datasets/8257acae-3a07-4903-bb7c-9a7b93b8ff65
[------------------------------------------------->] 2/2

{'project_name': 'custom-class-test-1',
 'results': {'c0b1a426-dbb1-4bc7-98ae-1d7137988fe9': {'input': {'question': 'a rap battle between Atticus Finch and Cicero'},
   'feedback': [EvaluationResult(key='helpfulness', score=0, value='N', comment='The criterion for this task is "helpfulness: Is the submission helpful, insightful, and appropriate?"\n\nThe input task is a rap battle between Atticus Finch and Cicero. \n\nThe submission is "Bar Bar Bar 2."\n\nTo assess the submission against the criterion, we need to consider whether it is helpful, insightful, and appropriate.\n\nThe submission does not seem to be helpful or insightful as it does not provide any information or insight about a rap battle between Atticus Finch and Cicero. It is unclear what "Bar Bar Bar 2" means in this context.\n\nAs for appropriateness, while the submission is not offensive or inappropriate in a general sense, it does not seem to be appropriate to the specific task of a rap battle between Atticus Finch and 