In [11]:
from langsmith import Client
import os

# Enable LangSmith tracing and point the SDK at the hosted API endpoint.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# Only fall back to the placeholder when no key is configured, so a real key
# already exported in the environment is never overwritten. Replace the
# placeholder (or export LANGCHAIN_API_KEY) before running.
os.environ.setdefault("LANGCHAIN_API_KEY", "{YOUR_LANGCHAIN_APIKEY}")

# Inputs are provided to your model, so it knows what to generate
dataset_inputs = [
    "a rap battle between Atticus Finch and Cicero",
    "a rap battle between Barbie and Oppenheimer",
    # ... add more as desired
]

# Outputs are provided to the evaluator, so it knows what to compare to
# Outputs are optional but recommended.
dataset_outputs = [
    {"must_mention": ["lawyer", "justice"]},
    {"must_mention": ["plastic", "nuclear"]},
]
# Inputs and outputs are paired by position; fail loudly if one list was
# extended without the other instead of uploading misaligned examples.
if dataset_outputs and len(dataset_outputs) != len(dataset_inputs):
    raise ValueError(
        f"dataset_inputs has {len(dataset_inputs)} entries but "
        f"dataset_outputs has {len(dataset_outputs)}; they must match."
    )

client = Client()
dataset_name = "Rap Battle Dataset"

# Storing inputs in a dataset lets us
# run chains and LLMs over a shared set of examples.
# Reuse the dataset when it already exists so that re-running this cell
# (e.g. Restart & Run All) neither fails nor uploads duplicate examples.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="Rap battle prompts.",
    )
    client.create_examples(
        inputs=[{"question": q} for q in dataset_inputs],
        outputs=dataset_outputs,
        dataset_id=dataset.id,
    )

In [13]:
import openai

# You can evaluate any arbitrary function over the dataset.
# The input to the function will be the inputs dictionary for each example.
def predict_result(input_: dict) -> dict:
    """Generate a chat completion for a single dataset example.

    Args:
        input_: The example's inputs dictionary. Must contain a "question"
            key whose value is sent as the user message.

    Returns:
        A dict with a single "output" key holding the reply text as a string.

    Raises:
        KeyError: If ``input_`` has no "question" key.
    """
    messages = [{"role": "user", "content": input_["question"]}]
    response = openai.chat.completions.create(messages=messages, model="gpt-4o-mini")
    # Return the generated text rather than the whole response object:
    # must_mention_evaluator calls .lower() on this value, which only works
    # for a string. The content may be None, so normalise that to "".
    return {"output": response.choices[0].message.content or ""}

In [None]:
from langchain.smith import RunEvalConfig
from langsmith.evaluation.evaluator import EvaluationResult


def must_mention_evaluator(run, example):
    """Score whether the run's output mentions every required phrase.

    Reads the generated text from ``run.outputs["output"]`` and the list of
    required phrases from ``example.outputs["must_mention"]``, then compares
    them case-insensitively.

    Returns:
        An ``EvaluationResult`` keyed "must_mention" with score 1.0 when every
        phrase is present (or none are required) and 0.0 otherwise. Any
        exception raised during the check is reported as a 0.0 score with the
        error text in the comment.
    """
    metric_key = "must_mention"
    try:
        generated = run.outputs.get("output", "")
        keywords = example.outputs.get("must_mention", [])

        # Stop at the first missing phrase; the generated text is only
        # lower-cased when there is at least one phrase to look for.
        all_found = True
        for keyword in keywords:
            if keyword.lower() not in generated.lower():
                all_found = False
                break

        return EvaluationResult(
            key=metric_key,
            score=float(all_found),
            comment=f"Required phrases found: {all_found}",
        )
    except Exception as err:
        return EvaluationResult(
            key=metric_key,
            score=0.0,
            comment=f"Error in evaluation: {str(err)}",
        )


# Evaluation setup: four criteria evaluators (three named, one custom
# "cliche" prompt) plus the must_mention_evaluator function defined above.
eval_config = RunEvalConfig(
    evaluators=[
        RunEvalConfig.Criteria("helpfulness"),
        RunEvalConfig.Criteria("relevance"),
        RunEvalConfig.Criteria("harmfulness"),
        RunEvalConfig.Criteria({
            "cliche": "Are the lyrics cliche? Respond Y if they are, N if they're entirely unique."
        })
    ],
    custom_evaluators=[must_mention_evaluator]
)

# Evaluate the predict_result function over every example in the dataset.
# (The original passed `model`, a name never defined in this notebook, which
# raises NameError on a fresh kernel.)
evaluation_results = client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=predict_result,
    evaluation=eval_config,
    project_name="rap-battle-evaluation",
    verbose=True
)

# run_on_dataset returns a dict; iterating it directly would only yield its
# top-level keys. The per-example results live under the "results" key.
print("\nEvaluation Results:")
for result in evaluation_results["results"].values():
    print(f"Result: {result}")
    print("---")