# Prompt Iteration Walkthrough

https://github.com/langchain-ai/langsmith-cookbook/blob/main/testing-examples/movie-demo/prompt_iteration.ipynb

In [1]:
from langsmith import Client

# LangSmith client shared by the dataset-setup cells below; constructed with
# no arguments (presumably configured from the environment — TODO confirm).
client = Client()

In [2]:
# Hand-labelled examples as (text, label) pairs, where label is either
# "Toxic" or "Not toxic". These become the dataset's inputs ({"text": ...})
# and reference outputs ({"label": ...}) in the dataset-creation cell.
toxic_examples = [
    ("Shut up, idiot", "Toxic"),
    ("You're a wonderful person", "Not toxic"),
    ("This is the worst thing ever", "Toxic"),
    ("I had a great day today", "Not toxic"),
    ("Nobody likes you", "Toxic"),
    ("This movie is a masterpiece", "Not toxic"),
    ("Go away and never come back", "Toxic"),
    ("Thank you for your help", "Not toxic"),
    ("This is so dumb", "Toxic"),
    ("I appreciate your efforts", "Not toxic"),
    ("This is a waste of time", "Toxic"),
    ("This movie blows", "Toxic"),
    ("This is unacceptable. I want to speak to the manager.", "Toxic"),
]

In [3]:
# Name of the dataset used by every evaluate() call in this notebook.
# NOTE(review): "Queires" looks like a typo for "Queries"; it is left as-is
# because changing the string would target a different dataset name.
toxic_dataset_name = "Toxic Queires"

# Create and populate the dataset only when no dataset with this name exists,
# so re-running the cell does not add the examples again. Note that
# `toxic_dataset` is only bound when this branch actually runs.
if not client.has_dataset(dataset_name=toxic_dataset_name):
    toxic_dataset = client.create_dataset(dataset_name=toxic_dataset_name)
    # Transpose the (text, label) pairs into two parallel tuples: one of
    # {"text": ...} input dicts and one of {"label": ...} output dicts.
    inputs, outputs = zip(
        *[({"text": text}, {"label": label}) for text, label in toxic_examples]
    )
    client.create_examples(inputs=inputs, outputs=outputs, dataset_id=toxic_dataset.id)

In [9]:
from langsmith import traceable, wrappers
from openai import Client

# OpenAI client passed through langsmith's wrap_openai (presumably so its calls
# are traced — TODO confirm). NOTE: `Client` here is the one imported from
# `openai` just above, which rebinds the `Client` name imported from
# `langsmith` in the first cell; the existing `client` instance is unaffected.
openai = wrappers.wrap_openai(Client())

@traceable
def label_query(text):
    """Classify a user query as toxic or not using the first prompt variant.

    Args:
        text: The user query to classify; sent verbatim as the user message.

    Returns:
        The message content of the first completion choice. The prompt asks
        for 'Toxic' or 'Not toxic', but the reply is returned unvalidated.
    """
    messages = [
        {
            "role": "system",
            # The quoted labels must be spelled exactly like the dataset labels
            # ("Toxic" / "Not toxic"): the evaluators compare the model reply to
            # the reference label with plain string equality.
            "content": "Given the following user query, assess whether it contains toxic content. Please provide a simple 'Toxic' or 'Not toxic' response based on your assessment.",
        },
        {"role": "user", "content": text},
    ]
    result = openai.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini",
        # temperature=0 to keep the labelling as repeatable as possible.
        temperature=0,
    )
    return result.choices[0].message.content


@traceable
def label_query_alternate_prompt(text):
    """Classify a user query as toxic or not using the alternate prompt.

    This variant's system prompt spells out what counts as toxic (insults,
    threats, highly negative comments) and names the two allowed replies.

    Args:
        text: The user query to classify; sent verbatim as the user message.

    Returns:
        The message content of the first completion choice, unvalidated.
    """
    system_prompt = "Please review the user query below and determine if it contains any form of toxic behavior, such as insults, threats, or highly negative comments. Respond with 'Toxic' if it does, and 'Not toxic' if it doesn't."
    completion = openai.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        model="gpt-4o-mini",
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Quick manual check of the first prompt on a single input; this text is
# labelled "Not toxic" in toxic_examples.
label_query("You're a wonderful person")

In [None]:
from langsmith.evaluation import evaluate


# row-level evaluator
def correct_label(run, example) -> dict:
    """Score one run: 1 if its output equals the example's reference label.

    Args:
        run: Object whose ``outputs`` is a dict with an ``"output"`` entry,
            or ``None`` (treated as "no prediction").
        example: Object whose ``outputs`` is a dict with a ``"label"`` entry,
            or ``None`` (treated as "no reference label").

    Returns:
        ``{"score": 1}`` on an exact string match, otherwise ``{"score": 0}``.
    """
    # `outputs` can be missing entirely (assumed to happen when the target
    # function failed — TODO confirm); fall back to an empty dict instead of
    # raising AttributeError inside the evaluator.
    predicted = (run.outputs or {}).get("output")
    expected = (example.outputs or {}).get("label")
    # A missing prediction is never correct, even if the label is missing too
    # (otherwise None == None would count as a match).
    is_correct = predicted is not None and predicted == expected
    return {"score": int(is_correct)}


# summary (experiment-level) evaluator
def summary_eval(runs, examples):
    """Experiment-level check: pass when more than half the runs are correct.

    Args:
        runs: Sequence of run objects; each ``outputs`` is a dict with an
            ``"output"`` entry, or ``None``.
        examples: Sequence of example objects paired positionally with
            ``runs``; each ``outputs`` is a dict with a ``"label"`` entry.

    Returns:
        ``{"key": "pass", "score": bool}`` where ``score`` is ``True`` only if
        strictly more than 50% of ``runs`` match their example's label.
        An empty ``runs`` yields ``False`` rather than dividing by zero.
    """
    if not runs:
        # Nothing was evaluated, so the experiment cannot be said to pass.
        return {"key": "pass", "score": False}

    correct = 0
    for run, example in zip(runs, examples):
        # Runs/examples without outputs count as incorrect instead of raising.
        predicted = (run.outputs or {}).get("output")
        expected = (example.outputs or {}).get("label")
        if predicted is not None and predicted == expected:
            correct += 1

    # The denominator stays len(runs) so unmatched runs lower the accuracy.
    return {"key": "pass", "score": correct / len(runs) > 0.5}


# Experiment 1: evaluate the first prompt (label_query) against the dataset
# named by toxic_dataset_name. The lambda adapts the example's input dict to
# the single `text` argument. The evaluators read the prediction from
# run.outputs["output"] (presumably where evaluate() stores a plain return
# value — TODO confirm).
result_1 = evaluate(
    lambda inputs: label_query(inputs["text"]),
    data=toxic_dataset_name,
    evaluators=[correct_label],
    summary_evaluators=[summary_eval],
    experiment_prefix="Toxic Queries",
    # Tag the experiment so the two prompt variants can be told apart.
    metadata={
        "prompt_version": "1"
    },
)

In [None]:
# Experiment 2: same dataset, evaluators and experiment prefix as the first
# experiment, but the target calls label_query_alternate_prompt and the
# metadata records prompt_version "2".
result_2 = evaluate(
    lambda inputs: label_query_alternate_prompt(inputs["text"]),
    data=toxic_dataset_name,
    evaluators=[correct_label],
    summary_evaluators=[summary_eval],
    experiment_prefix="Toxic Queries",
    # Tag the experiment so the two prompt variants can be told apart.
    metadata={
        "prompt_version": "2"
    },
)