# LangSmith Evaluation Deep Dive

Reference: https://github.com/langchain-ai/langsmith-cookbook/blob/main/introduction/langsmith_introduction.ipynb

## 手動によるデータセットの作成

In [None]:
import pandas as pd

# Hand-written question/answer pairs about DBRX used as the evaluation set.
inputs = [
    "How many tokens was DBRX pre-trained on?",
    "Is DBRX a MOE model and how many parameters does it have?",
    "How many GPUs was DBRX trained on and what was the connectivity between GPUs?",
]

outputs = [
    "DBRX was pre-trained on 12 trillion tokens of text and code data.",
    "Yes, DBRX is a fine-grained mixture-of-experts (MoE) architecture with 132B total parameters.",
    "DBRX was trained on 3072 NVIDIA H100s connected by 3.2Tbps Infiniband",
]

# Use the singular "question" key, matching the key used for the LangSmith
# examples and by the answer functions, so the tabular copy of the data
# shares one schema with everything else in this notebook.
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]
df = pd.DataFrame(qa_pairs)
df.head()

In [2]:
# Save the QA DataFrame to a CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the file.
csv_path = "./DBRX_eval.csv"
df.to_csv(csv_path, index=False)

In [None]:
from langsmith import Client

# Client() is built with no arguments — presumably it reads the API key and
# endpoint from the environment; confirm before running.
client = Client()
dataset_name = "DBRX"

# Create a LangSmith dataset with this name and description.
# NOTE(review): re-running this cell likely fails if a dataset named "DBRX"
# already exists — confirm.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="QA pairs about DBRX model.",
)
dataset

In [None]:
# Display the id of the created dataset; create_examples takes it as dataset_id.
dataset.id

In [9]:
# Add one example per QA pair to the dataset: each input is {"question": ...}
# and the matching reference output is {"answer": ...}.
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

In [10]:
# One more QA pair, added to the same dataset after its initial creation.
new_questions = [
    "What is the context window of DBRX Instruct?",
]

new_answers = [
    "DBRX Instruct was trained with up to a 32K token context window.",
]

# Same {"question": ...} / {"answer": ...} shape as the first batch of examples.
client.create_examples(
    inputs=[{"question": q} for q in new_questions],
    outputs=[{"answer": a} for a in new_answers],
    dataset_id=dataset.id,
)

## 質問に対する回答を生成

In [11]:
import os

# Set LANGCHAIN_PROJECT for this process — presumably the LangSmith project
# that subsequent traces are logged under; confirm.
os.environ["LANGCHAIN_PROJECT"] = "DBRX"

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the DBRX announcement post and keep only the text of its <p>
# elements; full_text is the context the answer functions put in their prompts.
url = "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm"
# requests applies no timeout by default, so bound the wait explicitly.
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error rather than silently using an error page as context.
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")
text = [p.text for p in soup.find_all("p")]
full_text = "\n".join(text)

# Preview the first 1000 characters to sanity-check the scrape.
print(full_text[:1000])

In [14]:
import openai
from langsmith.wrappers import wrap_openai

# Wrap the OpenAI client with LangSmith's wrap_openai — presumably so that
# calls made through it are traced; confirm.
openai_client = wrap_openai(openai.Client())

In [15]:
def answer_dbrx_question_oai(inputs: dict) -> dict:
    """Answer a DBRX question with gpt-4o-mini, using the scraped post as context.

    Args:
        inputs: Mapping with a "question" key holding the user's question.

    Returns:
        A dict with a single "answer" key holding the model's reply text.

    Raises:
        KeyError: If ``inputs`` has no "question" key.
    """
    # The whole scraped article (module-level full_text) is placed in the
    # system prompt.
    system_msg = (
        f"Answer user questions in 2-3 sentences about this context: \n\n\n {full_text}"
    )

    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": inputs["question"]},
    ]

    response = openai_client.chat.completions.create(
        messages=messages,
        model="gpt-4o-mini",
    )

    # Read the typed response attributes directly instead of converting the
    # whole response with the deprecated pydantic `.dict()` method.
    return {"answer": response.choices[0].message.content}

In [None]:
# Try the function on a question that is not one of the dataset examples.
answer_dbrx_question_oai({"question": "What are the main differences in training efficiency between MPT-7B vs DBRX?"})

In [None]:
# Try the function on the first question from the dataset.
answer_dbrx_question_oai({"question": "How many tokens was DBRX pre-trained on?"})

## LLM-as-a-Judgeによる評価

In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator

# Use the "cot_qa" LangChain string evaluator — presumably an LLM grader that
# compares each answer with the reference answer; confirm.
qa_evaluator = [LangChainStringEvaluator("cot_qa")]
dataset_name = "DBRX"

# Run the OpenAI answer function over the "DBRX" dataset and score it with the
# evaluator above; the prefix and metadata label the resulting experiment.
experiment_results = evaluate(
    answer_dbrx_question_oai,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-dbrx-qa-oai",
    metadata={
        "variant": "stuff website context into gpt-4o-mini",
    }
)

## Custom evaluator

In [None]:
from langsmith.schemas import Run, Example


def is_answered(run: Run, example: Example) -> dict:
    """Score whether the run produced a non-empty answer.

    Args:
        run: The run whose outputs are checked.
        example: The dataset example for the run; not used by this check.

    Returns:
        ``{"key": "is_answered", "score": 1}`` when the run's outputs contain
        a truthy "answer", otherwise the same dict with a score of 0.
    """
    # A run with no outputs (None, e.g. the target raised) counts as
    # "not answered" instead of failing with AttributeError on `.get`.
    outputs = run.outputs or {}
    return {"key": "is_answered", "score": 1 if outputs.get("answer") else 0}


# Replace the evaluator list with the custom is_answered check only.
qa_evaluator = [is_answered]
dataset_name = "DBRX"

# Re-run the OpenAI answer function over the "DBRX" dataset, scored by the
# custom evaluator, under a separate experiment prefix.
experiment_results = evaluate(
    answer_dbrx_question_oai,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-dbrx-qa-custom-eval-is-answered",
    metadata={
        "variant": "stuff website context into gpt-4o-mini",
    }
)

## Mistralとの比較

In [28]:
import ollama
from langsmith.run_helpers import traceable


@traceable(run_type="llm")
def call_ollama(messages, model: str):
    """Stream a chat reply from Ollama, echo it as it arrives, and return it whole.

    Args:
        messages: Chat messages passed through to ``ollama.chat``.
        model: Name of the Ollama model to query.

    Returns:
        The concatenation of every streamed chunk's message content.
    """
    pieces = []
    for part in ollama.chat(messages=messages, model=model, stream=True):
        token = part["message"]["content"]
        # Echo each piece immediately so progress is visible while streaming.
        print(token, end="", flush=True)
        pieces.append(token)
    return "".join(pieces)


def answer_dbrx_question_mistral(inputs: dict) -> dict:
    """Answer a DBRX question with the local "mistral" model via ``call_ollama``.

    Args:
        inputs: Mapping with a "question" key holding the user's question.

    Returns:
        A dict with a single "answer" key holding the model's full reply.
    """
    # The scraped article goes in the system prompt; the length instruction
    # is placed in the user turn together with the question.
    context_prompt = f"Answer user questions about this context: \n\n\n {full_text}"
    question = inputs["question"]
    user_prompt = f"Answer the question in 2-3 sentences {question}"

    reply = call_ollama(
        [
            {"role": "system", "content": context_prompt},
            {"role": "user", "content": user_prompt},
        ],
        model="mistral",
    )
    return {"answer": reply}

In [None]:
# Try the Mistral-backed function on a single question and show the result.
result = answer_dbrx_question_mistral({"question": "What are the main differences in training efficiency between MPT-7B vs DBRX?"})
result

In [None]:
# Evaluate the Mistral answer function with the same "cot_qa" evaluator and
# the same "DBRX" dataset used for the OpenAI run.
qa_evaluator = [LangChainStringEvaluator("cot_qa")]
dataset_name = "DBRX"

# The experiment prefix and metadata mark this run as the Mistral variant.
experiment_results = evaluate(
    answer_dbrx_question_mistral,
    data=dataset_name,
    evaluators=qa_evaluator,
    experiment_prefix="test-dbrx-qa-mistral",
    metadata={
        "variant": "stuff website context into mistral",
    }
)