## Building an ECN Eval

This notebook shows how to build and run an eval, following the instructions in `docs/build-eval.md`.

Inspiration taken from https://github.com/alan-eu/evals/blob/main/examples/mmlu.ipynb

In [None]:
import os


# The registry location is derived from the working directory, which is
# assumed to be examples/ (i.e. the folder this notebook lives in).
notebook_dir = os.getcwd()
registry_path = os.path.join(notebook_dir, "../evals/registry")

In [None]:
import datasets

# Read the Hugging Face access token from the environment rather than pasting
# it into the notebook, so it cannot leak through version control or shared
# copies of this file. Export HF_TOKEN before starting the kernel.
# NOTE(review): when HF_TOKEN is unset this passes token=None, which is expected
# to make `datasets` fall back to the locally stored login token — confirm
# against the installed `datasets` version.
HF_TOKEN = os.environ.get("HF_TOKEN")
dataset = datasets.load_dataset("Alan-health/ecn", token=HF_TOKEN)

In [None]:

# System prompt (in French) sent with every question: it sets the medical-expert
# role and fixes the answer format (a single letter, or comma-separated letters
# when several answers are correct).
# NOTE(review): "peuvents" and "example" look like typos for "peuvent" and
# "exemple"; they are kept verbatim because altering the prompt would change
# eval results.
sys_msg = (
    "Vous êtes un expert médical. "
    "Vous devez répondre à des questions médicales à choix multiple. "
    "Choisissez les réponses correctes parmi les réponses proposées, "
    "indiquées par les lettres A, B, C, D ou E. "
    "Si la bonne réponse est la réponse A, répondez simplement 'A'. "
    "Les questions peuvents avoir plusieurs réponses. "
    "Dans ce cas, répondez avec toutes les réponses, séparées par une virgule, "
    "par example 'B,D,E'."
)


def _format_question(sample):
    """Render one question and its five lettered answers as prompt text.

    Reads ``sample['question']`` and ``sample['answer_A']`` through
    ``sample['answer_E']``. The result has one line for the question, one
    ``<letter>: <answer>`` line per answer, and a final ``Réponse:`` cue.
    """
    lines = [f"{sample['question']}"]
    lines.extend(f"{letter}: {sample['answer_' + letter]}" for letter in "ABCDE")
    lines.append("Réponse:")
    return "\n".join(lines)


def create_chat_prompt(sample):
    """Build the chat messages used as the eval input for one question.

    Returns a two-message list: the module-level ``sys_msg`` as the system
    message, followed by the formatted question as the user message.
    """
    return [
        {"role": "system", "content": sys_msg},
        {"role": "user", "content": _format_question(sample)},
    ]


def create_chat_example(sample):
    """
    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting

    Returns two ``system``-role messages: the formatted question tagged
    ``example_user`` and the value of ``sample['Correct answer']`` tagged
    ``example_assistant``.
    """
    return [
        {"role": "system", "content": _format_question(sample), "name": "example_user"},
        {"role": "system", "content": sample["Correct answer"], "name": "example_assistant"},
    ]

In [None]:
import yaml
import json

# Number of few-shot examples: used both for how many rows are written to
# few_shot.jsonl and for the `num_few_shot` argument in the registry entry,
# so the two can never drift apart.
NUM_FEW_SHOT = 4


def _write_jsonl(path, rows):
    """Write every row in `rows` to `path` as one JSON object per line.

    The file is UTF-8 encoded and non-ASCII characters are written as-is
    (``ensure_ascii=False``) rather than as ``\\u`` escapes.
    """
    with open(path, "w", encoding="utf-8") as f:
        for row in rows:
            f.write(json.dumps(row, ensure_ascii=False) + "\n")


registry_yaml = {}

# All generated data files for this eval live under <registry>/data/ecn.
subject_path = os.path.join(registry_path, "data", "ecn")
os.makedirs(subject_path, exist_ok=True)

# Create few-shot prompts: questions from 2019 that have no picture attached.
few_shot_dataset = (
    dataset["train"]
    .filter(lambda s: s["question_year"] == 2019 and not s["Has picture"])
    .map(lambda example: {"sample": create_chat_example(example), **example})
)
# Fail with a clear message instead of an index error inside `select`.
assert len(few_shot_dataset) >= NUM_FEW_SHOT, (
    f"need at least {NUM_FEW_SHOT} few-shot examples, found {len(few_shot_dataset)}"
)
few_shot_path = os.path.join(subject_path, "few_shot.jsonl")
_write_jsonl(few_shot_path, few_shot_dataset.select(range(NUM_FEW_SHOT)))

# Create test prompts and ideal completions: questions from 2020 without a
# picture, so the test set is disjoint from the few-shot examples above.
eval_dataset = (
    dataset["train"]
    .filter(lambda s: s["question_year"] == 2020 and not s["Has picture"])
    .map(
        lambda example: {
            "input": create_chat_prompt(example),
            "ideal": example["Correct answer"],
            **example,
        }
    )
)
samples_path = os.path.join(subject_path, "samples.jsonl")
_write_jsonl(samples_path, eval_dataset)

eval_id = "match_ecn"
versioned_eval_id = f"{eval_id}.test.v1"

# Top-level entry: maps the eval name to its versioned id and reported metrics.
registry_yaml[eval_id] = {
    "id": versioned_eval_id,
    "metrics": ["accuracy", "total_score", "average_score", "valid_format"],
}
# Versioned entry: the eval class to run and the data files it reads.
registry_yaml[versioned_eval_id] = {
    "class": "evals.elsuite.ecn:Ecn",
    "args": {
        "samples_jsonl": samples_path,
        "few_shot_jsonl": few_shot_path,
        "num_few_shot": NUM_FEW_SHOT,
    },
}

# The registry file is named after the eval id.
evals_dir = os.path.join(registry_path, "evals")
os.makedirs(evals_dir, exist_ok=True)
yaml_path = os.path.join(evals_dir, f"{eval_id}.yaml")
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(registry_yaml, f)

In [None]:
# Run the `match_ecn` eval (the name registered in the YAML written by the
# previous cell) against the gpt-3.5-turbo model via the `oaieval` CLI.
# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs
# NOTE(review): presumably requires the OpenAI API key to be available in the
# environment — confirm before running.
!oaieval gpt-3.5-turbo match_ecn