# MERA
MERA (Multimodal Evaluation for Russian-language Architectures) is a new open benchmark for evaluating foundation models on Russian-language tasks.

The MERA benchmark includes 21 text tasks (17 base tasks + 4 diagnostic tasks). See the [task-table](https://mera.a-ai.ru/tasks) for a complete list.

In [None]:
!pip install datasets

In [4]:
from datasets import load_dataset

DATASET_PATH = "ai-forever/MERA"

# Lowercase MERA task names mapped to the number of few-shot examples
# (n-shot) used when prompting for that task.
task_name_few_shot = {
    "chegeka": 4,
    "bps": 2,
    "lcs": 2,
    "mathlogicqa": 5,
    "ruworldtree": 5,
    "ruopenbookqa": 5,
    "simplear": 5,
    "rumultiar": 5,
    "rummlu": 5,
    "multiq": 0,
    "parus": 0,
    "rcb": 0,
    "rumodar": 0,
    "rwsd": 0,
    "use": 0,
    "rudetox": 0,
    "ruethics": 0,
    "ruhatespeech": 0,
    "ruhhh": 0,
    "rutie": 0,
    "ruhumaneval": 0,
}

# Only ruMMLU is evaluated here. The task is selected by name rather than by
# its position in the dict above, so adding or reordering entries cannot
# silently switch the evaluated task.
task_names = ["rummlu"]

In [None]:
!pip install openai

In [6]:
from openai import OpenAI

import os

# Take the key from the OPENAI_API_KEY environment variable instead of
# hard-coding it in the notebook. Passing an explicit empty string (as before)
# does not count as "not provided", so it would be used as the credential
# instead of the environment variable.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)
client

<openai.OpenAI at 0x7df94215e650>

In [5]:
import abc
import typing as tp

class Conversation(abc.ABC):
    """
    Minimal container for a chat history.

    Inspired by https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py

    Every message is stored in ``self.messages`` as a
    ``{"role": <role>, "content": <text>}`` dict, in the order it was appended.
    ``system_prompt`` and ``roles`` are only stored; this class does not add
    the system prompt to ``messages`` itself.
    """
    def __init__(self, system_prompt: str = "", roles: tp.Tuple[str, str] = ("user", "assistant")):
        self.system_prompt = system_prompt
        self.roles = roles
        self.messages: tp.List[tp.Dict[str, str]] = []

    def get_prompt(self) -> str:
        """
        Placeholder for rendering the conversation as a single prompt string.

        The base implementation is a no-op and returns ``None``; a subclass
        that needs a flat prompt is expected to override it.
        """

    def update_last_message(self, text: str) -> None:
        """
        Replace the content of the most recent message, keeping its role.

        Raises ``IndexError`` when no message has been appended yet.
        """
        last_message = self.messages[-1]
        # Messages are dicts (see append_message), so rebuild a dict of the
        # same shape instead of indexing the message positionally.
        self.messages[-1] = {"role": last_message["role"], "content": text}

    def append_message(self, role: str, text: str) -> None:
        """Append a new message with the given role and text to the history."""
        self.messages.append({"role": role, "content": text})

In [17]:
from tqdm.auto import tqdm
import numpy as np
import json
import pathlib

# Folder that collects the result files written by this notebook. It is
# created together with any missing parent folders; an already existing
# folder is left untouched.
output_dir = "mera_openai"
pathlib.Path(output_dir).mkdir(exist_ok=True, parents=True)

def append_to_jsonl(data: list, filename: str) -> None:
    """
    Append ``data`` to ``filename`` as a single JSON-encoded line.

    The file is opened in append mode, so it is created when missing and
    earlier lines are preserved.
    """
    with open(filename, mode="a") as out_file:
        encoded = json.dumps(data)
        out_file.write(f"{encoded}\n")

def get_prompt_from_dataframes(task_name: str, row, dev_df, k_shot: int):
    """
    Build the chat messages for a single test example.

    For the "rummlu" task, up to ``k_shot`` solved examples from ``dev_df``
    whose ``meta["domain"]`` equals the row's domain are rendered (question,
    the four options and the answer) and prepended to the question text. The
    row's ``instruction`` template is then filled with the inputs.

    Args:
        task_name: Lowercase task name; only "rummlu" receives few-shot examples.
        row: One example, read via ``row["instruction"]``, ``row["inputs"]``
            and ``row["meta"]``. It is not modified.
        dev_df: Pool of solved examples with "inputs", "outputs" and "meta"
            columns, or ``None`` when there is no such pool (the prompt is
            then built without examples).
        k_shot: Maximum number of few-shot examples, between 0 and 5.

    Returns:
        The ``messages`` list of a fresh ``Conversation`` holding one message
        with the first role and the rendered prompt.
    """
    assert 0 <= k_shot <= 5
    inputs = row["inputs"]

    if task_name == "rummlu" and dev_df is not None and k_shot > 0:
        row_domain = row["meta"]["domain"]
        examples = []
        # Walk the pool in order and stop as soon as enough same-domain
        # examples are collected (the first k_shot matches, in pool order).
        for example_inputs, answer, meta in zip(dev_df["inputs"], dev_df["outputs"], dev_df["meta"]):
            if meta["domain"] != row_domain:
                continue
            examples.append(
                "{text}\nA) {option_a}\nB) {option_b}\nC) {option_c}\nD) {option_d}\nОтвет:".format(**example_inputs)
                + answer
            )
            if len(examples) == k_shot:
                break
        if examples:
            # Work on a copy: the dict inside `row` can be the very object
            # stored in the caller's DataFrame, and mutating it would make a
            # second pass over the same data prepend the examples again.
            inputs = {**inputs, "text": "\n".join(examples) + "\n" + inputs["text"]}

    if isinstance(inputs, dict):
        row_prompt = row["instruction"].format(**inputs)
    else:
        row_prompt = row["instruction"].format(inputs=inputs)

    conv = Conversation()
    conv.append_message(conv.roles[0], row_prompt)
    return conv.messages

# Name of the result file (without extension) for every task this cell knows
# how to score. Tasks that are missing here are skipped up front.
task_out_filenames = {"rummlu": "ruMMLU"}

# Answer letters accepted from the model; the first one doubles as the
# fallback when the model returns anything else.
answer_options = ("A", "B", "C", "D")

for task_name in task_names:
    task_out_filename = task_out_filenames.get(task_name)
    if task_out_filename is None:
        # Without an output file name there is nowhere to store predictions.
        # Skip before any API request is made, instead of failing at the end
        # or overwriting the file of a previously processed task.
        print("skipping unsupported task", task_name)
        continue

    k_shot = task_name_few_shot[task_name]
    ds = load_dataset(DATASET_PATH, task_name)
    test_df = ds["test"].to_pandas()

    # Few-shot examples come from the train split when the task has one.
    dev_df = ds["train"].to_pandas() if "train" in ds else None

    js_out = {"data": {"test": []}}
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=task_name):
        input_messages = get_prompt_from_dataframes(task_name, row, dev_df, k_shot)
        chat_params = {
            "model": "gpt-3.5-turbo-1106",
            "messages": input_messages,
            # A single token is requested because the answer is one letter.
            "max_tokens": 1,
        }
        chat_completion = client.chat.completions.create(**chat_params)

        # The content may be missing or padded with whitespace; normalize it
        # before comparing against the answer letters.
        pred = (chat_completion.choices[0].message.content or "").strip()
        if pred in answer_options:
            y_pred = pred
        else:
            # Reported for any reply that is not one of the answer letters;
            # the first option is then recorded as the prediction.
            print("empty response", row["meta"]["id"])
            y_pred = answer_options[0]

        js_out["data"]["test"].append({"outputs": y_pred, "meta": {"id": row["meta"]["id"]}})

    jsonl_filepath = str(pathlib.Path(output_dir) / f"{task_out_filename}.json")
    with open(jsonl_filepath, "w") as f:
        f.write(json.dumps(js_out, indent=4))

rummlu:   0%|          | 0/961 [00:00<?, ?it/s]

|Модель| Задача| Результат  | Метрика  |
|---|---|---|---|
|  Human Benchmark | ruMMLU |  **0.898** |  Accuracy |
|  gpt-3.5-turbo-1106 | ruMMLU |  **0.743** |  Accuracy |