In [5]:
import os
from typing import List, Union
from tqdm import tqdm
from databench_eval import Evaluator, utils
import pandas as pd

In [6]:
class CustomEvaluator(Evaluator):
    """Evaluator that reports accuracy per answer type as well as overall.

    ``eval`` returns a dict with an ``"average"`` entry plus one entry for each
    answer type listed in ``_SEMANTIC_TYPES``. The raw per-item comparison
    results are kept on ``self.evals`` under the same keys.
    """

    # Answer types that get their own accuracy entry (in output order).
    _SEMANTIC_TYPES = (
        "category",
        "boolean",
        "number",
        "list[category]",
        "list[number]",
    )

    def eval(
        self,
        responses: Union[List[str], str],
        lite: bool = False,
    ) -> dict:
        """Score ``responses`` against the reference answers in ``self.qa``.

        Args:
            responses: Either the responses themselves, or the path of a text
                file holding one response per line.
            lite: When true, compare against ``self.qa["sample_answer"]``
                instead of ``self.qa["answer"]``.

        Returns:
            A dict mapping ``"average"`` and each answer type to the fraction
            of correct responses. ``"average"`` is divided by the number of
            reference answers; each per-type entry is divided by the number
            of evaluated items of that type. An entry whose denominator is
            zero is ``nan`` rather than raising ``ZeroDivisionError``.

        Raises:
            KeyError: If ``self.qa["type"]`` contains a type that is not in
                ``_SEMANTIC_TYPES``.
        """
        if isinstance(responses, str):
            # Explicit encoding so the result does not depend on the locale.
            with open(responses, "r", encoding="utf-8") as f:
                responses = f.read().splitlines()

        truths = self.qa["answer"] if not lite else self.qa["sample_answer"]

        keys = ("average",) + self._SEMANTIC_TYPES
        correct = {key: 0 for key in keys}
        evals = {key: [] for key in keys}

        for response, truth, semantic in tqdm(
            zip(responses, truths, self.qa["type"]), total=len(truths)
        ):
            # Compare exactly once per item so the counters and the stored
            # per-item results can never disagree.
            truthy = self.compare(response, truth, semantic)
            if truthy:
                correct["average"] += 1
                correct[semantic] += 1
            evals["average"].append(truthy)
            evals[semantic].append(truthy)
        self.evals = evals

        def ratio(hits: int, total: int) -> float:
            # An empty bucket has no defined accuracy; report nan, not a crash.
            return hits / total if total else float("nan")

        scores = {"average": ratio(correct["average"], len(truths))}
        for key in self._SEMANTIC_TYPES:
            scores[key] = ratio(correct[key], len(evals[key]))
        return scores

In [7]:
# Load the "dev" split of the "semeval" question/answer set and build the
# evaluator that scores responses against it.
qa = utils.load_qa(
    split="dev",
    name="semeval",
)
evaluator = CustomEvaluator(qa=qa)

In [8]:
input_file = "logs/"
# Text that separates consecutive entries inside a log file.
ENTRY_SEPARATOR = "--------------------"

logs = []
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `logs` (and of anything derived from it) reproducible between runs.
for filename in sorted(os.listdir(input_file)):
    if not filename.endswith(".txt"):
        continue

    # Read the whole file first so it is closed before evaluation starts.
    with open(os.path.join(input_file, filename), "r", encoding="utf-8") as f:
        segments = f.read().split(ENTRY_SEPARATOR)

    # The first line of the file carries the model name after a "Model:" label.
    model = segments[0].split("\n")[0].replace("Model:", "").strip()

    # Every segment before the final separator must hold a "Response:" marker
    # (IndexError otherwise). maxsplit=1 keeps responses that themselves
    # contain the text "Response:" intact.
    *entries, tail = segments
    responses = [entry.split("Response:", 1)[1].strip() for entry in entries]
    # Whatever follows the last separator is only a response if it says so;
    # trailing whitespace or footer text must not be scored as an answer.
    if "Response:" in tail:
        responses.append(tail.split("Response:", 1)[1].strip())

    # Logs whose name ends in "lite.txt" are scored against the sample answers.
    is_lite = filename.endswith("lite.txt")
    accuracy = evaluator.eval(responses, lite=is_lite)
    accuracy["log"] = filename.replace(".txt", "")
    accuracy["task"] = "Task 2" if is_lite else "Task 1"
    accuracy["model"] = model
    logs.append(accuracy)

100%|██████████| 320/320 [00:00<00:00, 18012.90it/s]
100%|██████████| 320/320 [00:00<00:00, 21767.04it/s]
100%|██████████| 320/320 [00:00<00:00, 20554.96it/s]
100%|██████████| 320/320 [00:00<00:00, 26300.70it/s]
100%|██████████| 320/320 [00:00<00:00, 9692.00it/s]
100%|██████████| 320/320 [00:00<00:00, 8789.87it/s]
100%|██████████| 320/320 [00:00<00:00, 19613.87it/s]
100%|██████████| 320/320 [00:00<00:00, 22329.80it/s]
100%|██████████| 320/320 [00:00<00:00, 53233.54it/s]
100%|██████████| 320/320 [00:00<00:00, 12280.88it/s]
100%|██████████| 320/320 [00:00<00:00, 24544.69it/s]
100%|██████████| 320/320 [00:00<00:00, 20729.57it/s]
100%|██████████| 320/320 [00:00<00:00, 22941.24it/s]
100%|██████████| 320/320 [00:00<00:00, 14800.76it/s]
100%|██████████| 320/320 [00:00<00:00, 11541.44it/s]
100%|██████████| 320/320 [00:00<00:00, 21133.99it/s]
100%|██████████| 320/320 [00:00<00:00, 17339.22it/s]
100%|██████████| 320/320 [00:00<00:00, 21752.22it/s]
100%|██████████| 320/320 [00:00<00:00, 20338.18i

In [10]:
# Column order of the exported table: identification first, then the scores.
columns = [
    "log",
    "model",
    "task",
    "average",
    "boolean",
    "category",
    "number",
    "list[category]",
    "list[number]",
]

df = pd.DataFrame(logs)
# sort_values returns a new frame; keep the result so the CSV is written in
# sorted order instead of silently discarding the sort.
df = df[columns].sort_values(by=["log"])
df.to_csv("evaluations.csv", index=False)