# MuSiQue evaluation utilities

In [1]:
#|default_exp musique.eval

In [2]:
#|hide
from fastcore.test import *
from nbdev.showdoc import *

In [3]:
#|export

import evaluate
import pandas as pd

from bellek.text.utils import fuzzy_match

In [4]:
#|export

def fuzzy_match_metric(prediction: str, references: list[str]) -> float:
    """Return the best fuzzy-match score of `prediction` over all `references`.

    Each reference is compared with `fuzzy_match` and the outcome is cast to
    `float`; the maximum over the references is returned.

    Args:
        prediction: The predicted answer.
        references: Acceptable reference answers.

    Returns:
        The highest per-reference score, or `0.0` when `references` is empty
        (nothing to match against) instead of raising `ValueError`.
    """
    # A generator plus `default` keeps the empty-references case well defined.
    return max((float(fuzzy_match(prediction, ref)) for ref in references), default=0.0)

In [5]:
#|export

# Load the "bdsaglam/musique" metric once, when the module is imported, so that
# every `compute_scores` call reuses the same metric object.
musique_metric = evaluate.load("bdsaglam/musique")


def compute_scores(predicted_answer: str, reference_answers: list[str]) -> dict:
    """Score one predicted answer against its reference answers.

    The result holds whatever `musique_metric.compute` returns for this single
    example, plus a `"fuzzy_match"` entry from `fuzzy_match_metric`.
    """
    scores = dict(
        musique_metric.compute(
            predictions=[predicted_answer],
            references=[reference_answers],
        )
    )
    # Stored under a distinct local name so the imported `fuzzy_match` helper
    # is not shadowed inside this function.
    scores["fuzzy_match"] = fuzzy_match_metric(predicted_answer, reference_answers)
    return scores

In [6]:
# Example: a misspelled prediction ("Alexandre") is not an exact match for
# either reference, yet the fuzzy metric still accepts it (see the output below).
scores = compute_scores("Alexandre the Great", ["Alexander the Great", "Great Alexander"])
scores

{'exact_match': 0.0, 'f1': 0.5, 'fuzzy_match': 1.0}

In [7]:
#|export

def calculate_metrics(dataf: pd.DataFrame) -> dict:
    prediction_list = dataf["predicted_answer"].tolist()
    references_list = dataf["answers"].tolist()
    scores_list = [compute_scores(prediction, references) for prediction, references in zip(prediction_list, references_list)]
    return pd.DataFrame(scores_list).mean().to_dict()

In [8]:
#|export

def _is_missing(value) -> bool:
    """Return True for `None` or a float NaN (how pandas marks a missing cell)."""
    return value is None or (isinstance(value, float) and value != value)


def _exact_match(example) -> bool:
    """Return True when the prediction equals at least one reference answer.

    `example` is any mapping-like row exposing `'predicted_answer'` and
    `'answers'`. A missing prediction (`None` or NaN) never matches.
    """
    pred = example['predicted_answer']
    if _is_missing(pred):
        return False
    return any(pred == ref for ref in example['answers'])


def _fuzzy_match(example) -> bool:
    """Return True when the prediction loosely matches at least one reference.

    A reference matches when one string contains the other or when
    `fuzzy_match` accepts the pair. A missing prediction (`None` or NaN)
    never matches.
    """
    pred = example['predicted_answer']
    if _is_missing(pred):
        return False
    answers = example['answers']
    if pred == "":
        # `"" in ref` is always true, so an empty prediction used to match
        # every reference; only an (equally empty) reference may match it.
        return any(ref == "" for ref in answers)
    return any(
        # Skip the containment shortcut for empty references for the same
        # reason: `"" in pred` is always true.
        (ref != "" and ((pred in ref) or (ref in pred))) or fuzzy_match(pred, ref)
        for ref in answers
    )


def compare_answers(dataf: pd.DataFrame) -> pd.DataFrame:
    """Add boolean `exact_match` and `fuzzy_match` columns to `dataf`.

    Reads the `predicted_answer` and `answers` columns row by row. The frame
    is modified in place and also returned, so the call can be chained.

    Args:
        dataf: Frame with a `predicted_answer` column and an `answers` column
            holding an iterable of reference answers per row.

    Returns:
        The same `dataf` object with the two new columns.
    """
    examples = [
        {'predicted_answer': pred, 'answers': refs}
        for pred, refs in zip(dataf['predicted_answer'].tolist(), dataf['answers'].tolist())
    ]
    # Build the columns explicitly rather than via `DataFrame.apply(axis=1)`:
    # on an empty frame `apply` does not reliably yield a Series, which makes
    # the column assignment fail.
    dataf['exact_match'] = pd.Series(
        [_exact_match(example) for example in examples], index=dataf.index, dtype=bool
    )
    dataf['fuzzy_match'] = pd.Series(
        [_fuzzy_match(example) for example in examples], index=dataf.index, dtype=bool
    )
    return dataf

In [9]:
#|hide
import nbdev; nbdev.nbdev_export()