In [2]:
import re
import string
import collections
from typing import Callable
import evaluate


def compute(predictions, references):
    """Average exact-match and F1 over prediction/reference pairs.

    Predictions and references are paired positionally. For each pair the
    best score over that pair's reference answers is taken, and the
    per-pair scores are then averaged.

    Args:
        predictions: Sequence of predicted answers.
        references: Sequence of the same length; item ``i`` is the
            collection of acceptable answers for ``predictions[i]``.

    Returns:
        A dict with the keys ``"exact_match"`` and ``"f1"``. Both are
        ``0.0`` when there are no predictions.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    n_predictions = len(predictions)
    if n_predictions != len(references):
        raise ValueError(
            "The number of predictions and references should be the same."
        )

    if not n_predictions:
        return {"exact_match": 0.0, "f1": 0.0}

    def _mean_best(metric_fn):
        # Note: metric_max_over_ground_truths calls metric_fn(prediction,
        # ground_truth) while the metrics name their parameters
        # (a_gold, a_pred). Equality and F1 are both symmetric in their
        # two arguments, so the scores are unaffected.
        best_scores = [
            metric_max_over_ground_truths(metric_fn, prediction, reference)
            for prediction, reference in zip(predictions, references)
        ]
        return sum(best_scores) / len(best_scores)

    return {
        "exact_match": _mean_best(compute_exact),
        "f1": _mean_best(compute_f1),
    }


# Source: https://github.com/StonyBrookNLP/musique/blob/main/metrics/answer.py


def normalize_answer(s):
    """Canonicalise an answer string so that answers can be compared.

    The steps run in this order: lowercase the text, delete every
    character found in ``string.punctuation``, replace the standalone
    words "a", "an" and "the" with a space, then collapse whitespace runs
    to single spaces and trim both ends.
    """
    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    punctuation_table = str.maketrans("", "", string.punctuation)

    text = s.lower()
    # Punctuation is deleted rather than replaced by a space, so a
    # hyphenated word such as "north-east" becomes the token "northeast".
    text = text.translate(punctuation_table)
    text = article_pattern.sub(" ", text)
    return " ".join(text.split())


def get_tokens(s):
    """Return the whitespace-separated tokens of the normalised answer.

    Falsy input (for example ``None`` or ``""``) yields an empty list
    without being passed to ``normalize_answer``.
    """
    return normalize_answer(s).split() if s else []


def compute_exact(a_gold, a_pred):
    """Return 1 if both answers are equal after normalisation, otherwise 0."""
    normalized_gold = normalize_answer(a_gold)
    normalized_pred = normalize_answer(a_pred)
    return 1 if normalized_gold == normalized_pred else 0


def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both answers are tokenised with ``get_tokens``. Overlap is counted as
    a multiset intersection, so a repeated token only matches as many
    times as it appears on both sides.

    Returns:
        ``1`` or ``0`` (int) when either side has no tokens, depending on
        whether both are empty; ``0`` (int) when no token is shared;
        otherwise the harmonic mean of precision and recall as a float.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise.
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)

    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if not num_same:
        return 0

    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)


def metric_max_over_ground_truths(
    metric_fn: Callable[[str, str], float],
    prediction: str,
    ground_truths: list[str],
) -> float:
    """Return the best score of ``prediction`` against any reference answer.

    Args:
        metric_fn: Scoring function, called as
            ``metric_fn(prediction, ground_truth)``.
        prediction: The predicted answer.
        ground_truths: The acceptable reference answers. A bare string is
            treated as a single reference answer instead of being iterated
            character by character.

    Returns:
        The maximum of ``metric_fn`` over all reference answers.

    Raises:
        ValueError: If ``ground_truths`` contains no reference answer.
    """
    # A plain string is itself iterable; without this guard every character
    # would be scored as a separate reference and the result would be
    # silently wrong.
    if isinstance(ground_truths, str):
        ground_truths = [ground_truths]

    scores_for_ground_truths = [
        metric_fn(prediction, ground_truth) for ground_truth in ground_truths
    ]
    if not scores_for_ground_truths:
        # Same exception type that max() raised on an empty list, but with a
        # message that names the actual problem.
        raise ValueError(
            "ground_truths must contain at least one reference answer."
        )
    return max(scores_for_ground_truths)




In [4]:
from bellem.text.utils import fuzzy_match

In [1]:
import pandas as pd

In [3]:
# Load the comparison records from a JSON Lines file (one JSON record per
# line) into a DataFrame and preview the first rows.
df = pd.read_json("./comparisons.jsonl", orient='records', lines=True)
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers,predicted_answer,exact_match,fuzzy_match
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[Caspian Sea, in the north-east of the country...",Golestan Province,False,False
1,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True,[Kill Rock Stars],Not applicable.,False,False
2,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True,"[Attic, Attic Records]",Attic Records,True,True
3,2hop__358582_189042,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What is the record label of the Thrill of a Li...,"[{'id': 358582, 'question': 'Thrill of a Lifet...",New Renaissance Records,[New Renaissance Records],True,[New Renaissance Records],Capitol Records,False,False
4,2hop__341176_711757,"[{'idx': 0, 'title': 'Gmina Pabianice', 'parag...",What other district is found in the same count...,"[{'id': 341176, 'question': 'Gmina Stężyca, Lu...",Gmina Ryki,"[Ryki, Gmina Ryki]",True,"[Ryki, Gmina Ryki]",Gmina Stężyca,False,False


In [7]:
# Pick a single example row to sanity-check the metric on.
# Change `i` to inspect a different row; the lookup below follows it.
i = 0
row = df.iloc[i]
row

id                                                      2hop__131818_161450
paragraphs                [{'idx': 0, 'title': 'Maria Carrillo High Scho...
question                            Where is the Voshmgir District located?
question_decomposition    [{'id': 131818, 'question': 'Which state is Vo...
answer                    in the north-east of the country south of the ...
answer_aliases            [Caspian Sea, in the north-east of the country...
answerable                                                             True
answers                   [Caspian Sea, in the north-east of the country...
predicted_answer                                          Golestan Province
exact_match                                                           False
fuzzy_match                                                           False
Name: 0, dtype: object

In [8]:
# compute() takes parallel sequences, so the single prediction and its
# collection of acceptable answers are each wrapped in a one-element list.
compute([row['predicted_answer']], [row['answers']])

{'exact_match': 0.0, 'f1': 0.0}

In [9]:
# Score every row on its own (one prediction against that row's answers),
# then unpack the resulting metric dicts into the EM and F1 columns of df.
scores_series = df.apply(
    lambda record: compute([record["predicted_answer"]], [record["answers"]]),
    axis=1,
)
df["EM"] = scores_series.map(lambda scores: scores["exact_match"])
df["F1"] = scores_series.map(lambda scores: scores["f1"])

In [10]:
# Preview the frame again, now including the EM and F1 columns.
df.head()

Unnamed: 0,id,paragraphs,question,question_decomposition,answer,answer_aliases,answerable,answers,predicted_answer,exact_match,fuzzy_match,EM,F1
0,2hop__131818_161450,"[{'idx': 0, 'title': 'Maria Carrillo High Scho...",Where is the Voshmgir District located?,"[{'id': 131818, 'question': 'Which state is Vo...",in the north-east of the country south of the ...,"[Caspian Sea, in the north-east of the country...",True,"[Caspian Sea, in the north-east of the country...",Golestan Province,False,False,0.0,0.0
1,2hop__711946_269414,"[{'idx': 0, 'title': 'Wild Thing (Tone Lōc son...",What record label is the performer who release...,"[{'id': 711946, 'question': 'All Your Faded Th...",Kill Rock Stars,[Kill Rock Stars],True,[Kill Rock Stars],Not applicable.,False,False,0.0,0.0
2,2hop__311931_417706,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What record label does the performer of Emotio...,"[{'id': 311931, 'question': 'Emotional Rain >>...",Attic Records,"[Attic, Attic Records]",True,"[Attic, Attic Records]",Attic Records,True,True,1.0,1.0
3,2hop__358582_189042,"[{'idx': 0, 'title': 'The Main Attraction (alb...",What is the record label of the Thrill of a Li...,"[{'id': 358582, 'question': 'Thrill of a Lifet...",New Renaissance Records,[New Renaissance Records],True,[New Renaissance Records],Capitol Records,False,False,0.0,0.4
4,2hop__341176_711757,"[{'idx': 0, 'title': 'Gmina Pabianice', 'parag...",What other district is found in the same count...,"[{'id': 341176, 'question': 'Gmina Stężyca, Lu...",Gmina Ryki,"[Ryki, Gmina Ryki]",True,"[Ryki, Gmina Ryki]",Gmina Stężyca,False,False,0.0,0.5


In [18]:
# Boolean row masks (as NumPy arrays) used to slice the report below.
# EM "success" is any score above 0 and EM "fail" is any score below 1.
# F1 "fail" is a score below 0.1 rather than exactly 0, so a row with
# 0 < F1 < 0.1 satisfies both F1 masks.
em_success_mask = df["EM"].gt(0).to_numpy()
em_fail_mask = df["EM"].lt(1).to_numpy()
f1_success_mask = df["F1"].gt(0).to_numpy()
f1_fail_mask = df["F1"].lt(0.1).to_numpy()

In [27]:
# Build a slimmer frame for the qualitative report by dropping the columns
# listed here, then preview it.
report_df = df.drop(
    columns=[
        "paragraphs",
        "question",
        "question_decomposition",
        "answer",
        "answers",
        "answerable",
        "exact_match",
        "fuzzy_match",
    ]
)
report_df.head()

Unnamed: 0,id,answer_aliases,predicted_answer,EM,F1
0,2hop__131818_161450,"[Caspian Sea, in the north-east of the country...",Golestan Province,0.0,0.0
1,2hop__711946_269414,[Kill Rock Stars],Not applicable.,0.0,0.0
2,2hop__311931_417706,"[Attic, Attic Records]",Attic Records,1.0,1.0
3,2hop__358582_189042,[New Renaissance Records],Capitol Records,0.0,0.4
4,2hop__341176_711757,"[Ryki, Gmina Ryki]",Gmina Stężyca,0.0,0.5


In [48]:
# Three examples where both EM and F1 are above zero, printed as a markdown
# table without the id column. The sample is seeded so that re-running the
# notebook prints the same rows.
print(
    report_df.loc[em_success_mask & f1_success_mask]
    .sample(3, random_state=0)
    .drop(columns=["id"])
    .to_markdown(index=False)
)

| answer_aliases                     | predicted_answer   |   EM |   F1 |
|:-----------------------------------|:-------------------|-----:|-----:|
| ['Chao Phraya River']              | Chao Phraya River  |    1 |    1 |
| ['Joan Erikson']                   | Joan Erikson       |    1 |    1 |
| ['Chiang Hsiao-wu', 'Alex Chiang'] | Chiang Hsiao-wu    |    1 |    1 |


In [50]:
# Three partial-credit examples (EM below 1 but F1 above zero), printed as a
# markdown table. The sample is seeded so that re-running the notebook
# prints the same rows.
print(
    report_df.loc[em_fail_mask & f1_success_mask]
    .sample(3, random_state=0)
    .to_markdown(index=False)
)

| id                  | answer_aliases              | predicted_answer      |   EM |   F1 |
|:--------------------|:----------------------------|:----------------------|-----:|-----:|
| 2hop__223655_463572 | ['MGM', 'MGM Records']      | Jasmine Records       |    0 |  0.5 |
| 2hop__574419_270105 | ['Stanley']                 | Stanley, North Dakota |    0 |  0.5 |
| 2hop__358582_189042 | ['New Renaissance Records'] | Capitol Records       |    0 |  0.4 |


In [32]:
# Hand-picked partial-credit rows (EM below 1, F1 above zero), looked up by
# question id and printed as a markdown table.
print(
    report_df.loc[em_fail_mask & f1_success_mask]
    .set_index("id")
    .loc[["2hop__307218_161450", "2hop__280451_84616", "2hop__121145_561444"]]
    .to_markdown(index=False)
)

| answer_aliases                                                                                               | predicted_answer      |   EM |       F1 |
|:-------------------------------------------------------------------------------------------------------------|:----------------------|-----:|---------:|
| ['Caspian Sea', 'in the north-east of the country south of the Caspian Sea']                                 | north-east of Iran    |    0 | 0.363636 |
| ['leading role', 'for Best Performance by a Leading Actress in a Play in Ondine', 'lead', 'leading actress'] | Best Leading Actress  |    0 | 0.8      |
| ['Dovber Schneuri']                                                                                          | Rabbi Dovber Schneuri |    0 | 0.8      |


In [38]:
# Three examples that fail on both metrics (EM below 1 and F1 below 0.1),
# printed as a markdown table without the id column. The sample is seeded so
# that re-running the notebook prints the same rows.
print(
    report_df.loc[em_fail_mask & f1_fail_mask]
    .sample(3, random_state=0)
    .drop(columns=["id"])
    .to_markdown(index=False)
)

| answer_aliases                              | predicted_answer         |   EM |   F1 |
|:--------------------------------------------|:-------------------------|-----:|-----:|
| ['Ahmad Shah Qajar']                        | Mahmoud Mirza            |    0 |    0 |
| ['in Northern Florida', 'Northern Florida'] | 45th most populated city |    0 |    0 |
| ['Het Scheur']                              | North Sea                |    0 |    0 |


In [47]:
# A further draw of three examples that fail on both metrics (EM below 1 and
# F1 below 0.1). It uses its own fixed seed so the draw is reproducible on
# re-run.
print(
    report_df.loc[em_fail_mask & f1_fail_mask]
    .sample(3, random_state=1)
    .drop(columns=["id"])
    .to_markdown(index=False)
)

| answer_aliases        | predicted_answer   |   EM |   F1 |
|:----------------------|:-------------------|-----:|-----:|
| ['Het Scheur']        | North Sea          |    0 |    0 |
| ['TML Entertainment'] | MCA Records        |    0 |    0 |
| ['1065']              | Not provided.      |    0 |    0 |
