From b20c12ba4a68f5e155fc0cf320dab73fdb077264 Mon Sep 17 00:00:00 2001
From: Kenton Lee
Date: Thu, 16 May 2019 12:07:16 -0700
Subject: [PATCH] Use optimal matching for the drop eval and add appropriate
 tests. (#2853)

* Use optimal matching for the drop eval and add appropriate tests.

* Variable names for pylint

* more pylint
---
 allennlp/tests/tools/drop_eval_test.py |  6 +++++
 allennlp/tools/drop_eval.py            | 34 +++++++++-----------------
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/allennlp/tests/tools/drop_eval_test.py b/allennlp/tests/tools/drop_eval_test.py
index 5edcce7936d..817657e86a0 100644
--- a/allennlp/tests/tools/drop_eval_test.py
+++ b/allennlp/tests/tools/drop_eval_test.py
@@ -73,6 +73,12 @@ def test_multi_span_overlap_in_incorrect_cases(self):
         assert get_metrics(["ottoman", "Kantakouzenous"], ["ottoman", "army of Kantakouzenous"]) == \
             (0.0, 0.75)
 
+    def test_order_invariance(self):
+        assert get_metrics(["a"], ["a", "b"]) == (0, 0.5)
+        assert get_metrics(["b"], ["a", "b"]) == (0, 0.5)
+        assert get_metrics(["b"], ["b", "a"]) == (0, 0.5)
+
+
 class TestDropEvalFunctional:
     def test_json_loader(self):
         annotation = {"pid1": {"qa_pairs":[{"answer": {"number": "1"}, "validated_answers": \
diff --git a/allennlp/tools/drop_eval.py b/allennlp/tools/drop_eval.py
index 970bf8b6d57..9d7229d3503 100755
--- a/allennlp/tools/drop_eval.py
+++ b/allennlp/tools/drop_eval.py
@@ -8,6 +8,7 @@
 import re
 
 import numpy as np
+from scipy.optimize import linear_sum_assignment
 
 
 # From here through _normalize_answer was originally copied from:
@@ -72,31 +73,20 @@ def _answer_to_bags(answer: Union[str, List[str], Tuple[str, ...]]) -> Tuple[Set
 
 def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
     """
-    Takes gold and predicted answer sets and first finds a greedy 1-1 alignment
+    Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
     between them and gets maximum metric values over all the answers
     """
-    f1_scores = []
+    scores = np.zeros([len(gold), len(predicted)])
     for gold_index, gold_item in enumerate(gold):
-        max_f1 = 0.0
-        max_index = None
-        best_alignment: Tuple[Set[str], Set[str]] = (set(), set())
-        if predicted:
-            for pred_index, pred_item in enumerate(predicted):
-                current_f1 = _compute_f1(pred_item, gold_item)
-                if current_f1 >= max_f1:
-                    best_alignment = (gold_item, pred_item)
-                    max_f1 = current_f1
-                    max_index = pred_index
-            match_flag = _match_numbers_if_present(*best_alignment)
-            gold[gold_index] = set()
-            predicted[max_index] = set()
-        else:
-            match_flag = False
-        if match_flag:
-            f1_scores.append(max_f1)
-        else:
-            f1_scores.append(0.0)
-    return f1_scores
+        for pred_index, pred_item in enumerate(predicted):
+            if _match_numbers_if_present(gold_item, pred_item):
+                scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)
+    row_ind, col_ind = linear_sum_assignment(-scores)
+
+    max_scores = np.zeros([len(gold)])
+    for row, column in zip(row_ind, col_ind):
+        max_scores[row] = max(max_scores[row], scores[row, column])
+    return max_scores
 
 
 def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
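
Reviewer note, not part of the patch: the sketch below contrasts the old greedy alignment with the optimal assignment this change switches to, mirroring the new test_order_invariance cases. The names bag_f1, greedy_align, and optimal_align are hypothetical simplifications; the real code uses _compute_f1 plus the _match_numbers_if_present guard, which the sketch omits.

# Standalone sketch (simplified, hypothetical helpers) of the alignment change.
from typing import List, Set

import numpy as np
from scipy.optimize import linear_sum_assignment


def bag_f1(predicted: Set[str], gold: Set[str]) -> float:
    # Simplified token-overlap F1 between two answer bags.
    overlap = len(predicted & gold)
    if overlap == 0:
        return 0.0
    precision = overlap / len(predicted)
    recall = overlap / len(gold)
    return 2 * precision * recall / (precision + recall)


def greedy_align(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
    # Old behavior: each gold bag consumes its best-scoring prediction in turn,
    # so an early zero-score "match" can use up a prediction that a later gold
    # bag actually needed. The result depends on iteration order.
    remaining = [set(p) for p in predicted]
    scores = []
    for gold_item in gold:
        if not any(remaining):
            scores.append(0.0)
            continue
        best = max(range(len(remaining)), key=lambda i: bag_f1(remaining[i], gold_item))
        scores.append(bag_f1(remaining[best], gold_item))
        remaining[best] = set()  # consume the prediction, even on a zero-score tie
    return scores


def optimal_align(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
    # New behavior: score every (gold, prediction) pair, then solve the
    # assignment problem jointly. Negating the matrix turns scipy's
    # cost-minimizing linear_sum_assignment into a total-F1 maximizer.
    scores = np.zeros([len(gold), len(predicted)])
    for g, gold_item in enumerate(gold):
        for p, pred_item in enumerate(predicted):
            scores[g, p] = bag_f1(pred_item, gold_item)
    row_ind, col_ind = linear_sum_assignment(-scores)
    max_scores = np.zeros(len(gold))
    for row, column in zip(row_ind, col_ind):
        max_scores[row] = scores[row, column]
    return list(max_scores)


# Mirrors test_order_invariance: prediction ["b"] against gold ["a", "b"].
gold_bags = [{"a"}, {"b"}]
pred_bags = [{"b"}]
print(np.mean(greedy_align(pred_bags, gold_bags)))   # 0.0: gold "a" consumed "b" at score 0
print(np.mean(optimal_align(pred_bags, gold_bags)))  # 0.5: "b" paired with gold "b"

Because the assignment is solved over the whole score matrix at once, the result no longer depends on the order of gold or predicted spans, which is exactly what the new tests pin down. linear_sum_assignment also accepts rectangular matrices, so differing counts of gold and predicted bags need no padding; unmatched rows simply score zero.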