In [162]:
import string
from collections import Counter
from typing import Callable

import numpy as np
import regex
import pandas as pd

# Normalization and score functions from SQuAD evaluation script https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
def normalize_answer(s: str) -> str:
    """Canonicalize an answer string before it is compared or tokenized.

    Applied in order: lowercase the text, drop every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse every whitespace run into a single space
    (which also trims both ends).
    """
    punctuation = set(string.punctuation)
    lowered = s.lower()
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = regex.sub(r"\b(a|an|the)\b", " ", without_punctuation)
    return " ".join(without_articles.split())


def em(prediction, ground_truth, normalize_fn):
    """Exact-match indicator for one prediction against one reference.

    Both strings are passed through ``normalize_fn`` first; the result is
    1.0 when the normalized strings are equal and 0.0 otherwise.
    """
    is_match = normalize_fn(prediction) == normalize_fn(ground_truth)
    return float(is_match)


def f1_and_recall(prediction, ground_truth, normalize_fn):
    """Token-overlap F1 and recall of a prediction against one reference.

    Both strings are passed through ``normalize_fn`` and split on whitespace.
    The overlap is a multiset intersection, so a repeated token only counts
    as many times as it occurs in both strings.

    Returns:
        ``(f1, recall)``; the integer pair ``(0, 0)`` when no token is shared
        (this also covers an empty prediction or reference).
    """
    predicted = normalize_fn(prediction).split()
    expected = normalize_fn(ground_truth).split()
    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if not overlap:
        return 0, 0

    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(expected)
    return (2 * precision * recall) / (precision + recall), recall

# Returns the best F1 and the best recall for one prediction, each maximized independently over all of its ground truths.
def f1_recall_score(prediction, ground_truths, normalize_fn: Callable[[str], str] = lambda x: x):
    """Best F1 and best recall of ``prediction`` over all ``ground_truths``.

    Each reference is scored with ``f1_and_recall``; the two maxima are taken
    independently, so they may come from different references.

    Returns:
        ``(max_f1, max_recall)``.
    """
    per_reference = []
    for reference in ground_truths:
        per_reference.append(f1_and_recall(prediction, reference, normalize_fn))

    # Transpose [(f1, recall), ...] into [(f1, ...), (recall, ...)].
    columns = list(zip(*per_reference))
    best_f1 = max(columns[0])
    best_recall = max(columns[1])
    return best_f1, best_recall


def exact_match_score(prediction, ground_truths, normalize_fn: Callable[[str], str] = lambda x: x):
    """Return the highest ``em`` score of ``prediction`` over all ``ground_truths``.

    The result is 1.0 if the prediction matches at least one reference after
    ``normalize_fn`` is applied to both sides, otherwise 0.0.
    """
    matches = []
    for reference in ground_truths:
        matches.append(em(prediction, reference, normalize_fn))
    return max(matches)


TypeError: 'float' object is not callable

In [164]:
# variable_name = f1_recall_score('hello bye garbage', ['hi', 'hello hello', 'hello <> ', 'bye'], normalize_answer)
# print(variable_name)

In [165]:
# returns exact match probability over all answers, f1 score average, and recall score average over all Q/A pairs
# def total_score(predictions_file, ground_truths_file):
#     reference_answers = open(ground_truths_file, 'r')
#     ref = reference_answers.readlines()

#     rag_answers = open(predictions_file, 'r')
#     rag = rag_answers.readlines()
#     assert(len(rag) == len(ref))

#     exact_match_sum = 0.0
#     f1_sum = 0.0
#     recall_sum = 0.0
#     for pred, truth in zip(rag, ref):
#         ground_truths = truth.split(';')
#         exact_match_sum += exact_match_score(pred, ground_truths, normalize_answer)
#         f1, recall = f1_recall_score(pred, ground_truths, normalize_answer)
#         f1_sum += f1
#         recall_sum += recall

#     return exact_match_sum/len(rag), f1_sum/len(rag), recall_sum/len(rag)

def total_score(predictions, ground_truths):
    """Average exact match, F1 and recall over aligned prediction/reference pairs.

    Each entry of ``ground_truths`` is a single string that may hold several
    acceptable answers separated by ';'. Every string is normalized with
    ``normalize_answer`` before it is scored.

    Returns:
        ``(exact_match_mean, f1_mean, recall_mean)``.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert(len(predictions) == len(ground_truths))

    em_total = f1_total = recall_total = 0.0
    for pred, truth in zip(predictions, ground_truths):
        # Alternatives for one question, kept under a name distinct from the parameter.
        references = truth.split(';')
        em_total += exact_match_score(pred, references, normalize_answer)
        best_f1, best_recall = f1_recall_score(pred, references, normalize_answer)
        f1_total += best_f1
        recall_total += best_recall

    count = len(predictions)
    return em_total / count, f1_total / count, recall_total / count

In [23]:
# print(total_score('system_outputs/system_output1.txt', 'data/test/reference_answers.txt'))

In [173]:
# Score every system-output CSV per category and overall, then display and save the table.
paths = [
    # 'system_outputs/flan-t5-large-output.csv',
    # 'system_outputs/flan-t5-xlarge-output.csv',
    # 'system_outputs/llama-no-temp-one-shot-output.csv',
    # 'system_outputs/llama-no-temp-output.csv',
    # 'system_outputs/Mistral-output.csv',
    'system_outputs/flan-t5-large-no-context.csv',
    'system_outputs/flan-t5-xlarge-no-context.csv',
    'system_outputs/llama-no-temp-one-shot-no-context-output.csv',
]


def _model_name_from_path(path):
    """Derive the model label from an output path.

    Drops the directory part, then a trailing '.csv', then a trailing
    '-output' when present, e.g.
    'system_outputs/Mistral-output.csv' -> 'Mistral' and
    'system_outputs/flan-t5-large-no-context.csv' -> 'flan-t5-large-no-context'.
    """
    name = path.rsplit('/', 1)[-1]
    for suffix in ('.csv', '-output'):
        if name.endswith(suffix):
            name = name[:-len(suffix)]
    return name


# Collect one record per (model, category) and build the frame once at the
# end: concatenating onto an initially empty DataFrame inside the loop copies
# the whole table on every row and makes pandas emit a warning.
records = []

for path in paths:
    df = pd.read_csv(path)
    # The 'json_hard' category is excluded from every reported metric.
    df = df[df['Category'] != 'json_hard']

    model = _model_name_from_path(path)
    # 'all' is a pseudo-category that scores the whole (filtered) file.
    categories = df['Category'].unique().tolist() + ['all']
    for category in categories:
        cat_df = df if category == 'all' else df[df['Category'] == category]
        em_score, f1, recall = total_score(cat_df['ModelAnswer'], cat_df['Answer'])
        em_score = round(em_score, 3)
        f1 = round(f1, 3)
        recall = round(recall, 3)
        records.append({'Model': model, 'Category': category, 'EM': em_score,
                        'F1': f1, 'Recall': recall})

df_metrics = pd.DataFrame(records, columns=['Model', 'Category', 'EM', 'F1', 'Recall'])

display(df_metrics)

df_metrics.to_csv('system_outputs/all_metrics.csv', index=False)


  df_metrics = pd.concat([df_metrics, pd.DataFrame([metrics])], ignore_index=True)


Unnamed: 0,Model,Category,EM,F1,Recall
0,flan-t5-large-no-context,webpages,0.074,0.119,0.126
1,flan-t5-large-no-context,tabular_webpages,0.0,0.021,0.031
2,flan-t5-large-no-context,other_pdf,0.043,0.109,0.091
3,flan-t5-large-no-context,papers_pdf,0.0,0.0,0.0
4,flan-t5-large-no-context,schedule_pdf,0.0,0.04,0.03
5,flan-t5-large-no-context,jsons,0.0,0.015,0.013
6,flan-t5-large-no-context,all,0.028,0.059,0.059
7,flan-t5-xlarge-no-context,webpages,0.037,0.069,0.074
8,flan-t5-xlarge-no-context,tabular_webpages,0.0,0.06,0.062
9,flan-t5-xlarge-no-context,other_pdf,0.13,0.185,0.176


In [172]:
gold_answer_paths = [
    'IAA_files/AQAA.txt',
    'IAA_files/EQEA.txt',
    'IAA_files/VQVA.txt',
]

annotator_answer_paths = [
    'IAA_files/AQEA.txt',
    'IAA_files/EQAA.txt',
    'IAA_files/VQAA.txt',
]


def _read_all_lines(file_paths):
    """Return the lines of every file in ``file_paths``, in order, as one list."""
    lines = []
    for file_path in file_paths:
        with open(file_path, 'r') as handle:
            lines.extend(handle.readlines())
    return lines


# Files are concatenated in list order, so the i-th gold line is scored
# against the i-th annotator line.
gold_answers = _read_all_lines(gold_answer_paths)
annotator_answers = _read_all_lines(annotator_answer_paths)

em_score, f1, recall = (
    round(score, 3) for score in total_score(annotator_answers, gold_answers)
)

print(em_score, f1, recall)

0.547 0.717 0.733
