# QASPER evaluation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from collections import Counter, defaultdict
import datetime
import json
import os
from functools import partial
import random
random.seed(42)

import plotly.express as px
import plotly.graph_objects as go

import torch
from qasper.dataset_reader import QasperReader
from qasper.models import qasper, gpt35, agent_v1
from qasper.utils import print_wrap
from qasper.evaluator import token_f1_score, get_answers_and_evidence

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Name of the predictor to evaluate. It is passed to `model_factory` (which
# accepts 'qasper', 'gpt35' and 'agent_v1') and is also embedded in the names
# of the files written under `output/`.
MODEL_NAME = 'qasper'
# MODEL_NAME = 'gpt35'
# MODEL_NAME = 'agent_v1'

In [4]:
# When a CUDA device is available, call `.cuda()` on the model object exposed
# as `qasper.qasper_led` (presumably the LED baseline -- confirm in
# `qasper.models`); otherwise the model is left untouched.
if torch.cuda.is_available():
    print("Using CUDA")
    qasper.qasper_led.cuda()
    
# Show which device the model reports after the optional move above.
print(qasper.qasper_led.device)

cpu


## Generate question instances

Each article has multiple questions, and each question has multiple references (i.e., answers). We first extract the questions.

In [5]:
dataset = load_dataset("allenai/qasper")

In [6]:
dataset['validation']

Dataset({
    features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
    num_rows: 281
})

In [7]:
# dataset['validation'][0]

In [8]:
# distribution of years based on id
dates = []
for article in dataset['validation']:
    yy = int( article['id'][:2])
    mm = int( article['id'][2:4])
    dates.append(datetime.date(year=2000+yy, month=mm, day=1))

In [9]:
px.histogram(x=dates)

In [10]:
reader = QasperReader()

In [11]:
def question_generator(split):
    """Lazily yield every question instance contained in `split`.

    Each article in `split` is handed to `reader._article_to_instances`, and
    the instances it produces are yielded one by one, article by article.
    """
    for paper in split:
        yield from reader._article_to_instances(paper)

In [12]:
# Materialise every validation question instance into a list so it can be
# indexed, sliced and sampled by the cells below.
val_questions = list(question_generator(dataset['validation']))
# Optional: work on a random subset instead of the full list.
# val_questions = random.sample(val_questions, 100)

In [13]:
reader._stats

defaultdict(int,
            {'number of documents': 281,
             'number of questions': 1005,
             'number of answers': 3015,
             'questions with multiple answers': 1005,
             'extractive questions': 962,
             'extractive questions with multiple spans': 406,
             'multiple_evidence_spans_count': 536,
             'answers with table or figure as evidence': 212,
             'freeform answers': 431,
             'yes/no questions': 208,
             'answers with no evidence': 212,
             'unanswerable questions': 163,
             'number of truncated contexts': 15})

In [14]:
len(val_questions)

1005

In [15]:
# Take the first question instance as a running example and list its fields.
example_question = val_questions[0]
print(example_question.keys())
# Uncomment to inspect the question together with its context:
# print('QUESTION WITH CONTEXT:')
# print_wrap(example_question['s_question_with_context'])
# print(example_question['s_question_with_context'])

dict_keys(['question_with_context', 's_question_with_context', 'paragraph_indices', 'global_attention_mask', 'evidence', 'answer', 'metadata'])


In [16]:
type(example_question)
# Uncomment to print the joined question + context tokens:
# print_wrap(' '.join(example_question['question_with_context']))

dict

In [17]:
print('QUESTION:', example_question['metadata']['question'])
print('ANSWER:', example_question['answer'])

QUESTION: which multilingual approaches do they compare with?
ANSWER: BIBREF19, BIBREF20


In [18]:
example_question['metadata']['article_id']

'1912.01214'

In [19]:
# Length of the `question_with_context` sequence of every validation question,
# plotted as a histogram.
question_token_lengths = [len(item['question_with_context']) for item in val_questions]

px.histogram(question_token_lengths, title=f'Histogram of question + context token lengths (total={len(question_token_lengths)})')

### Ground-truth (GT) distribution

Now let's extract the references into a dictionary called `gold_answers_and_evidence` that has `question_id`s as keys and a list of answers/references as values.

In [20]:
gold_answers_and_evidence = get_answers_and_evidence(dataset['validation'])

In [21]:
# list the first 10 question ids
for question_id in list(gold_answers_and_evidence.keys())[:10]:
    print(question_id)


b6f15fb6279b82e34a5bf4828b7b5ddabfdf1d54
f5e6f43454332e0521a778db0b769481e23e7682
9a05a5f4351db75da371f7ac12eb0b03607c4b87
5eda469a8a77f028d0c5f1acd296111085614537
18c5d366b1da8447b5404eab71f4cc658ba12e6f
b5e4866f0685299f1d7af267bbcc4afe2aab806f
1f085b9bb7bfd0d6c8cba1a9d73f08fcf2da7590
b6ae8e10c6a0d34c834f18f66ab730b670fb528c
a87a009c242d57c51fc94fe312af5e02070f898b
ef4dba073d24042f24886580ae77add5326f2130


In [22]:
gold_answers_and_evidence['b6f15fb6279b82e34a5bf4828b7b5ddabfdf1d54']

[{'answer': 'BIBREF19, BIBREF20',
  'evidence': ['Table TABREF19 and TABREF26 report zero-shot results on Europarl and Multi-UN evaluation sets, respectively. We compare our approaches with related approaches of pivoting, multilingual NMT (MNMT) BIBREF19, and cross-lingual transfer without pretraining BIBREF16. The results show that our approaches consistently outperform other approaches across languages and datasets, especially surpass pivoting, which is a strong baseline in the zero-shot scenario that multilingual NMT systems often fail to beat BIBREF19, BIBREF20, BIBREF23. Pivoting translates source to pivot then to target in two steps, causing inefficient translation process. Our approaches use one encoder-decoder model to translate between any zero-shot directions, which is more efficient than pivoting. Regarding the comparison between transfer approaches, our cross-lingual pretraining based transfer outperforms transfer method that does not use pretraining by a large margin.'],
 

In [23]:
# Tally the gold references by answer type.
#   types  : number of references of each type.
#   n_refs : for every reference of a given type, the total number of
#            references attached to the same question.
# NOTE: the references attached to one question do not all share the same type,
# so a question can contribute to several types.
types = Counter()
n_refs = defaultdict(list)
for references in gold_answers_and_evidence.values():
    for ref in references:
        types[ref['type']] += 1
        n_refs[ref['type']].append(len(references))

In [24]:
types

Counter({'extractive': 962, 'abstractive': 431, 'boolean': 208, 'none': 163})

In [25]:
# One histogram trace per answer type, drawn as stacked bars.
fig = go.Figure(
    data=[go.Histogram(x=counts, name=type_) for type_, counts in n_refs.items()]
)
fig.update_layout(barmode='stack')
# fig.update_layout(barmode='overlay')
fig.show()

## QASPER LED baseline

In [26]:
qasper_answer = qasper.predict(example_question)[0]
print('PREDICTED ANSWER:', qasper_answer)

Input ids are automatically padded from 5733 to 6144 to be a multiple of `config.attention_window`: 1024

Using the model-agnostic default `max_length` (=20) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.



PREDICTED ANSWER: Our experimenters use a multilingual method called interlatable neural networks (multilingual)


In [27]:
token_f1_score(qasper_answer, example_question['answer'])

(0, 0, 0)

## GPT 3.5 zero shot

In [28]:
gpt35_answer = gpt35.predict(example_question)
print("PREDICTED ANSWER:")
print_wrap(gpt35_answer)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


PREDICTED ANSWER:
In the provided technical documentation, the approach of cross-lingual
pretraining based transfer for Neural Machine Translation (NMT) is thoroughly
explained. The document compares this approach with several existing methods for
multilingual translation tasks. Here are some of the key points highlighted in
the documentation:


1. **Existing Methods Comparison**:

   - The document compares the proposed cross-lingual pretraining based transfer
approach with pivot-based methods, transfer learning, multilingual NMT, and
unsupervised NMT. It discusses the strengths and limitations of each approach in
handling zero-resource or low-resource translation scenarios.


2. **Approach Description**:

   - The approach involves pretraining a universal encoder with source/pivot
monolingual or source-pivot bilingual data, training a pivot-target parent
model, and then directly translating source sentences into target sentences
using the trained model. The document explains the step

In [29]:
token_f1_score(gpt35_answer, example_question['answer'])

(0, 0, 0)

## Agent v1

In [30]:
agent_v1_answer = agent_v1.predict(example_question, verbose=1)
print("PREDICTED ANSWER:", agent_v1_answer)

## 1. Initial_Response
content='' additional_kwargs={'tool_calls': [{'id':
'call_TkZBHyD8ZFa8BEHxkov3oEQE', 'function': {'arguments': '{"Thought":"I need
to find the multilingual approaches compared in the paper with arxiv identifier 
1912.01214.","Actions":[{"tool":"arxiv_lookup","argument":{"arxiv_id":"1912.0121
4","query":"multilingual approaches"}}]}', 'name': 'Answer'}, 'type':
'function'}]} response_metadata={'token_usage': {'completion_tokens': 53,
'prompt_tokens': 1153, 'total_tokens': 1206}, 'model_name': 'gpt-3.5-turbo',
'system_fingerprint': 'fp_c2295e73ad', 'finish_reason': 'stop', 'logprobs':
None} id='9507ba53-e983-4e2b-8471-9562839f3d4d'

---
## 2. execute_tools
[ToolMessage(content='{"{\'arxiv_id\': \'1912.01214\', \'query\': \'multilingual
approaches\'}": "In [Sx1] Introduction:\\nOur proposed approach significantly
improves zero-shot translation performance, consistently surpassing pivoting and
multilingual approaches. Meanwhile, the performance on supervised translat

## Evaluate F1 score

In [31]:
def model_factory(model_name):
    """Return the prediction function registered under `model_name`.

    Accepted names are 'qasper', 'gpt35' and 'agent_v1'; each one maps to the
    `predict` attribute of the module with the same name.

    Raises:
        ValueError: if `model_name` is not one of the accepted names.
    """
    if model_name == 'qasper':
        return qasper.predict
    if model_name == 'gpt35':
        return gpt35.predict
    if model_name == 'agent_v1':
        return agent_v1.predict
    raise ValueError(f"Invalid model name: {model_name}")

In [32]:
predict_func = model_factory(MODEL_NAME)

In [33]:
def evaluate(gold, predicted, verbose=0, output_file=None):
    """Score predicted answers against gold references with token-level F1.

    For every question in `gold`, the predicted answer is compared with each
    of the question's references via `token_f1_score` and the best-scoring
    reference is kept (the first one wins on ties). The best F1 is averaged
    over all questions and, separately, per answer type of the best-scoring
    reference. Questions without a prediction count as 0.0 in the overall
    average but are not attributed to any answer type.

    Args:
        gold: dict mapping question id -> list of reference dicts, each with
            the keys "answer" and "type". "type" must be one of "extractive",
            "abstractive", "boolean" or "none".
        predicted: dict mapping question id -> dict with the key "answer"
            (and "question", which is read only when `verbose` is truthy).
        verbose: when truthy, write a per-question report.
        output_file: file-like object that receives the report; when falsy
            the report goes to standard output.

    Returns:
        dict with the keys "Answer F1", "Answer F1 by type",
        "Missing predictions" and "Freq of types".

    Raises:
        ValueError: if a question that has a prediction has no references.
        KeyError: if a reference has a type outside the four listed above.
    """
    def mean(values):
        # An empty bucket averages to 0.0 instead of dividing by zero.
        return sum(values) / len(values) if values else 0.0

    # The report writer does not change between questions, so build it once.
    fprint = partial(print, file=output_file) if output_file else print

    max_answer_f1s = []
    max_answer_f1s_by_type = {
        "extractive": [],
        "abstractive": [],
        "boolean": [],
        "none": [],
    }
    num_missing_predictions = 0

    for question_id, references in gold.items():
        if question_id not in predicted:
            num_missing_predictions += 1
            max_answer_f1s.append(0.0)
            continue

        if not references:
            raise ValueError(f"No gold references for question {question_id}")

        prediction = predicted[question_id]
        scored_references = [
            (token_f1_score(prediction["answer"], reference["answer"]), reference["type"])
            for reference in references
        ]
        # Keep the reference with the highest F1; each score is an
        # (f1, precision, recall) triple and max() returns the first best entry.
        (max_answer_f1, precision, recall), answer_type = max(
            scored_references, key=lambda scored: scored[0][0]
        )
        max_answer_f1s.append(max_answer_f1)
        max_answer_f1s_by_type[answer_type].append(max_answer_f1)

        # Evidence F1 is not computed: this function never reads an
        # "evidence" entry from the predictions.

        if verbose:
            fprint("Question ID:", question_id)
            fprint("Question:", prediction["question"])
            fprint("Gold:")
            for reference in references:
                gold_answer = reference["answer"]
                fprint(f"\t{gold_answer}")
            fprint("Predicted:")
            predicted_answer = prediction["answer"]
            fprint(f"\t{predicted_answer}")
            fprint("Answer F1:")
            fprint(f"\t{max_answer_f1}")
            fprint("Precision:")
            fprint(f"\t{precision}")
            fprint("Recall:")
            fprint(f"\t{recall}")
            fprint("")

    return {
        "Answer F1": mean(max_answer_f1s),
        "Answer F1 by type": {key: mean(value) for key, value in max_answer_f1s_by_type.items()},
        "Missing predictions": num_missing_predictions,
        "Freq of types": {key: len(value) for key, value in max_answer_f1s_by_type.items()},
    }

In [34]:
predicted_answers_and_evidence = {}

In [35]:
# randomly sample 100 questions
# sample_val_questions = random.sample(val_questions, 100)

# Temporary: restrict the run to the first 10 questions. This should be
# switched off later so that results are not over-indexed on these examples.
sample_val_questions = val_questions[:10]

for question in tqdm(sample_val_questions):
    question_id = question["metadata"]["question_id"]

    # Questions answered by an earlier run of this cell are left untouched,
    # which conserves API requests.
    if question_id not in predicted_answers_and_evidence:
        out = predict_func(question)
        # A predictor may return a list; in that case its first element is the answer.
        pred_answer = out[0] if isinstance(out, list) else out

        predicted_answers_and_evidence[question_id] = {
            "question": question["metadata"]["question"],
            "answer": pred_answer,
            "out" : out,
            # "evidence": prediction_data["predicted_evidence"]
        }


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [01:00<00:00,  6.01s/it]


In [36]:
predicted_answers_and_evidence

{'b6f15fb6279b82e34a5bf4828b7b5ddabfdf1d54': {'question': 'which multilingual approaches do they compare with?',
  'answer': 'The document discusses a novel approach for zero-shot translation in Neural Machine Translation (NMT) by leveraging cross-lingual pre-training. The approach aims to address the challenge of translating between low-resource or zero-resource language pairs without direct parallel data. Here is a breakdown of the key points in the document:\n\n1. **Introduction:**\n   - Highlights the limitations of existing methods like pivoting and transfer learning in zero-shot translation scenarios.\n   - Introduces the concept of domain shift problem affecting transfer learning in NMT.\n   - Proposes a transfer approach based on cross-lingual pre-training to improve zero-shot translation performance.\n\n2. **Related Work:**\n   - Discusses pivot-based methods, transfer learning, multilingual NMT, and unsupervised NMT approaches in zero-shot translation.\n   - Compares the adva

## Save results

In [37]:
# save predictions
save_predictions_path = f'output/{MODEL_NAME}-predictions.json'

# create the output directory on first use so the write below cannot fail
# with FileNotFoundError
os.makedirs(os.path.dirname(save_predictions_path), exist_ok=True)

# never overwrite an earlier run: if the file already exists, insert a
# timestamp in front of the extension (only the extension is touched, even if
# '.json' happens to appear elsewhere in the path)
if os.path.exists(save_predictions_path):
    stem, ext = os.path.splitext(save_predictions_path)
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    save_predictions_path = f'{stem}-{timestamp}{ext}'

with open(save_predictions_path, 'w') as f:
    json.dump(predicted_answers_and_evidence, f, indent=4)

In [38]:
# save evaluation output (easier to read)
save_evaluation_path = f'output/{MODEL_NAME}-evaluation.txt'

# create the output directory on first use so the write below cannot fail
# with FileNotFoundError
os.makedirs(os.path.dirname(save_evaluation_path), exist_ok=True)

# never overwrite an earlier run: if the file already exists, insert a
# timestamp in front of the extension
if os.path.exists(save_evaluation_path):
    stem, ext = os.path.splitext(save_evaluation_path)
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    save_evaluation_path = f'{stem}-{timestamp}{ext}'

# only score the questions that actually have a prediction
gold_with_predictions = {
    question_id: references
    for question_id, references in gold_answers_and_evidence.items()
    if question_id in predicted_answers_and_evidence
}

# utf-8 is set explicitly because the report contains model and gold answers,
# which are not guaranteed to fit the platform's default encoding
with open(save_evaluation_path, 'w', encoding='utf-8') as f:

    # the per-question report is written to the file by evaluate() itself
    evaluation_output = evaluate(
        gold_with_predictions,
        predicted_answers_and_evidence,
        verbose=1,
        output_file=f,
    )

    # print evaluation metrics
    print('='*40, file=f)
    print("Evaluation metrics:", file=f)
    print(json.dumps(evaluation_output, indent=4), file=f)

In [39]:
len(gold_answers_and_evidence)

1005

In [40]:
evaluation_output

{'Answer F1': 0.06850330890624352,
 'Answer F1 by type': {'extractive': 0.06079980442178291,
  'abstractive': 0.1297172290549774,
  'boolean': 0.0,
  'none': 0.0},
 'Missing predictions': 0,
 'Freq of types': {'extractive': 7, 'abstractive': 2, 'boolean': 1, 'none': 0}}