In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [80]:
import random

from datasets import load_dataset
from tqdm import tqdm

from qasper.dataset_reader import QasperReader
from qasper.models import qasper, gpt35

In [144]:
# Load the QASPER dataset through the `datasets` library; the returned object
# is indexed by split name in the cells below (e.g. dataset['validation']).
dataset = load_dataset("allenai/qasper")

In [147]:
# Show the validation split's feature names and row count.
dataset['validation']

Dataset({
    features: ['id', 'title', 'abstract', 'full_text', 'qas', 'figures_and_tables'],
    num_rows: 281
})

In [81]:
# Reader instance used by instance_generator to turn each article into instances.
reader = QasperReader()

In [82]:
def instance_generator(split):
    """Lazily yield every instance the reader builds from the articles in ``split``.

    Articles are consumed in iteration order and passed to the notebook-level
    ``reader``; all instances produced for one article are yielded before the
    next article is processed.
    """
    for paper in split:
        yield from reader._article_to_instances(paper)

In [121]:
# Draw a fixed-size, reproducible random sample of validation instances.
# A dedicated random.Random(SAMPLE_SEED) produces the same draw as seeding the
# module-level generator with that value, but leaves global RNG state untouched.
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

validation_instances = list(instance_generator(dataset['validation']))
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of available instances.
instances = random.Random(SAMPLE_SEED).sample(
    validation_instances, min(SAMPLE_SIZE, len(validation_instances))
)

In [122]:
# Confirm how many instances were sampled.
len(instances)

100

In [125]:
# Use the first sampled instance as a worked example for the cells below.
instance = instances[0]
# List the fields available on an instance.
print(instance.keys())
# Print the 's_question_with_context' field (question followed by article text in the output below).
print('QUESTION WITH CONTEXT:', instance['s_question_with_context'])

dict_keys(['question_with_context', 's_question_with_context', 'paragraph_indices', 'global_attention_mask', 'evidence', 'answer', 'metadata'])
QUESTION WITH CONTEXT: Did they experiment with this new dataset? Introduction How humans process language has become increasingly relevant in natural language processing since physiological data during language understanding is more accessible and recorded with less effort. In this work, we focus on eye-tracking and electroencephalography (EEG) recordings to capture the reading process. On one hand, eye movement data provides millisecond-accurate records about where humans look when they are reading, and is highly correlated with the cognitive load associated with different stages of text processing. On the other hand, EEG records electrical brain activity across the scalp and is a direct measure of physiological processes, including language processing. The combination of both measurement methods enables us to study the language understanding

In [126]:
# The question text is read from the instance's metadata; the reference answer
# is stored at the top level of the instance.
print('QUESTION:', instance['metadata']['question'])
print('ANSWER:', instance['answer'])

QUESTION: Did they experiment with this new dataset?
ANSWER: No


## Evaluate F1 score

In [157]:
# NOTE(review): on a top-to-bottom run, the notebook-local `def evaluate` further
# down rebinds the `evaluate` name imported here; `token_f1_score` and
# `get_answers_and_evidence` are used as imported.
from qasper.evaluator import token_f1_score, get_answers_and_evidence, evaluate

In [128]:
# Run the `qasper` model on the example instance. predict() returns an
# indexable result; only element 0 is kept as the answer.
qasper_answer = qasper.predict(instance)[0]
print(qasper_answer)

Input ids are automatically padded from 4697 to 5120 to be a multiple of `config.attention_window`: 1024


, we generated a new type of record with a new data-sequence, using one-


In [133]:
# Score the qasper model's answer against the instance's reference answer.
token_f1_score(qasper_answer, instance['answer'])

0

In [130]:
# Run the `gpt35` model on the same instance. Unlike qasper.predict above, the
# return value is used as-is (no [0] indexing).
gpt35_answer = gpt35.predict(instance)
print(gpt35_answer)

Yes, in this work, the team experimented with a new dataset called the Zurich Cognitive Language Processing Corpus (ZuCo) 2.0. The dataset includes raw and preprocessed eye-tracking and electroencephalography (EEG) data of 18 subjects. The participants read 739 English sentences from Wikipedia in a natural reading setting as well as during annotation tasks. The team developed this corpus to collect recordings during both normal reading and task-specific annotation. They provided simultaneous eye movement and brain activity recordings to analyze and compare the two reading paradigms.

The dataset is publicly available at https://osf.io/2urht/ and is specifically tailored for training and evaluating machine learning algorithms for natural language processing (NLP) purposes. The team conducted a detailed technical validation of the data to ensure the quality of the recordings. They compared the results to previous studies and their own ZuCo 1.0 dataset to validate the findings.

By experi

In [134]:
# Score the gpt35 model's answer against the same reference answer.
token_f1_score(gpt35_answer, instance['answer'])

0

In [None]:
def evaluate(gold, predicted):
    """Compute answer F1 of predictions against gold references.

    For every question in ``gold`` the predicted answer is scored against each
    reference with ``token_f1_score`` and the best score is kept. That score is
    also recorded under the answer type of the best-scoring reference (the
    first one wins on ties).

    Defining this function rebinds the name ``evaluate``, so it takes the place
    of any previously imported function of the same name. Evidence F1 is not
    computed here; only the answer is scored.

    Args:
        gold: Mapping of question id to a list of reference dicts. Each
            reference is read for its ``"answer"`` and ``"type"`` keys.
        predicted: Mapping of question id to a dict with an ``"answer"`` key.

    Returns:
        A dict with:
          * ``"Answer F1"``: mean best-reference F1 over all gold questions.
            Questions with no prediction, or with an empty reference list,
            contribute 0.0.
          * ``"Answer F1 by type"``: mean best-reference F1 per answer type
            (``"extractive"``, ``"abstractive"``, ``"boolean"``, ``"none"``);
            0.0 for a type with no scored questions.
          * ``"Missing predictions"``: number of gold questions absent from
            ``predicted``.

    Raises:
        KeyError: If the best-scoring reference has a ``"type"`` outside the
            four known answer types, or a required key is missing.
    """
    def _mean(values):
        # Empty buckets average to 0.0 instead of dividing by zero.
        return sum(values) / len(values) if values else 0.0

    max_answer_f1s = []
    max_answer_f1s_by_type = {
        "extractive": [],
        "abstractive": [],
        "boolean": [],
        "none": [],
    }
    num_missing_predictions = 0
    for question_id, references in gold.items():
        if question_id not in predicted:
            num_missing_predictions += 1
            max_answer_f1s.append(0.0)
            continue
        if not references:
            # Nothing to compare against: count the question as a zero rather
            # than failing on an empty sequence. No type bucket is updated
            # because there is no reference type to attribute the score to.
            max_answer_f1s.append(0.0)
            continue

        predicted_answer = predicted[question_id]["answer"]
        # max() returns the first maximal item, matching the tie-break of a
        # stable descending sort followed by taking element 0.
        max_answer_f1, answer_type = max(
            (
                (token_f1_score(predicted_answer, reference["answer"]), reference["type"])
                for reference in references
            ),
            key=lambda scored: scored[0],
        )
        max_answer_f1s.append(max_answer_f1)
        max_answer_f1s_by_type[answer_type].append(max_answer_f1)

    return {
        "Answer F1": _mean(max_answer_f1s),
        "Answer F1 by type": {
            answer_type: _mean(scores)
            for answer_type, scores in max_answer_f1s_by_type.items()
        },
        "Missing predictions": num_missing_predictions,
    }

In [149]:
# Build the gold references directly from the validation split rather than
# loading a gold JSON file (the script-style `json.load(open(args.gold))`).
# The result maps each question id to a list of reference dicts.
gold_answers_and_evidence = get_answers_and_evidence(dataset['validation'])

In [152]:
# NOTE(review): this displays every question id at once; consider slicing
# (e.g. list(gold_answers_and_evidence)[:5]) to keep the output short.
gold_answers_and_evidence.keys()

dict_keys(['b6f15fb6279b82e34a5bf4828b7b5ddabfdf1d54', 'f5e6f43454332e0521a778db0b769481e23e7682', '9a05a5f4351db75da371f7ac12eb0b03607c4b87', '5eda469a8a77f028d0c5f1acd296111085614537', '18c5d366b1da8447b5404eab71f4cc658ba12e6f', 'b5e4866f0685299f1d7af267bbcc4afe2aab806f', '1f085b9bb7bfd0d6c8cba1a9d73f08fcf2da7590', 'b6ae8e10c6a0d34c834f18f66ab730b670fb528c', 'a87a009c242d57c51fc94fe312af5e02070f898b', 'ef4dba073d24042f24886580ae77add5326f2130', '2df4a045a9cd7b44874340b6fdf9308d3c55327a', 'a313e98994fc039a82aa2447c411dda92c65a470', '37861be6aecd9242c4fdccdfcd06e48f3f1f8f81', '7e62a53823aba08bc26b2812db016f5ce6159565', '9eabb54c2408dac24f00f92cf1061258c7ea2e1a', '3d013f15796ae7fed5272183a166c45f16e24e39', '9ee07edc371e014df686ced4fb0c3a7b9ce3d5dc', 'd3aa0449708cc861a51551b128d73e11d62207d2', 'cfbec1ef032ac968560a7c76dec70faf1269b27c', 'c0e341c4d2253eb42c8840381b082aae274eddad', '1ec152119cf756b16191b236c85522afeed11f59', '891c2001d6baaaf0da4e65b647402acac621a7d2', '66c96c297c2cffdf5013

In [153]:
# Inspect the list of references stored for one question id.
gold_answers_and_evidence['b6f15fb6279b82e34a5bf4828b7b5ddabfdf1d54']

[{'answer': 'BIBREF19, BIBREF20',
  'evidence': ['Table TABREF19 and TABREF26 report zero-shot results on Europarl and Multi-UN evaluation sets, respectively. We compare our approaches with related approaches of pivoting, multilingual NMT (MNMT) BIBREF19, and cross-lingual transfer without pretraining BIBREF16. The results show that our approaches consistently outperform other approaches across languages and datasets, especially surpass pivoting, which is a strong baseline in the zero-shot scenario that multilingual NMT systems often fail to beat BIBREF19, BIBREF20, BIBREF23. Pivoting translates source to pivot then to target in two steps, causing inefficient translation process. Our approaches use one encoder-decoder model to translate between any zero-shot directions, which is more efficient than pivoting. Regarding the comparison between transfer approaches, our cross-lingual pretraining based transfer outperforms transfer method that does not use pretraining by a large margin.'],
 

In [155]:
# Run the qasper model over every sampled instance, keyed by question id.
# The loop variable is deliberately not named `instance`: reusing that name
# would rebind the notebook-level `instance` that earlier cells inspect, so
# re-running those cells afterwards would silently show a different example.
predicted_answers_and_evidence = {}
for sampled_instance in tqdm(instances):
    question_id = sampled_instance["metadata"]["question_id"]
    predicted_answers_and_evidence[question_id] = {
        # predict() returns an indexable result; element 0 is used as the answer.
        "answer": qasper.predict(sampled_instance)[0],
        # No "evidence" entry is recorded; only the answer is stored.
    }


Input ids are automatically padded from 4784 to 5120 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 6265 to 7168 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 3630 to 4096 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4648 to 5120 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 1822 to 2048 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 7227 to 8192 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 5391 to 6144 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 5245 to 6144 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 8900 to 9216 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 4004 to 4096 to

In [161]:
# Score only the questions that received a prediction: restrict the gold
# references to the predicted question ids before evaluating.
gold_for_predictions = {
    question_id: references
    for question_id, references in gold_answers_and_evidence.items()
    if question_id in predicted_answers_and_evidence
}
evaluation_output = evaluate(gold_for_predictions, predicted_answers_and_evidence)

In [162]:
# Number of question ids in the gold references built from the validation
# split (predictions above cover only the sampled instances).
len(gold_answers_and_evidence)

1005

In [163]:
# Display the aggregated answer-F1 results.
evaluation_output

{'Answer F1': 0.03898698183383156,
 'Answer F1 by type': {'extractive': 0.04167783212003597,
  'abstractive': 0.07259226309525431,
  'boolean': 0.0,
  'none': 0.0},
 'Missing predictions': 0}

In [None]:
# print(json.dumps(evaluation_output, indent=2))