import json
import logging
from typing import Any, Dict, List, Tuple

from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Token, Tokenizer, SpacyTokenizer

from allennlp_models.rc.dataset_readers import utils

logger = logging.getLogger(__name__)


@DatasetReader.register("quac")
class QuACReader(DatasetReader):
"""
Reads a JSON-formatted Question Answering in Context (QuAC) data file
and returns a ``Dataset`` where the ``Instances`` have four fields: ``question``, a ``ListField``,
``passage``, another ``TextField``, and ``span_start`` and ``span_end``, both ``ListField`` composed of
IndexFields`` into the ``passage`` ``TextField``.
Two ``ListField``, composed of ``LabelField``, ``yesno_list`` and ``followup_list`` is added.
We also add a
``MetadataField`` that stores the instance's ID, the original passage text, gold answer strings,
and token offsets into the original passage, accessible as ``metadata['id']``,
``metadata['original_passage']``, ``metadata['answer_text_lists'] and ``metadata['token_offsets']``.
# Parameters
tokenizer : `Tokenizer`, optional (default=`SpacyTokenizer()`)
We use this ``Tokenizer`` for both the question and the passage. See :class:`Tokenizer`.
Default is `SpacyTokenizer()`.
token_indexers : `Dict[str, TokenIndexer]`, optional
We similarly use this for both the question and the passage. See :class:`TokenIndexer`.
Default is `{"tokens": SingleIdTokenIndexer()}`.
num_context_answers : `int`, optional
How many previous question answers to consider in a context.
"""

    def __init__(
        self,
        tokenizer: Tokenizer = None,
        token_indexers: Dict[str, TokenIndexer] = None,
        num_context_answers: int = 0,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._num_context_answers = num_context_answers

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        logger.info("Reading file at %s", file_path)
        with open(file_path) as dataset_file:
            dataset_json = json.load(dataset_file)
            dataset = dataset_json["data"]
        logger.info("Reading the dataset")
        for article in dataset:
            for paragraph_json in article["paragraphs"]:
                paragraph = paragraph_json["context"]
                tokenized_paragraph = self._tokenizer.tokenize(paragraph)
                qas = paragraph_json["qas"]
                metadata = {}
                metadata["instance_id"] = [qa["id"] for qa in qas]
                question_text_list = [qa["question"].strip().replace("\n", "") for qa in qas]
                answer_texts_list = [[answer["text"] for answer in qa["answers"]] for qa in qas]
                metadata["question"] = question_text_list
                metadata["answer_texts_list"] = answer_texts_list
                span_starts_list = [
                    [answer["answer_start"] for answer in qa["answers"]] for qa in qas
                ]
                span_ends_list = []
                for answer_starts, an_list in zip(span_starts_list, answer_texts_list):
                    span_ends = [
                        start + len(answer) for start, answer in zip(answer_starts, an_list)
                    ]
                    span_ends_list.append(span_ends)
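                # QuAC label values: ``yesno`` is "y" (yes), "n" (no), or "x"
                # (not a yes/no question); ``followup`` is "y" (follow up),
                # "n" (don't follow up), or "m" (maybe).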
yesno_list = [str(qa["yesno"]) for qa in qas]
followup_list = [str(qa["followup"]) for qa in qas]
instance = self.text_to_instance(
question_text_list,
paragraph,
span_starts_list,
span_ends_list,
tokenized_paragraph,
yesno_list,
followup_list,
metadata,
)
yield instance

    @overrides
    def text_to_instance(
        self,  # type: ignore
        question_text_list: List[str],
        passage_text: str,
        start_span_list: List[List[int]] = None,
        end_span_list: List[List[int]] = None,
        passage_tokens: List[Token] = None,
        yesno_list: List[int] = None,
        followup_list: List[int] = None,
        additional_metadata: Dict[str, Any] = None,
    ) -> Instance:
        # We need to convert character indices in `passage_text` to token indices in
        # `passage_tokens`, as the latter is what we'll actually use for supervision.
        answer_token_span_list = []
        passage_offsets = [(token.idx, token.idx + len(token.text)) for token in passage_tokens]
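        # For example, if the passage "The cat sat." tokenizes to
        # ["The", "cat", "sat", "."], then passage_offsets is
        # [(0, 3), (4, 7), (8, 11), (11, 12)], and the character span (4, 7)
        # ("cat") maps to the inclusive token span (1, 1).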
        for start_list, end_list in zip(start_span_list, end_span_list):
            token_spans: List[Tuple[int, int]] = []
            for char_span_start, char_span_end in zip(start_list, end_list):
                (span_start, span_end), error = utils.char_span_to_token_span(
                    passage_offsets, (char_span_start, char_span_end)
                )
                if error:
                    logger.debug("Passage: %s", passage_text)
                    logger.debug("Passage tokens: %s", passage_tokens)
                    logger.debug("Answer span: (%d, %d)", char_span_start, char_span_end)
                    logger.debug("Token span: (%d, %d)", span_start, span_end)
                    logger.debug("Tokens in answer: %s", passage_tokens[span_start : span_end + 1])
                    logger.debug("Answer: %s", passage_text[char_span_start:char_span_end])
                token_spans.append((span_start, span_end))
            answer_token_span_list.append(token_spans)
        question_list_tokens = [self._tokenizer.tokenize(q) for q in question_text_list]
        # Map answer texts to "CANNOTANSWER" if more than half of them are marked as such.
additional_metadata["answer_texts_list"] = [
utils.handle_cannot(ans_list) for ans_list in additional_metadata["answer_texts_list"]
]
return utils.make_reading_comprehension_instance_quac(
question_list_tokens,
passage_tokens,
self._token_indexers,
passage_text,
answer_token_span_list,
yesno_list,
followup_list,
additional_metadata,
self._num_context_answers,
)
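

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: the file path is a
    # placeholder for a locally available QuAC-format JSON file, and
    # num_context_answers=2 is an illustrative choice.
    reader = QuACReader(num_context_answers=2)
    for instance in reader.read("train_v0.2.json"):
        print(instance)
        break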