This repository has been archived by the owner on Dec 16, 2022. It is now read-only.
/
vgqa.py
238 lines (210 loc) · 9.87 KB
/
vgqa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import logging
from os import PathLike
from typing import (
Dict,
# List,
Union,
Optional,
Tuple,
Iterable,
)
import json
import torch
from torch import Tensor
from allennlp.common.lazy import Lazy
from allennlp.common.file_utils import cached_path
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, ArrayField, LabelField, ListField, TextField
from allennlp.data.image_loader import ImageLoader
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Tokenizer
from allennlp.modules.vision.grid_embedder import GridEmbedder
from allennlp.modules.vision.region_detector import RegionDetector
from allennlp_models.vision.dataset_readers import utils
from allennlp_models.vision.dataset_readers.vision_reader import VisionReader
# Module-level logger, keyed by this module's import name.
logger = logging.getLogger(__name__)
@DatasetReader.register("vgqa")
class VGQAReader(VisionReader):
    """
    Dataset reader for Visual Genome question answering (VGQA) data.

    Reads a JSON file containing a list of records, each with a `"qas"` list of
    question/answer dictionaries (keys used here: `"question"`, `"answer"`, `"qa_id"`
    and `"image_id"`), and turns every question/answer pair into an `Instance`.

    Parameters
    ----------
    image_dir: `str`
        Path to directory containing the image files. Images are looked up by the
        file name `<image_id>.jpg`.
    image_loader: `ImageLoader`
        The image loader component used to load the images.
    image_featurizer: `Lazy[GridEmbedder]`
        The backbone image processor (like a ResNet), whose output will be passed to the region
        detector for finding object boxes in the image.
    region_detector: `Lazy[RegionDetector]`
        For pulling out regions of the image (both coordinates and features) that will be used by
        downstream models.
    answer_vocab: `Union[Vocabulary, str]`, optional
        The vocabulary to use for answers. The reader will look into the `"answers"` namespace
        in the vocabulary to find possible answers.
        If this is a URL or filename, we will download a previously saved vocabulary from there.
        Note that answers outside this vocabulary are only dropped when `text_to_instance` is
        called with `keep_impossible_questions=False`; the default keeps them.
    feature_cache_dir: `Union[str, PathLike]`, optional
        An optional directory to cache the featurized images in. Featurizing images takes a long
        time, and many images are duplicated, so we highly recommend to use this cache.
    tokenizer: `Tokenizer`, optional
        The `Tokenizer` to use to tokenize the text. By default, this uses the tokenizer for
        `"bert-base-uncased"`.
    token_indexers: `Dict[str, TokenIndexer]`, optional
        The `TokenIndexer` to use. By default, this uses the indexer for `"bert-base-uncased"`.
    cuda_device: `Union[int, torch.device]`, optional
        Either a torch device or a GPU number. This is the GPU we'll use to featurize the images.
    max_instances: `int`, optional
        For debugging, you can use this parameter to limit the number of instances the reader
        returns.
    image_processing_batch_size: `int`
        The number of images to process at one time while featurizing. Default is 8.
    write_to_cache: `bool`
        Forwarded unchanged to `VisionReader`. Default is `True`.
        NOTE(review): presumably controls whether freshly computed features are written
        to `feature_cache_dir` -- confirm against `VisionReader`.
    """

    def __init__(
        self,
        image_dir: Optional[Union[str, PathLike]] = None,
        *,
        image_loader: Optional[ImageLoader] = None,
        image_featurizer: Optional[Lazy[GridEmbedder]] = None,
        region_detector: Optional[Lazy[RegionDetector]] = None,
        answer_vocab: Optional[Union[Vocabulary, str]] = None,
        feature_cache_dir: Optional[Union[str, PathLike]] = None,
        tokenizer: Optional[Tokenizer] = None,
        token_indexers: Optional[Dict[str, TokenIndexer]] = None,
        cuda_device: Optional[Union[int, torch.device]] = None,
        max_instances: Optional[int] = None,
        image_processing_batch_size: int = 8,
        write_to_cache: bool = True,
    ) -> None:
        """
        Validate the image configuration, initialise the base reader and load the
        optional answer vocabulary.

        Raises
        ------
        ValueError
            If all featurization components are supplied but `image_dir` is missing.
        """
        # Featurization only happens when all three components are supplied; in that
        # case the images must be available locally.
        run_featurization = image_loader and image_featurizer and region_detector
        if image_dir is None and run_featurization:
            raise ValueError(
                "Because of the size of the image datasets, we don't download them automatically. "
                "Please go to https://visualgenome.org/api/v0/api_home.html, download the datasets you need, "
                "and set the image_dir parameter to point to your download location. This dataset "
                "reader does not care about the exact directory structure. It finds the images "
                "wherever they are."
            )

        super().__init__(
            image_dir,
            image_loader=image_loader,
            image_featurizer=image_featurizer,
            region_detector=region_detector,
            feature_cache_dir=feature_cache_dir,
            tokenizer=tokenizer,
            token_indexers=token_indexers,
            cuda_device=cuda_device,
            max_instances=max_instances,
            image_processing_batch_size=image_processing_batch_size,
            write_to_cache=write_to_cache,
        )

        # Read the answer vocab. It is stored as a frozenset of *preprocessed* answers so
        # that membership checks line up with the preprocessed answers produced in `_read`.
        if answer_vocab is None:
            self.answer_vocab = None
        else:
            if isinstance(answer_vocab, str):
                # A string is treated as a URL or path to a saved vocabulary.
                answer_vocab = cached_path(answer_vocab, extract_archive=True)
                answer_vocab = Vocabulary.from_files(answer_vocab)
            self.answer_vocab = frozenset(
                utils.preprocess_answer(a)
                for a in answer_vocab.get_token_to_index_vocabulary("answers")
            )

    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Yield one `Instance` per question/answer pair found in `file_path`.

        `file_path` may carry slicing syntax, which is split off by
        `utils.get_data_slice` and applied to the flattened list of QA pairs.

        Raises
        ------
        KeyError
            If images are being featurized and the image for a question cannot be found
            in the image directory.
        """
        # if the splits are using slicing syntax, honor it
        question_slice, file_path = utils.get_data_slice(file_path)

        file_path = cached_path(file_path, extract_archive=True)

        logger.info("Reading file at %s", file_path)
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        with open(file_path, encoding="utf-8") as dataset_file:
            dataset = json.load(dataset_file)

        # Flatten the per-image records into a single list of QA dictionaries.
        questions = [qa for data in dataset for qa in data["qas"]]
        questions = questions[question_slice]

        question_dicts = list(self.shard_iterable(questions))
        processed_images: Iterable[
            Optional[Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]
        ]
        if self.produce_featurized_images:
            # It would be much easier to just process one image at a time, but it's faster to
            # process them in batches, so all image paths are handed over at once.
            filenames = [f"{question_dict['image_id']}.jpg" for question_dict in question_dicts]
            logger.info("images size: %s", len(self.images))
            try:
                # Resolve the paths eagerly: a lazy generator expression would only raise
                # the KeyError later, during iteration, outside of this `try` block.
                image_paths = [self.images[filename] for filename in filenames]
            except KeyError as e:
                missing_id = e.args[0]
                raise KeyError(
                    missing_id,
                    f"We could not find an image with the id {missing_id}. "
                    "Because of the size of the image datasets, we don't download them automatically. "
                    "Please go to https://visualgenome.org/api/v0/api_home.html, download the datasets you need, "
                    "and set the image_dir parameter to point to your download location. This dataset "
                    "reader does not care about the exact directory structure. It finds the images "
                    "wherever they are.",
                ) from e
            processed_images = self._process_image_paths(image_paths)
        else:
            # No featurization: every instance is built without image fields.
            processed_images = [None] * len(question_dicts)

        logger.info("Reading the dataset")
        failed_instances_count = 0
        attempted_instances_count = 0
        for qa, processed_image in zip(question_dicts, processed_images):
            question = qa["question"]
            answer = utils.preprocess_answer(qa["answer"])
            qa_id = qa["qa_id"]

            instance = self.text_to_instance(
                qa_id,
                question,
                answer,
                processed_image,
            )
            attempted_instances_count += 1
            if instance is not None:
                yield instance
            else:
                failed_instances_count += 1

        # Report the failure rate once, and only if something was attempted, so that an
        # empty file or an empty slice cannot trigger a division by zero.
        if attempted_instances_count > 0:
            failed_instances_fraction = failed_instances_count / attempted_instances_count
            logger.warning("%.0f%% of instances failed.", failed_instances_fraction * 100)

    def text_to_instance(
        self,  # type: ignore
        qa_id: int,
        question: str,
        answer: Optional[str],
        image: Optional[Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]],
        use_cache: bool = True,
        keep_impossible_questions: bool = True,
    ) -> Optional[Instance]:
        """
        Build a single `Instance` from a question, an optional answer and an optional image.

        Parameters
        ----------
        qa_id: `int`
            Identifier of the question/answer pair. Currently not used by this method.
        question: `str`
            The question text; it is tokenized with the reader's tokenizer.
        answer: `Optional[str]`
            The (already preprocessed) answer. If `None`, no label fields are added.
        image: `Optional[Union[str, Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]]]`
            Either a path to an image, which is featurized on the fly, or an already
            processed tuple whose first two entries are the box features and the box
            coordinates. If `None`, no image fields are added.
        use_cache: `bool`
            Passed to `_process_image_paths` when `image` is a path.
        keep_impossible_questions: `bool`
            If `False`, and an answer vocabulary is set, answers that are not in the
            vocabulary produce no label, which makes this method return `None`.

        Returns
        -------
        The constructed `Instance`, or `None` if an answer was given but no label was kept.
        """
        question_field = TextField(self._tokenizer.tokenize(question), None)

        fields: Dict[str, Field] = {
            "question": question_field,
        }

        # `_read` passes `None` when images are not featurized; skip the image fields
        # in that case instead of failing on tuple unpacking.
        if image is not None:
            if isinstance(image, str):
                features, coords, _, _ = next(
                    self._process_image_paths([image], use_cache=use_cache)
                )
            else:
                features, coords, _, _ = image

            fields["box_features"] = ArrayField(features)
            fields["box_coordinates"] = ArrayField(coords)
            # One mask entry per box; padded positions are False.
            fields["box_mask"] = ArrayField(
                features.new_ones((features.shape[0],), dtype=torch.bool),
                padding_value=False,
                dtype=torch.bool,
            )

        if answer is not None:
            labels_fields = []
            weights = []
            answer_is_allowed = not self.answer_vocab or answer in self.answer_vocab
            if answer_is_allowed or keep_impossible_questions:
                labels_fields.append(LabelField(answer, label_namespace="answers"))
                weights.append(1.0)

            if not labels_fields:
                return None

            fields["label_weights"] = ArrayField(torch.tensor(weights))
            fields["labels"] = ListField(labels_fields)

        return Instance(fields)

    def apply_token_indexers(self, instance: Instance) -> None:
        """Attach the reader's token indexers to the instance's `"question"` field."""
        instance["question"].token_indexers = self._token_indexers  # type: ignore