evaluate.py
"""
The `evaluate` subcommand can be used to
evaluate a trained model against a dataset
and report any metrics calculated by the model.
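
Example (the file paths here are illustrative):

    $ allennlp evaluate /path/to/model.tar.gz /path/to/eval_data.jsonl --output-file metrics.json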
"""

import argparse
import json
import logging
from typing import Any, Dict

from overrides import overrides

from allennlp.commands.subcommand import Subcommand
from allennlp.common import logging as common_logging
from allennlp.common.util import prepare_environment
from allennlp.data import DataLoader
from allennlp.models.archival import load_archive
from allennlp.training.util import evaluate

logger = logging.getLogger(__name__)


@Subcommand.register("evaluate")
class Evaluate(Subcommand):
    @overrides
    def add_subparser(self, parser: argparse._SubParsersAction) -> argparse.ArgumentParser:
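        """
        Register the `evaluate` subparser, declare its command-line arguments, and wire
        the parsed arguments to `evaluate_from_args` below.
        """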
        description = """Evaluate the specified model + dataset"""
        subparser = parser.add_parser(
            self.name, description=description, help="Evaluate the specified model + dataset."
        )

        subparser.add_argument("archive_file", type=str, help="path to an archived trained model")

        subparser.add_argument(
            "input_file", type=str, help="path to the file containing the evaluation data"
        )

        subparser.add_argument(
            "--output-file", type=str, help="optional path to write the metrics to as JSON"
        )

        subparser.add_argument(
            "--predictions-output-file",
            type=str,
            help="optional path to write the predictions to as JSON lines",
        )

        subparser.add_argument(
            "--weights-file", type=str, help="a path that overrides which weights file to use"
        )

        cuda_device = subparser.add_mutually_exclusive_group(required=False)
        cuda_device.add_argument(
            "--cuda-device", type=int, default=-1, help="id of GPU to use (if any)"
        )

        subparser.add_argument(
            "-o",
            "--overrides",
            type=str,
            default="",
            help=(
                "a json(net) structure used to override the experiment configuration, e.g., "
                "'{\"iterator.batch_size\": 16}'. Nested parameters can be specified either"
                " with nested dictionaries or with dot syntax."
            ),
        )

        subparser.add_argument(
            "--batch-size", type=int, help="If non-empty, the batch size to use during evaluation."
        )

        subparser.add_argument(
            "--batch-weight-key",
            type=str,
            default="",
            help="If non-empty, name of metric used to weight the loss on a per-batch basis.",
        )

        subparser.add_argument(
            "--extend-vocab",
            action="store_true",
            default=False,
            help="if specified, we will use the instances in your new dataset to "
            "extend your vocabulary. If pretrained-file was used to initialize "
            "embedding layers, you may also need to pass --embedding-sources-mapping.",
        )

        subparser.add_argument(
            "--embedding-sources-mapping",
            type=str,
            default="",
            help="a JSON dict defining mapping from embedding module path to embedding "
            "pretrained-file used during training. If not passed, and embedding needs to be "
            "extended, we will try to use the original file paths used during training. If "
            "they are not available we will use random vectors for embedding extension.",
        )

        subparser.add_argument(
            "--file-friendly-logging",
            action="store_true",
            default=False,
            help="outputs tqdm status on separate lines and slows tqdm refresh rate",
        )

        subparser.set_defaults(func=evaluate_from_args)

        return subparser


def evaluate_from_args(args: argparse.Namespace) -> Dict[str, Any]:
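    """
    Load the archived model named by `args.archive_file`, build a data loader over
    `args.input_file`, run evaluation, and return the computed metrics.
    """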
    common_logging.FILE_FRIENDLY_LOGGING = args.file_friendly_logging

    # Disable some of the more verbose logging statements
    logging.getLogger("allennlp.common.params").disabled = True
    logging.getLogger("allennlp.nn.initializers").disabled = True
    logging.getLogger("allennlp.modules.token_embedders.embedding").setLevel(logging.INFO)

    # Load from archive
    archive = load_archive(
        args.archive_file,
        weights_file=args.weights_file,
        cuda_device=args.cuda_device,
        overrides=args.overrides,
    )
    config = archive.config
    prepare_environment(config)
    model = archive.model
    model.eval()

    # Load the evaluation data
    dataset_reader = archive.validation_dataset_reader
    evaluation_data_path = args.input_file
    logger.info("Reading evaluation data from %s", evaluation_data_path)
    data_loader_params = config.pop("validation_data_loader", None)
    if data_loader_params is None:
        data_loader_params = config.pop("data_loader")
    if args.batch_size:
        data_loader_params["batch_size"] = args.batch_size
    data_loader = DataLoader.from_params(
        params=data_loader_params, reader=dataset_reader, data_path=evaluation_data_path
    )

    embedding_sources = (
        json.loads(args.embedding_sources_mapping) if args.embedding_sources_mapping else {}
    )
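    # Optionally grow the vocabulary (and the matching embedding rows) to cover tokens
    # that appear in the evaluation data but not in the training data.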
    if args.extend_vocab:
        logger.info("Vocabulary is being extended with test instances.")
        model.vocab.extend_from_instances(instances=data_loader.iter_instances())
        model.extend_embedder_vocab(embedding_sources)

    data_loader.index_with(model.vocab)
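    # Run the evaluation loop; `evaluate` computes the model's metrics and, when the
    # corresponding paths were given, writes metrics and per-instance predictions to disk.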
    metrics = evaluate(
        model,
        data_loader,
        args.cuda_device,
        args.batch_weight_key,
        output_file=args.output_file,
        predictions_output_file=args.predictions_output_file,
    )

    logger.info("Finished evaluating.")

    return metrics