/
evaluate.py
executable file
·127 lines (96 loc) · 5.76 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import torch
import tensorflow as tf
import util
import sys
import numpy as np
def read_doc_keys(fname):
keys = set()
with open(fname) as f:
for line in f:
keys.add(line.strip())
return keys
if __name__ == "__main__":
try:
phase = sys.argv[2]
except:
phase = 'test'
model_name = sys.argv[1]
trained_with_gold = True if 'gold' in model_name else False
config = util.initialize_from_env()
model = util.get_model(config)
saver = tf.train.Saver()
log_dir = config["log_dir"]
print_scores = True # if True it creates a test_scores.tsv file with evaluation scores
all_masks = True # if True it goes through the same document multiple times to get masked predictions for all mentions
with_gold_mentions = True # if True it also calculates predictions with gold mentions
left_context_only_eval = True
if not all_masks:
eval_masked = 1 # iterations through masked evaluation
if phase not in ['eval', 'test']:
print('Specify a phase: test or eval')
quit()
if print_scores:
output_file = open('data/' + model_name + '/' + phase + '_scores.tsv', 'w')
else:
output_file = None
with tf.Session() as session:
model.restore(session)
# Make sure eval mode is True if you want official conll results
# WARNING: in the masked evaluation conll score may differ from py ones due to skipped mentions (if included in masked mentions)
if left_context_only_eval:
print('Left-only: Evaluation without masks')
# Non-masked evaluation; depending on with_gold_mentions with predicted and/or gold mentions
model.evaluate(session, official_stdout=True, eval_mode=True, phase = phase, with_gold_mentions = with_gold_mentions, output_file = output_file, trained_with_gold = trained_with_gold, left_context_only = True, eval_on_masks_only = True )
model.evaluate(session, official_stdout=True, eval_mode = True, masked = True, eval_on_masks_only = True, phase = phase, all_masks = False, with_gold_mentions = with_gold_mentions, output_file = output_file, trained_with_gold = trained_with_gold, left_context_only = True)
print('Evaluation without masks')
# Non-masked evaluation; depending on with_gold_mentions with predicted and/or gold mentions
model.evaluate(session, official_stdout=True, eval_mode=True, phase = phase, with_gold_mentions = with_gold_mentions, output_file = output_file, trained_with_gold = trained_with_gold)
if all_masks:
# Masked evaluation; depending on with_gold_mentions with predicted and/or gold mentions
model.evaluate(session, official_stdout=True, eval_mode = True, masked = True, eval_on_masks_only = True, phase = phase, all_masks = all_masks, with_gold_mentions = with_gold_mentions, output_file = output_file, trained_with_gold = trained_with_gold)
else:
# If not evaluation on all masked mentions
i = 0
eval_f1_mask_all, eval_antecedent_f1_mask_all, eval_antecedent_f1_masks_only_all = [], [], []
eval_f1_mask_all_gold, eval_antecedent_f1_mask_all_gold, eval_antecedent_f1_masks_only_all_gold = [], [], []
while i < eval_masked:
print('Evaluation with masks - ', i)
output_file.write('Run with masks - ' + str(i) + '\n')
output = model.evaluate(session, official_stdout=True, eval_mode = True, masked = True, eval_on_masks_only = True, phase = phase, with_gold_mentions = with_gold_mentions, output_file = output_file, trained_with_gold = trained_with_gold)
if with_gold_mentions and not trained_with_gold:
output_predicted, output_gold = output
else:
if trained_with_gold:
output_gold = output
output_predicted = output
if not trained_with_gold:
eval_summary_mask, eval_f1_mask, eval_antecedent_f1_mask, eval_antecedent_f1_masks_only = output_predicted
eval_f1_mask_all.append(eval_f1_mask)
eval_antecedent_f1_mask_all.append(eval_antecedent_f1_mask)
eval_antecedent_f1_masks_only_all.append(eval_antecedent_f1_masks_only)
if with_gold_mentions:
eval_summary_mask, eval_f1_mask, eval_antecedent_f1_mask, eval_antecedent_f1_masks_only = output_gold
eval_f1_mask_all_gold.append(eval_f1_mask)
eval_antecedent_f1_mask_all_gold.append(eval_antecedent_f1_mask)
eval_antecedent_f1_masks_only_all_gold.append(eval_antecedent_f1_masks_only)
i +=1
if not trained_with_gold:
eval_f1_mask = np.average(eval_f1_mask_all)
eval_antecedent_f1_mask = np.average(eval_antecedent_f1_mask_all)
eval_antecedent_f1_masks_only = np.average(eval_antecedent_f1_masks_only_all)
print('With predicted mentions')
print('Average F1 coreference with masks', eval_f1_mask)
print('Average F1 antecedent with masks', eval_antecedent_f1_mask)
print('Average F1 antecedent only on masks', eval_antecedent_f1_masks_only)
eval_f1_mask = np.average(eval_f1_mask_all_gold)
eval_antecedent_f1_mask = np.average(eval_antecedent_f1_mask_all_gold)
eval_antecedent_f1_masks_only = np.average(eval_antecedent_f1_masks_only_all_gold)
print('With gold mentions')
print('Average F1 coreference with masks', eval_f1_mask)
print('Average F1 antecedent with masks', eval_antecedent_f1_mask)
print('Average F1 antecedent only on masks', eval_antecedent_f1_masks_only)
output_file.close()