This repository has been archived by the owner on Mar 22, 2022. It is now read-only.
forked from chrisjbryant/errant
-
Notifications
You must be signed in to change notification settings - Fork 4
/
toolbox.py
176 lines (164 loc) · 5.9 KB
/
toolbox.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
from spacy.tokens import Doc
import regex as re
# Load latest Hunspell dictionaries:
def loadDictionary(path):
    """Load a whitespace-separated word list into a set.

    Input: Path to a Hunspell-style dictionary file.
    Output: A set of word strings (set gives O(1) membership tests).
    """
    # Use a context manager so the file handle is closed promptly
    # (the original leaked the handle by never closing it).
    with open(path) as dict_file:
        return set(dict_file.read().split())
# Load Stanford Universal Tags map file.
def loadTagMap(path, args):
    """Load a tab-separated tag mapping file into a dict.

    Input 1: Path to a TSV file; each line is "<treebank_tag>\t<universal_tag>".
    Input 2: Parsed command-line args; only args.lang is read here.
    Output: A dict mapping treebank tag -> universal tag, with English-specific
        adjustments and extra spaCy PTB tags added when args.lang == 'en'.
    """
    map_dict = {}
    # Context manager closes the file (the original leaked the handle),
    # and iterating the handle avoids materialising all lines at once.
    with open(path) as map_file:
        for line in map_file:
            fields = line.strip().split("\t")
            # Skip blank lines (e.g. a trailing newline at EOF) instead of
            # crashing with an IndexError.
            if len(fields) < 2:
                continue
            tag = fields[1].strip()
            if args.lang == 'en':
                # Change ADP to PREP; makes it clearer
                if tag == "ADP":
                    tag = "PREP"
                # Also change PROPN to NOUN; we don't need a prop noun tag
                elif tag == "PROPN":
                    tag = "NOUN"
            map_dict[fields[0]] = tag
    # Add some spacy PTB tags not in the original mapping.
    if args.lang == 'en':
        map_dict['""'] = "PUNCT"
        map_dict["SP"] = "SPACE"
        map_dict["ADD"] = "X"
        map_dict["GW"] = "X"
        map_dict["NFP"] = "X"
        map_dict["XX"] = "X"
    return map_dict
# Input: A sentence + edit block in an m2 file.
# Output 1: The original sentence (a list of tokens)
# Output 2: A dictionary; key is coder id, value is a tuple.
# tuple[0] is the corrected sentence (a list of tokens), tuple[1] is the edits.
# Process M2 to extract sentences and edits.
def processM2(info):
    lines = info.split("\n")
    # First line is the source sentence; [2:] drops the leading "S ".
    orig_sent = lines[0][2:].split()
    # Remaining lines are edit annotations, grouped per coder.
    edits_by_coder = processEdits(lines[1:])
    out_dict = {}
    # Build a corrected sentence for each coder by applying their edits.
    for coder, coder_edits in edits_by_coder.items():
        cor_sent = list(orig_sent)
        gold_edits = []
        shift = 0  # running length change caused by the edits applied so far
        for e in coder_edits:
            # noop and Um edits are recorded (with dummy spans) but not applied.
            if e[2] in ("noop", "Um"):
                gold_edits.append(e + [-1, -1])
                continue
            replacement = e[3].split()
            # Map the original span into the partially-corrected sentence.
            cor_start = e[0] + shift
            cor_end = cor_start + len(replacement)
            cor_sent[cor_start:e[1] + shift] = replacement
            # Update the shift for subsequent edits.
            shift += len(replacement) - (e[1] - e[0])
            # Store the edit together with its corrected-side span.
            gold_edits.append(e + [cor_start, cor_end])
        out_dict[coder] = (cor_sent, gold_edits)
    return orig_sent, out_dict
# Input: A list of edit lines for a sentence in an m2 file.
# Output: An edit dictionary; key is coder id, value is a list of edits.
def processEdits(edits):
    """Parse raw M2 "A ..." edit lines and group them by coder id.

    Each parsed edit is [start, end, category, correction].
    """
    edit_dict = {}
    for edit in edits:
        fields = edit.split("|||")
        # fields[0] is e.g. "A 3 5"; [2:] ignores the leading "A ".
        span = fields[0][2:].split()
        start = int(span[0])
        end = int(span[1])
        cat = fields[1]
        cor = fields[2]
        # The last field is the coder id. ("coder", not "id": the original
        # shadowed the builtin id().)
        coder = fields[-1]
        proc_edit = [start, end, cat, cor]
        # setdefault replaces the original membership-test-then-append pattern.
        edit_dict.setdefault(coder, []).append(proc_edit)
    return edit_dict
# Input 1: A list of token strings in a sentence.
# Input 2: A preloaded Spacy processing object.
# Annotate tokens with POS, lemma and parse info.
def applySpacy(input_sent, nlp, args, treetagger=None):
    """Tokenise/tag/parse a sentence with spaCy, optionally refining with TreeTagger.

    input_sent: the sentence; raw text if args.tok is set, otherwise
        pre-tokenised text where tokens are separated by single spaces.
    nlp: a loaded spaCy pipeline. NOTE(review): nlp.tagger(sent)/nlp.parser(sent)
        is the spaCy 1.x component-call style — confirm the installed spaCy
        version supports it.
    args: parsed CLI args; args.tok and args.lang are read here.
    treetagger: optional treetaggerwrapper.TreeTagger instance; when given,
        its lemmas (and tags, for tokens spaCy left untagged) are used.
    Returns the annotated spaCy Doc.
    """
    # Convert tokens to spacy tokens and POS tag and parse.
    if args.tok:
        # Let spaCy do its own tokenisation on the raw text.
        sent = nlp(input_sent)
    else:
        # Respect the existing tokenisation: build a Doc from the split tokens,
        # then run tagger and parser manually.
        sent = Doc(nlp.vocab, input_sent.split())
        nlp.tagger(sent)
        nlp.parser(sent)
    if treetagger:
        # Imported lazily so the dependency is only needed when used.
        import treetaggerwrapper
        tokens = []
        if args.tok:
            tokens = [token.text for token in sent]
        else:
            tokens = input_sent.split()
        # tagonly=True: TreeTagger must not re-tokenise; one token per line.
        tags = treetaggerwrapper.make_tags(treetagger.tag_text("\n".join(tokens) + "\n", tagonly=True))
        # Only merge if TreeTagger produced exactly one tag per token;
        # otherwise the alignment would be wrong, so skip silently.
        if len(tokens) == len(tags):
            for i in range(0, len(tags)):
                # use treetagger lemmas
                if isinstance(tags[i], treetaggerwrapper.Tag):
                    sent[i].lemma_ = tags[i].lemma
                # if spacy provides an empty tag (as for —),
                # use treetagger tag, with an exception for German punctuation
                if sent[i].tag_ == "":
                    sent[i].tag_ = tags[i].pos
                    # \p{P} (regex module) matches any Unicode punctuation.
                    if re.match(r'^\p{P}+$', sent[i].text):
                        if args.lang == "de":
                            # "$(" is the STTS tag for sentence-internal punctuation.
                            sent[i].tag_ = "$("
    # check (again) for empty tags (as for —),
    # check for punctuation, otherwise use XX
    for tok in sent:
        if tok.tag_ == "":
            tok.tag_ = "XX"
            if re.match(r'^\p{P}+$', tok.text):
                if args.lang == "en":
                    tok.tag_ = ":"
                elif args.lang == "de":
                    tok.tag_ = "$("
    return sent
# Input 1: An edit list. [orig_start, orig_end, cat, cor, cor_start, cor_end]
# Input 2: An original SpaCy sentence.
# Input 3: A corrected SpaCy sentence.
# Output: A minimised edit with duplicate words on both sides removed.
# E.g. [was eaten -> has eaten] becomes [was -> has]
def minimiseEdit(edit, orig, cor):
    src_span = orig[edit[0]:edit[1]]
    trg_span = cor[edit[4]:edit[5]]
    # Strip the longest common prefix, advancing both start offsets.
    while src_span and trg_span:
        if src_span[0].text != trg_span[0].text:
            break
        src_span, trg_span = src_span[1:], trg_span[1:]
        edit[0], edit[4] = edit[0] + 1, edit[4] + 1
    # Strip the longest common suffix, pulling both end offsets in.
    while src_span and trg_span:
        if src_span[-1].text != trg_span[-1].text:
            break
        src_span, trg_span = src_span[:-1], trg_span[:-1]
        edit[1], edit[5] = edit[1] - 1, edit[5] - 1
    # If anything remains on either side, rebuild the correction string.
    if src_span or trg_span:
        edit[3] = " ".join(tok.text for tok in trg_span)
    return edit
# Input 1: An edit list = [orig_start, orig_end, cat, cor, cor_start, cor_end]
# Input 2: A coder id for the specific annotator.
# Output: An edit in m2 file format.
def formatEdit(edit, coder_id=0):
    # Assemble the "A <start> <end>" span, then join all m2 fields with "|||".
    fields = [
        f"A {edit[0]} {edit[1]}",
        edit[2],
        edit[3],
        "REQUIRED",
        "-NONE-",
        str(coder_id),
    ]
    return "|||".join(fields)
def formatAnnotation(sent):
    # Render each token as orth/tag/pos/lemma/dep and join with spaces.
    parts = []
    for tok in sent:
        parts.append("/".join((tok.orth_, tok.tag_, tok.pos_, tok.lemma_, tok.dep_)))
    return " ".join(parts)