-
Notifications
You must be signed in to change notification settings - Fork 7
/
python_cynical_wrapper.py
412 lines (340 loc) · 19.1 KB
/
python_cynical_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#!/usr/bin/env python3
'''
This is a Python re-implementation of
amittai-cynical-wrapper.sh
amittai-vocab-compute-counts.pl
amittai-vocab-ratios.pl
from https://github.com/amittai/cynical
It remains dependant on the original
amittai-cynical-selection.pl
But, it allows the user to run cynical data
selection as a python module
The original work is by amittai axelrod
The port is by Steve Sloto
'''
import os
import re
import shutil
import subprocess
import sys
from collections import Counter
import datetime
import copy
def cynical_selection(repr_lines, avail_lines, seed_lines=[],
batch_mode=False, keep_boring=True,
save_memory=True, lower=True, debug=True,
min_count=3, max_count=10000, num_lines=0,
outdir='/tmp/cynical_out', save_output=False,
cynical_perl_script="amittai-cynical-selection.pl"):
''' See the large argparse section in main() for info on what these parameters actually entail.
In some cases I've altered the defaults in this function to be more similar to bootstrapping usecase.
Note that we're assuming lists of strings for our representative, available, and seed corpora.
We're also expecting that this data is already tokenized (unless you want to run
selection on non-tokenized data for some weird reason.)
'''
#Check that the cyncial_perl_script exists and can be executed.
# Uncomment if you're using this as a pointer to a local script rather than something on the shell path
# if not os.path.exists(cynical_perl_script):
# raise FileNotFoundError("Path for cynical_perl_script does not exist!")
if shutil.which(cynical_perl_script) == None:
raise PermissionError("cynical_perl_script is not executible...")
# Set up the directory where we’re going to work (if it doesn’t exist)...
os.makedirs(outdir, mode=0o777, exist_ok=True)
# Run a few checks up front to see if any of our inputs are equivalent.
# If so, we can cut down on our pre-processing.
if debug:
sys.stderr.write("Checking whether input files are duplicates at {}\n".format(get_timestamp()))
avail_is_repr = (avail_lines == repr_lines)
seed_is_repr = (seed_lines == repr_lines)
seed_is_avail = (seed_lines == avail_lines)
# Cool, now we can change the contents of the lists away from how they were originally input.
# We'll start with avail, and follow all of the steps needed to get it ready for the perl script.
if debug:
sys.stderr.write("Sanitizing AVAIL Lines... at {}\n".format(get_timestamp()))
# cast this to a list b.c. we want to use its output twice (once to save for cynical, once for calculating mad stats)
avail_lines = list(prep_lines(avail_lines))
# We only care about having the full corpus for the available set written to file for downstream use
# For the others, we're just cleaning them up for use for vocabulary statistics.
sys.stderr.write("Writing preprocessed AVAIL corpus to disk at {}\n".format(get_timestamp()))
avail_corpus_file = os.path.join(outdir, "avail.corpus")
with open(avail_corpus_file, 'w', encoding='utf-8') as available_corpus:
for line in avail_lines:
available_corpus.write(line)
available_corpus.write('\n')
# Now we'll get our vocabulary counts, and write those to disk
if debug:
sys.stderr.write("Counting AVAIL Words... at {}\n".format(get_timestamp()))
avail_counts = get_vocab_counts(avail_lines)
if avail_is_repr:
# if we've prepped the same list already, skip the computation.
repr_counts = copy.deepcopy(avail_counts)
repr_vcb_file = avail_vcb_file
else:
# if we have a separate representative set, do the same prep minus writing the corpus to disk
if debug:
sys.stderr.write("Sanitizing REPR Lines... at {}\n".format(get_timestamp()))
repr_lines = prep_lines(repr_lines)
if debug:
sys.stderr.write("Counting REPR Words... at {}\n".format(get_timestamp()))
repr_counts = get_vocab_counts(repr_lines)
# Similar checks for whether we run computation for seed
if seed_is_repr:
seed_counts = copy.deepcopy(repr_counts)
elif seed_is_avail:
seed_counts = copy.deepcopy(avail_counts)
else:
# If we have a unique seed, we do our preprocessing and counting.
if debug:
sys.stderr.write("Sanitizing & Counting SEED Lines... at {}\n".format(get_timestamp()))
seed_lines = prep_lines(seed_lines)
seed_counts = get_vocab_counts(seed_lines)
# Since we don't care about our seed counts for any ratios, we output them now.
if debug:
sys.stderr.write("Writing SEED Words... at {}\n".format(get_timestamp()))
seed_vcb_file = os.path.join(outdir, "seed.vocab")
with open(seed_vcb_file, 'w', encoding='utf-8') as seed_vcb_out:
for line in seed_counts:
seed_vcb_out.write("\t".join([str(x) for x in line]))
seed_vcb_out.write("\n")
if debug:
sys.stderr.write("{} -- All Corpora have been preprocessed, vocab files written to disk\n".format(get_timestamp()))
sys.stderr.write("{} Calculating vocabulary ratio...\n".format(get_timestamp()))
ratios = get_vocab_ratios(repr_counts, avail_counts)
# We output vocabulary ratios, along with counts for REPR & AVAIL simultaneously...
sys.stderr.write("{} -- Writing vocabulary ratios & avail/repr counts to disk...\n".format(get_timestamp()))
vcb_ratio_file = os.path.join(outdir, "repr.avail.vocab.ratios")
repr_vcb_file = os.path.join(outdir, "repr.vocab")
avail_vcb_file = os.path.join(outdir, "avail.vocab")
with open(vcb_ratio_file, 'w', encoding='utf-8') as ratios_out, \
open(repr_vcb_file, 'w', encoding='utf-8') as repr_counts_out, \
open(avail_vcb_file, 'w', encoding='utf-8') as avail_counts_out:
for word_name, word_stats in ratios.items():
# count file formats are word <tab> probability <tab> count
if word_stats.corpus_1_count is not None:
repr_counts_out.write('{}\t{}\t{}\n'.format(word_name, str(word_stats.corpus_1_probability), str(word_stats.corpus_1_count)))
if word_stats.corpus_2_count is not None:
avail_counts_out.write('{}\t{}\t{}\n'.format(word_name, str(word_stats.corpus_2_probability), str(word_stats.corpus_2_count)))
# ratio file is also a *.tsv, order of columns besides word_name is provided in get_value_list method of WordProbabilityInformation
ratios_out.write(word_name)
ratios_out.write('\t')
ratios_out.write('\t'.join([str(value) for value in word_stats.get_value_list()]))
ratios_out.write('\n')
#Delete the things that will eat memory
del repr_counts
del avail_counts
del seed_counts
del repr_lines
del seed_lines
jaded_file = os.path.join(outdir, "jaded.output")
#Now begins the process of writing up our command-line arguments for calling the thing
#shlex says we want our calls to look like this:
cynical_args=[cynical_perl_script,
'--task_vocab='+repr_vcb_file,
'--unadapt_vocab='+avail_vcb_file,
'--available='+avail_corpus_file,
'--seed_vocab='+seed_vcb_file,
'--working_dir='+outdir,
'--stats='+vcb_ratio_file,
'--jaded='+jaded_file,
'--mincount='+str(min_count),
]
if num_lines > 0:
cynical_args.append('--numlines='+str(num_lines))
if max_count > 0:
cynical_args.append('--maxcount='+str(max_count))
if batch_mode == True:
cynical_args.append('--batchmode')
if keep_boring == True:
cynical_args.append('--keep_boring')
if save_memory == True:
cynical_args.append('--save_memory')
#Call our perl script
if debug:
sys.stderr.write("Calling cynical perl script... at {}\n".format(get_timestamp()))
sys.stderr.write("\n ".join(cynical_args)+"\n")
with subprocess.Popen(cynical_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as cynical_proc:
with open(outdir+"/cynical.stderr", "wb") as stderr, open(outdir+"/cynical.stdout", "wb") as stdout:
stderr.write(cynical_proc.stderr.read())
stdout.write(cynical_proc.stdout.read())
#Assuming all went well above, unify our scores / line order with our original avail list
cynical_out = read_jaded(jaded_file, avail_lines)
if save_output == False:
#nuke our output files. We have scores, and I want the world to burn!
shutil.rmtree(outdir, ignore_errors=True)
else:
if debug:
sys.stderr.write("We've finished with our selection, output is saved to "+jaded_file+"\n")
return cynical_out
def get_timestamp():
return datetime.datetime.now().isoformat(" ")
none_to_zero = lambda x: 0 if x is None else x
class WordProbabilityInformation():
'''Utility object for holding corpus-level stats about word probabilities'''
def __init__(self):
self.corpus_1_probability = None
self.corpus_1_count = None
self.corpus_2_probability = None
self.corpus_2_count = None
self.ratio = None
def add_corpus_1_info(self, probability, count):
self.corpus_1_probability = probability
self.corpus_1_count = count
def add_corpus_2_info(self, probability, count):
self.corpus_2_probability = probability
self.corpus_2_count = count
def compute_ratio(self):
# if we have probabilities for corpus_1 and corpus_2, then we ratio those
if self.corpus_1_probability is not None and self.corpus_2_probability is not None:
self.ratio = self.corpus_1_probability / self.corpus_2_probability
# if we have probabilities for only corpus 2
elif self.corpus_2_count is not None:
self.ratio = 1/(2 * self.corpus_2_count)
# if we have probabilities for only corpus 1
elif self.corpus_1_count is not None:
self.ratio = 2 * self.corpus_1_count
def get_value_list(self):
'''Returns cleaned list of values in format used by vocab ratio file.'''
return [self.ratio,
none_to_zero(self.corpus_1_probability),
none_to_zero(self.corpus_1_count),
none_to_zero(self.corpus_2_probability),
none_to_zero(self.corpus_2_count)]
def prep_lines(corpus, lower=True):
''' Takes in a corpus (list of str), does the hackneyed preprocessing for downsteam perl/cynical requirements'''
# "match at least two underscores preceded by space or start-of-line, and mark them"
corpus = map(lambda x: re.sub(r'(\s|^)(__+)', r'\1@\2@', x), corpus)
# Get rid of non-breaking space and unpleasant runs of spaces
corpus = map(lambda x: re.sub(r'\s+', ' ', x), corpus)
# strip other whitespace
corpus = map(lambda x: x.strip(), corpus)
# optional lowercase
if lower:
corpus = map(lambda x: x.lower(), corpus)
return corpus
def get_vocab_counts(corpus):
''' Reads in a list of sentences, returns a list of tuples of (word, probability, count)
sorted by descending frequency. This is a reimplementation of 'amittai-vocab-compute-counts.pl' '''
vocab_counts = Counter()
for line in corpus:
for word in line.split():
vocab_counts[word] += 1
vocab_size = sum(vocab_counts.values())
# We are /nearly there/ but we want to output words, probabilities, and counts rather than words and counts
reformat_line = lambda wordcount: (wordcount[0], float(wordcount[1]) / vocab_size, wordcount[1])
return map(reformat_line, vocab_counts.most_common())
def get_vocab_ratios(vc1, vc2):
''' Roughly speaking, this is a reimplemntation of 'amittai-vocab-ratios.pl'
To summarize/plagarize Amittai:
Takes two <lists of tuples>, produced by <get_vocab_counts>
each item consists of a words, a probability, and the count of the word in a corpus. Each file is
thus a unigram language model with count information. <Function> merges
the information <together into a dictionary of dictionaries that can be writ for downstream use.>
Usually used with model1=in-domain task and model2= general
data, making the ratio prob1/prob2 "number of times more likely the
word is in the in-domain corpus than in the general corpus".
If a word is only in one corpus, than the ratio is twice what it
would have been if the word had appeared once in the other
corpus. that is, we smooth the count in the other corpus to be 0.5.
No real reason, just seems like we shouldn't ignore words that
appear lots in one corpus but not at all in the other, and it
should be more skewed than if it had been a singleton.'''
ratio_dict = {}
# compile word stats from first list
for word, prob1, count1 in vc1:
# Add probability dict entries for any words that we have...
ratio_dict[word] = WordProbabilityInformation()
ratio_dict[word].add_corpus_1_info(prob1, count1)
# compile word stats from second list
for word, prob2, count2 in vc2:
if word not in ratio_dict:
ratio_dict[word] = WordProbabilityInformation()
ratio_dict[word].add_corpus_2_info(prob2, count2)
# get ratio information for compiled words
for word_probability_information in ratio_dict.values():
word_probability_information.compute_ratio()
return ratio_dict
def read_jaded(jaded_file, avail):
''' Takes in a file of so-called jadedness (the output of cynical), as well as the raw corpus of available lines. '''
jaded_info = {}
with open(jaded_file, 'r', encoding='utf-8') as jaded:
for line in jaded:
## JADED columns: sentence {input line_id, output rank, score,
## penalty, gain, total score, the root word, WGE, and the string.
line=line.split('\t')
raw_line = avail[int(line[0])-1] #original line number
jaded_info[raw_line] = {}
jaded_info[raw_line]['input_id'] = line[0]
jaded_info[raw_line]['output_rank'] = line[1]
#output_rank is the same as the order the lines were selected
jaded_info[raw_line]['score'] = line[2]
jaded_info[raw_line]['penalty'] = line[3]
jaded_info[raw_line]['gain'] = line[4]
jaded_info[raw_line]['total_score'] = line[5]
jaded_info[raw_line]['root_word']=line[6]
jaded_info[raw_line]['WGE']=line[7]
return jaded_info
def main():
''' An example implementation of reading in files for cynical from disk and calling the core perl module
This should function as 'amittai-cynical-wrapper.sh' except with cmd line args
in place of a wrapper '''
import argparse
parser = argparse.ArgumentParser(description='Substitute for Amittai Bash Wrapper')
parser.add_argument('--repr', required=True, help="Representative Set -- The Data We're Modelling")
parser.add_argument('--avail', required=True, help="Available Set -- The Data We're Selecting From")
parser.add_argument('--seed', help="Seed Set -- Data We Have Already Selected/Used (Optional)")
parser.add_argument('--batch-mode', \
help="""batchmode selects sqrt(k) sentences per iteration. much faster, much
more approximate. essential for huge corpora, but you probably want
to disable it when picking only the very best 10,000 sentences.""", action='store_true', default=False)
parser.add_argument('--min-count', help="ignore words that appear fewer than $mincount times. default 3", type=int, default=3)
parser.add_argument('--max-count', help="""ignore words that appear more than $maxcount times, at least until
there are < $maxcount left in AVAIL. this affects how much memory
is used. default 10,000. try sqrt(N), N**(2/3), or N/1000 for
bigger relative values.""", type=int, default=0) #downstream, this 0 is interpreted as 10,000
parser.add_argument('--num-lines', help="how many lines to select? Not setting this parameter means every line will be selected.", type=int, default=0)
parser.add_argument('--keep-boring', help="Use this option to select based on words with a vocab ratio close to 1.", action="store_true", default=True)
parser.add_argument('--lower', help="Lowercase the data, and reduce vocabulary size further.", action="store_true", default=False)
parser.add_argument('--out-dir', help='Directory to write our output files to.', required=True)
parser.add_argument('--save-memory', help="Saves memory, runs slower", action="store_true", default="False")
parser.add_argument('--cyn-path', help="Location of cynical perl script.", default="amittai-cynical-selection.pl")
args = parser.parse_args()
# Give the cyncial selection function file objects so that we don't read whole corpora into memory
sys.stderr.write("Opening files at {}".format(get_timestamp()))
with open(args.repr, 'r', encoding='utf-8') as repr_file, open(args.repr, 'r', encoding='utf-8') as avail_file:
if args.seed:
seed_file = open(args.seed, 'r', encoding='utf-8')
else:
seed_file = []
cynical_selection(repr_file, avail_file, seed_file,
batch_mode=args.batch_mode, keep_boring=args.keep_boring,
lower=args.lower, min_count=args.min_count, max_count=args.max_count,
num_lines=args.num_lines,outdir=args.out_dir, save_memory=args.save_memory,
cynical_perl_script=args.cyn_path, save_output=True)
# close our seed since we're done with it
if seed_file != []:
seed_file.close()
if __name__ == "__main__":
main()
## The MIT License (MIT)
##
## Copyright 2018
##
## Permission is hereby granted, free of charge, to any person
## obtaining a copy of this software and associated documentation
## files (the "Software"), to deal # in the Software without
## restriction, including without limitation the rights # to use,
## copy, modify, merge, publish, distribute, sublicense, and/or sell #
## copies of the Software, and to permit persons to whom the Software
## is # furnished to do so, subject to the following conditions:
## The above copyright notice and this permission notice shall be
## included in all copies or substantial portions of the Software.
##
## THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
## EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
## MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
## NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
## BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
## ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
## CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
## SOFTWARE.
# eof