/
main.py
444 lines (384 loc) · 17.4 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
# Copyright (C) 2012 Robert Lanfear and Brett Calcott
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details. You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# PartitionFinder also includes the PhyML program, the RAxML program, and the
# PyParsing library, all of which are protected by their own licenses and
# conditions, using PartitionFinder implies that you agree with those licences
# and conditions as well.
__VERSION__ = "2.1.1"
import logging
import sys
import shlex
import logtools
# Configure the root logger (INFO level, "LEVEL | time | message" layout).
# This runs before the project modules further down are imported.
logging.basicConfig(
format="%(levelname)-8s | %(asctime)s | %(message)s",
level=logging.INFO
)
# Module-level logger used by the functions in this file.
log = logtools.get_logger()
from optparse import OptionParser
# We import everything here as it forces all of the debug regions to be loaded
import config
import analysis_method
import util
import reporter
import progress
import datetime
import parser
import raxml
import phyml
def debug_arg_callback(option, opt, value, theparser):
    """optparse callback that stores a comma-separated value as a list.

    The raw option string is split on ',' and the resulting list is stored
    on the parser's values object under the option's destination name.
    The ``opt`` argument is part of the callback signature but is not used.
    """
    region_names = value.split(',')
    setattr(theparser.values, option.dest, region_names)
def get_debug_regions():
    """Return the names held in the logging manager's logger dictionary.

    These are the logger names that the logging module knows about at the
    time of the call; they double as the valid "debug regions".
    """
    return logging.Logger.manager.loggerDict.keys()
def set_debug_regions(regions):
    """Switch the requested debug regions (logger names) to DEBUG level.

    Args:
        regions: iterable of logger names, or None to do nothing. If it
            contains 'all', every known region is enabled.

    Returns:
        A set of the names that are not known regions, in which case no
        logger is changed; otherwise None.
    """
    if regions is None:
        return

    known = set(get_debug_regions())
    if 'all' in regions:
        regions = known
    else:
        regions = set(regions)
        rejected = [name for name in regions if name not in known]
        for name in rejected:
            log.error("'%s' is not a valid debug region", name)
        if rejected:
            return set(rejected)

    for name in regions:
        logging.getLogger(name).setLevel(logging.DEBUG)

    # Debug output is in use, so add the logger name to the message layout
    # of the root logger's first handler.
    verbose_format = logging.Formatter(
        "%(levelname)-8s | %(asctime)s | %(name)-10s | %(message)s")
    root_logger = logging.getLogger("")
    root_logger.handlers[0].setFormatter(verbose_format)

    return None
def parse_args(datatype, cmdargs=None):
    """Build the command-line parser, parse the arguments and validate them.

    Args:
        datatype: stored unchanged on the returned options as
            ``options.datatype``.
        cmdargs: list of argument strings to parse. When None, optparse
            falls back to ``sys.argv[1:]``.

    Returns:
        The ``(options, args)`` pair produced by optparse. When no
        positional argument (the folder) is given, the help text is printed
        and check_options is skipped, so ``args`` is empty and settings
        derived by check_options are not present on ``options``.
    """
    usage = """usage: python %prog [options] <foldername>
PartitionFinder and PartitionFinderProtein are designed to discover optimal
partitioning schemes for nucleotide and amino acid sequence alignments.
They are also useful for finding the best model of sequence evolution for datasets.
The Input: <foldername>: the full path to a folder containing:
- A configuration file (partition_finder.cfg)
- A nucleotide/aa alignment in Phylip format
Take a look at the included 'example' folder for more details.
The Output: A file in the same directory as the .cfg file, named
'analysis' This file contains information on the best
partitioning scheme, and the best model for each partition
Usage Examples:
>python %prog example
Analyse what is in the 'example' sub-folder in the current folder.
>python %prog -v example
Analyse what is in the 'example' sub-folder in the current folder, but
show all the debug output
>python %prog -c ~/data/frogs
Check the configuration files in the folder data/frogs in the current
user's home folder.
>python %prog --force-restart ~/data/frogs
Deletes any data produced by the previous runs (which is in
~/data/frogs/output) and starts afresh
"""
    op = OptionParser(usage)
    op.add_option(
        "-v", "--verbose",
        action="store_true", dest="verbose",
        help="show debug logging information (equivalent to --debug-output=all)")
    op.add_option(
        "-c", "--check-only",
        action="store_true", dest="check_only",
        help="just check the configuration files, don't do any processing")
    op.add_option(
        "-f", "--force-restart",
        action="store_true", dest="force_restart",
        help="delete all previous output and start afresh (!)")
    op.add_option(
        "-p", "--processes",
        type="int", dest="processes", default=-1, metavar="N",
        help="Number of concurrent processes to use."
        " Use -1 to match the number of cpus on the machine."
        " The default is to use -1.")
    op.add_option(
        "--show-python-exceptions",
        action="store_true", dest="show_python_exceptions",
        help="If errors occur, print the python exceptions")
    op.add_option(
        "--save-phylofiles",
        action="store_true", dest="save_phylofiles",
        help="save all of the phyml or raxml output. This can take a lot of space(!)")
    op.add_option(
        "--dump-results",
        action="store_true", dest="dump_results",
        help="Dump all results to a binary file. "
        "This is only of use for testing purposes.")
    op.add_option(
        "--compare-results",
        action="store_true", dest="compare_results",
        help="Compare the results to previously dumped binary results. "
        "This is only of use for testing purposes.")
    op.add_option(
        "-q", "--quick",
        action="store_true", dest="quick", default=False,
        help="Avoid anything slow (like writing schemes at each step),"
        " useful for very large datasets."
    )
    op.add_option(
        "-r", "--raxml",
        action="store_true", dest="raxml",
        help="Use RAxML (rather than PhyML) to do the analysis. See the manual"
    )
    op.add_option(
        "-n", "--no-ml-tree",
        action="store_true", dest="no_ml_tree", default=False,
        help="Estimate a starting tree with NJ (PhyML) or MP (RaxML) instead"
        " of the default, which is to estimate a starting tree with ML"
        " using RAxML. Not recommended."
    )
    op.add_option(
        "--cmdline-extras",
        type="str", dest="cmdline_extras", default="", metavar="N",
        help="Add additional commands to the phyml or raxml commandlines that PF uses."
        " This can be useful e.g. if you want to change the accuracy of lnL calculations"
        " ('-e' option in raxml), or use multi-threaded versions of raxml that require"
        " you to specify the number of threads you will let raxml use ('-T' option in"
        " raxml). E.g. you might specify this: --cmdline-extras ' -e 2.0 -T 10 '"
        " N.B. MAKE SURE YOU PUT YOUR EXTRAS IN QUOTES, and only use this command if you"
        " really know what you're doing and are very familiar with raxml and"
        " PartitionFinder"
    )
    op.add_option(
        "--weights",
        type="str", dest="cluster_weights", default=None, metavar="N",
        help="Mainly for algorithm development. Only use it if you know what you're doing."
        " A list of weights to use in the clustering algorithms. This list allows you "
        "to assign different weights to: the overall rate for a subset, the base/amino acid "
        "frequencies, model parameters, and alpha value. This will affect how subsets are "
        "clustered together. For instance: --weights '1, 2, 5, 1', would weight "
        "the base frequencies 2x more than the overall rate, the model parameters 5x "
        "more, and the alpha parameter the same as the overall rate"
    )
    op.add_option(
        "--kmeans",
        type="str", dest="kmeans", default='entropy', metavar="type",
        help="This defines which sitewise values to use: entropy or tiger "
        "\n--kmeans entropy: use entropies for sitewise values"
        "\n--kmeans tiger: use TIGER rates for sitewise values (only valid for Morphology)"
    )
    op.add_option(
        "--rcluster-percent",
        type="float", dest="cluster_percent", default=10.0, metavar="N",
        help="This defines the proportion of possible schemes that the relaxed clustering"
        " algorithm will consider before it stops looking. The default is 10%."
        "\ne.g. --rcluster-percent 10.0"
    )
    op.add_option(
        "--rcluster-max",
        type="int", dest="cluster_max", default=-987654321, metavar="N",
        help="This defines the number of possible schemes that the relaxed clustering"
        " algorithm will consider before it stops looking. The default is to look at "
        "the larger value out of 1000, and 10 times the number of data blocks you have."
        "\ne.g. --rcluster-max 1000"
    )
    op.add_option(
        "--min-subset-size",
        type="int", dest="min_subset_size", default=False, metavar="N",
        help="This defines the minimum subset size that the kmeans and rcluster"
        " algorithm will accept. Subsets smaller than this"
        " will be merged with other subsets at the end of the algorithm"
        " (for kmeans) or at the start of the algorithm (for rcluster)."
        " See manual for details. The default value for kmeans is 100."
        " The default value for rcluster is to ignore this option."
        "\ne.g. --min-subset-size 100"
    )
    op.add_option(
        '--debug-output',
        type='string',
        action='callback',
        dest='debug_output',
        metavar="REGION,REGION,...",
        callback=debug_arg_callback,
        # Sorted so the help text lists the regions in a stable order.
        help="(advanced option) Provide a list of debug regions to output extra "
        "information about what the program is doing."
        " Possible regions are 'all' or any of {%s}."
        % ",".join(sorted(get_debug_regions()))
    )
    op.add_option(
        "--all-states",
        action="store_true", dest="all_states", default=False,
        help="In the kmeans and rcluster algorithms, this stipulates that PartitionFinder "
        "should not produce subsets that do not have all possible states present. E.g."
        " for DNA sequence data, all subsets in the final scheme must have A, C, T,"
        " and G nucleotides present. This can occasionally be useful for downstream"
        " analyses, particularly concerning amino acid datasets."
    )
    op.add_option(
        '--profile',
        action="store_true",
        help="Output profiling information after running (this will slow everything down!)")

    # optparse itself uses sys.argv[1:] when the argument list is None.
    options, args = op.parse_args(cmdargs)
    options.datatype = datatype

    # We should have one argument: the folder to read the configuration from
    if not args:
        op.print_help()
    else:
        check_options(op, options)
    return options, args
def check_options(op, options):
    """Validate the parsed options and fill in the settings derived from them.

    Args:
        op: the option parser; its ``error`` method is used to report
            invalid combinations.
        options: the parsed options object. ``options.phylogeny_program``
            is set here to 'raxml' or 'phyml'.
    """
    # Error checking
    if options.dump_results and options.compare_results:
        op.error(
            "options --dump-results and --compare-results are mutually exclusive!")

    if options.verbose:
        set_debug_regions(['all'])
    else:
        errors = set_debug_regions(options.debug_output)
        if errors is not None:
            # Sorted so the message is the same from run to run.
            bad = ",".join(sorted(errors))
            op.error("Invalid debug regions: %s" % bad)

    # Default to phyml
    if options.raxml:
        options.phylogeny_program = 'raxml'
    else:
        options.phylogeny_program = 'phyml'

    # A warning for people using the Pthreads version of RAxML
    if "-T" in options.cmdline_extras:
        log.warning("""
It looks like you're using a Pthreads version of RAxML. Be aware
that the default behaviour of PartitionFinder is to run one version
of RAxML per available processor. This might not be what you want
with Pthreads, since the minimum number of threads per RAxML run is
2 (i.e. -T 2). Make sure to limit the total number of RAxML runs
you start using the -p option in PartitionFinder. Specifically,
the total number of processors you will use with the Pthreads
version is the number you set via the -T option in
--cmdline-extras, multiplied by the number of processors you set
via the -p option in PartitionFinder. You should also be aware
that the Pthreads version of RAxML has a rare but known bug on some
platforms. This bug results in infinite likelihood values if it
happens on your dataset, PartitionFinder will give an error. In
that case you should switch back to using a single-threaded version
of RAxML, e.g. the SSE3 or AVX version. See the manual for more
info.""")
def check_python_version():
    """Log the running Python version and flag unsupported interpreters.

    Versions below 2.7 are reported as an error; versions 3.0 and above get
    a warning.

    Returns:
        0 when the interpreter is older than 2.7, otherwise None.
    """
    major, minor = sys.version_info[0], sys.version_info[1]
    # Compare (major, minor) as integers: squeezing them into a float turns
    # 3.10 into 3.1 and lets exactly 3.0 slip past a "> 3.0" test.
    version_text = "%d.%d" % (major, minor)
    log.info("You have Python version %s" % version_text)

    if (major, minor) < (2, 7):
        log.error("""
Your Python version is %s, but this program requires Python 2.7.
Please upgrade to version 2.7 by visiting www.python.org/getit,
or by following the instructions in the PartitionFinder manual.
""" % version_text)
        return 0

    if major >= 3:
        log.warning("""
Your Python version is %s. This program was not built to run
with version 3 or higher. To guarantee success, please use
Python 2.7.x""" % version_text)
def run_analysis(cfg, options):
    """Choose the analysis method for ``cfg.search`` and run it.

    After the analysis, the results are dumped if ``options.dump_results``
    is set, or else compared if ``options.compare_results`` is set.
    """
    search_method = analysis_method.choose_method(cfg.search)
    # The reporter instance is not kept here.
    # NOTE(review): presumably it registers itself with cfg - confirm.
    reporter.TextReporter(cfg)
    results = search_method(
        cfg, options.force_restart, options.processes).analyse()

    if options.dump_results:
        results.dump(cfg)
        return
    if options.compare_results:
        results.compare(cfg)
def profile_analysis(cfg, options):
    """Run run_analysis under cProfile and print the 20 costliest entries.

    The profile data is written to 'profile.output', then printed twice:
    once sorted by 'time' and once sorted by 'cumtime'.
    """
    import cProfile
    import pstats

    # The statement is evaluated against this function's locals, which is
    # where it finds ``cfg`` and ``options``.
    cProfile.runctx(
        'run_analysis(cfg, options)',
        globals(),
        locals(),
        filename='profile.output')

    stats = pstats.Stats('profile.output')
    for sort_key in ('time', 'cumtime'):
        stats.sort_stats(sort_key).print_stats(20)
def main(name, datatype, passed_args=None):
    """Parse the command line, load the configuration and run the analysis.

    Args:
        name: label written to the log banner together with the version.
        datatype: passed on to parse_args and to the configuration.
        passed_args: list of argument strings, or None to use sys.argv.

    Returns:
        2 when no folder argument was given, 0 after a successful run and
        1 when a handled error occurred. Handled errors are re-raised
        instead when --show-python-exceptions is set or when
        ``passed_args`` was supplied.
    """
    # With passed_args=None the parser reads sys.argv
    options, args = parse_args(datatype, passed_args)
    if not args:
        # parse_args has already printed the help text
        return 2

    log.info("------------- %s %s -----------------" % (name, __VERSION__))
    # Start the clock ticking (microseconds dropped)
    start_time = datetime.datetime.now().replace(microsecond=0)

    check_python_version()

    argv_used = sys.argv if passed_args is None else passed_args
    log.info("Command-line arguments used: %s" % " ".join(argv_used))

    # Re-raise handled errors for programmatic callers (call_main), or when
    # the user asked for it
    propagate = options.show_python_exceptions or passed_args is not None

    # Load, using the first argument as the folder
    try:
        # TODO: just pass the options in!
        config.the_config.init(
            datatype,
            options.phylogeny_program,
            options.save_phylofiles,
            options.cmdline_extras,
            options.cluster_weights,
            options.cluster_percent,
            options.cluster_max,
            options.kmeans,
            options.quick,
            options.min_subset_size,
            options.all_states,
            options.no_ml_tree)
        cfg = config.the_config

        # Set up the progress callback
        progress.TextProgress(cfg)
        cfg.load_base_path(args[0])

        if options.check_only:
            log.info("""
Exiting without processing (because of the
-c/--check-only option ...
""")
        else:
            analyse = profile_analysis if options.profile else run_analysis
            try:
                analyse(cfg, options)
            finally:
                # Always reset the configuration, even if the run failed
                cfg.reset()

        # Successful exit
        finish_time = datetime.datetime.now().replace(microsecond=0)
        elapsed = finish_time - start_time
        log.info("Total processing time: %s (h:m:s)" % elapsed)
        log.info("Processing complete.")
        return 0

    except util.ExternalProgramError as e:
        log.error("""A program that Partitionfinder uses failed. Output
follows, in case it's helpful for finding the problem""")
        log.error("%s", e.stdout)
        log.error("%s", e.stderr)
        if propagate:
            raise

    except util.ParseError:
        log.error("""We failed to parse the output of an external program (See
previous errors). This is probably due to an update to the
program which has changed the output.""")
        if propagate:
            raise

    except util.PartitionFinderError:
        log.error("Failed to run. See previous errors.")
        if propagate:
            raise

    except (KeyboardInterrupt, SystemExit):
        log.error("User interrupted the Program")

    return 1
def call_main(datatype, cmdline):
    """Run main() from a command line given as a single string.

    The string is split into arguments with shlex.split and handed to main
    as ``passed_args``; main's return value is discarded.
    """
    argv = shlex.split(cmdline)
    label = "<main.py:call_main(%s)>" % datatype
    main(label, datatype, argv)