/
reporter.py
435 lines (348 loc) · 18.7 KB
/
reporter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
# Copyright (C) 2012 Robert Lanfear and Brett Calcott
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details. You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# PartitionFinder also includes the PhyML program, the RAxML program, and the
# PyParsing library, all of which are protected by their own licenses and
# conditions, using PartitionFinder implies that you agree with those licences
# and conditions as well.
import logtools
import pandas
log = logtools.get_logger()
from config import the_config
from model_utils import *
import os
scheme_header_template = "%-18s: %s\n"
scheme_subset_template = "%-6s | %-10s | %-10s | %-32s | %-100s\n"
subset_template = "%-15s | %-15s | %-15s | %-15s | %-15s | %-15s\n"
# We write different output for these searches
_odd_searches = ['kmeans', 'krmeans']
_scheme_data_csv = 'scheme_data.csv'
class TextReporter(object):
def __init__(self, config):
self.cfg = config
self.cfg.reporter = self
self.header_done = False
def write_subset_summary(self, sub):
pth = os.path.join(self.cfg.subsets_path, sub.subset_id + '.txt')
# Sort everything
cols = ['model_id', 'params', 'lnl', 'aicc', 'aic', 'bic']
descr = self.cfg.data_layout.data_type.descr
indices = dict([(t[0], i) for i, t in enumerate(descr) if t[0] in cols])
sorted_results = [(row['aicc'], row) for row in sub.result_array]
sorted_results.sort()
output = open(pth, 'w')
output.write("Model selection results for subset: %s\n" % sub.subset_id)
if sub.alignment_path:
output.write("Subset alignment stored here: %s\n" % sub.alignment_path)
if the_config.search not in _odd_searches:
output.write("This subset contains the following data_blocks: %s\n" % sub.name)
output.write("Number of columns in subset: %d\n" % len(sub.columns))
output.write("Models are organised according to their AICc scores\n\n")
output.write(subset_template % ("Model", "Parameters", "lnL", "AICc", "AIC", "BIC"))
for aicc, row in sorted_results:
output.write(subset_template % (row[indices['model_id']],
row[indices['params']],
row[indices['lnl']],
row[indices['aicc']],
row[indices['aic']],
row[indices['bic']]))
def write_scheme_summary(self, sch, result):
pth = os.path.join(self.cfg.schemes_path, sch.name + '.txt')
output = open(pth, 'w')
self.output_scheme(sch, result, output)
summary_pth = os.path.join(self.cfg.schemes_path, _scheme_data_csv)
summary_output = open(summary_pth, "a")
self.add_scheme_to_csv(sch, result, summary_output)
def add_scheme_to_csv(self, sch, result, summary_output):
if not self.header_done:
summary_output.write('name,sites,lnL,parameters,subsets,aic,aicc,bic\n')
self.header_done = True
summary_output.write('%s,%d,%.2f,%d,%d,%.2f,%.2f,%.2f\n'
%(sch.name,
result.nsites,
result.lnl,
result.sum_k,
result.nsubs,
result.aic,
result.aicc,
result.bic
)
)
def output_scheme(self, sch, result, output):
self.write_scheme_header(sch, result, output)
sorted_subsets = [sub for sub in sch]
sorted_subsets.sort(key=lambda sub: min(sub.columns), reverse=False)
self.write_subsets(sch, result, output, sorted_subsets)
self.write_nexus_summary(output, sorted_subsets)
self.write_raxml(sch, result, output, sorted_subsets)
if self.cfg.datatype != "morphology":
self.write_mrbayes(sch, result, output, sorted_subsets)
def write_scheme_header(self, sch, result, output):
output.write(scheme_header_template % ("Scheme Name", sch.name))
output.write(scheme_header_template % ("Scheme lnL", result.lnl))
if self.cfg.model_selection == "aic":
output.write(scheme_header_template % ("Scheme AIC", result.aic))
if self.cfg.model_selection == "aicc":
output.write(scheme_header_template % ("Scheme AICc", result.aicc))
if self.cfg.model_selection == "bic":
output.write(scheme_header_template % ("Scheme BIC", result.bic))
output.write(scheme_header_template % ("Number of params", result.sum_k))
output.write(scheme_header_template % ("Number of sites", result.nsites))
output.write(scheme_header_template % ("Number of subsets", result.nsubs))
output.write("\n")
def write_nexus_summary(self, output, sorted_subsets):
output.write("\n\nNexus formatted character sets\n")
output.write("begin sets;\n")
subset_number = 1
charpartition = []
for sub in sorted_subsets:
if self.cfg.search in _odd_searches:
sites = [x + 1 for x in sub.columns]
partition_sites = str(sites).strip('[]')
else:
partition_sites = sub.site_description_no_commas
output.write("\tcharset Subset%s = %s;\n" % (subset_number, partition_sites))
charpartition.append("Group%s:Subset%s" % (subset_number, subset_number))
subset_number += 1
output.write('\tcharpartition PartitionFinder = %s;\n' % ', '.join(charpartition))
output.write('end;\n')
def write_subsets(self, sch, result, output, sorted_subsets):
output.write(scheme_subset_template % (
"Subset", "Best Model", "# sites", "subset id", "Partition names"))
number = 1
# a way to print out the scheme in PF format
pf_scheme_description = []
if self.cfg.search in _odd_searches:
for sub in sorted_subsets:
num_sites = len(sub.columns)
sites = [x + 1 for x in sub.columns]
pf_scheme_description.append("(%s)" % str(sites).strip('[]'))
output.write(scheme_subset_template % (
number,
sub.best_model,
num_sites,
sub.subset_id,
'NA',
))
number += 1
else:
for sub in sorted_subsets:
pf_scheme_description.append("(%s)" % sub.name)
output.write(scheme_subset_template % (
number,
sub.best_model,
len(sub.columns),
sub.subset_id,
sub.name,
))
number += 1
pf_scheme_description = " ".join(pf_scheme_description)
output.write("\n\nScheme Description in PartitionFinder format\n")
output.write("Scheme_%s = %s;" % (sch.name, pf_scheme_description))
def write_raxml(self, sch, result, output, sorted_subsets):
self.write_raxml_warning(output)
write_raxml_partitions(sch, output, sorted_subsets)
def write_raxml_warning(self, output):
output.write("\n\nRaxML-style partition definitions\n")
output.write("Warning: RAxML allows for only a single model of rate"
" heterogeneity in partitioned analyses. I.e. all "
"partitions must be assigned one of three types of model:"
" No heterogeneity (e.g. GTR); +G (e.g. GTR+G); or "
"+I+G (e.g. GTR+I+G). If the best models for your dataset"
"contain different types of model for different subsets "
"you will need to decide on "
"the best rate heterogeneity model before you run "
"RAxML. If you prefer to do things more rigorously, "
"you can run separate PartitionFinder analyses for each type "
"of rate heterogenetity "
"Then choose the scheme with the lowest AIC/AICc/BIC score. "
"Note that these re-runs will be quick!\n\n"
)
def write_mrbayes(self, sch, result, output, sorted_subsets):
"""Print out partition definitions in MrBayes format, might be
useful to some people
"""
output.write("\n\nMrBayes block for partition definitions\n")
output.write("Warning: MrBayes only allows a relatively small "
"collection of models. If any model in your analysis is not one that "
"is included in MrBayes (e.g. by setting nst = 1, 2, or "
"6 for DNA sequences; or is not in the available list of protein models for MrBayes)"
"then this MrBayes block will just set that model "
"to nst = 6 for DNA, or 'wag' for Protein. Similarly, the only additional parameters "
"that this MrBayes block will include are +I and +G. Other "
" parameters, such as +F and +X, are ignored. "
"If you want to use this MrBayes block for your analysis, "
"please make sure to check it carefully before you use it "
"we've done our best to make it accurate, but there may "
"be errors that remain!\n\n"
)
output.write("begin mrbayes;\n\n")
subset_number = 1
charpartition = []
for sub in sorted_subsets:
if self.cfg.search in _odd_searches:
sites = [x + 1 for x in sub.columns]
partition_sites = str(sites).strip('[]').replace(',','')
else:
partition_sites = sub.site_description_no_commas
output.write("\tcharset Subset%s = %s;\n" % (subset_number, partition_sites))
charpartition.append("Subset%s" % (subset_number))
subset_number += 1
output.write('\n\tpartition PartitionFinder = %d:%s;\n' %(len(charpartition), ', '.join(charpartition)))
output.write('\tset partition=PartitionFinder;\n\n')
subset_number = 1
for sub in sorted_subsets:
if self.cfg.datatype == "DNA":
model_text = get_mrbayes_modeltext_DNA(sub.best_model, subset_number)
elif self.cfg.datatype == "protein":
model_text = get_mrbayes_modeltext_protein(sub.best_model, subset_number)
else:
raise RuntimeError
output.write(model_text)
subset_number += 1
if len(sorted_subsets)>1:
output.write('\n\tprset applyto=(all) ratepr=variable;\n')
output.write('\tunlink statefreq=(all) revmat=(all) shape=(all) pinvar=(all) tratio=(all);\n')
if the_config.branchlengths == 'unlinked':
output.write('\tunlink brlens=(all);\n')
output.write('\nend;\n')
def write_best_scheme(self, result):
pth = os.path.join(self.cfg.output_path, 'best_scheme.txt')
output = open(pth, 'wb')
output.write('Settings used\n\n')
output.write(scheme_header_template % ("alignment", self.cfg.alignment_path))
output.write(scheme_header_template % ("branchlengths", self.cfg.branchlengths))
output.write(scheme_header_template % ("models", ', '.join(self.cfg.models)))
output.write(scheme_header_template % ("model_selection",
self.cfg.model_selection))
output.write(scheme_header_template % ("search", self.cfg.search))
if self.cfg.search in ["rcluster", "hcluster", "rclusterf"]:
pretty_weights = "rate = %s, base = %s, model = %s, alpha = %s" %(
str(self.cfg.cluster_weights["rate"]),
str(self.cfg.cluster_weights["freqs"]),
str(self.cfg.cluster_weights["model"]),
str(self.cfg.cluster_weights["alpha"]))
output.write(scheme_header_template % ("weights", pretty_weights))
if self.cfg.search.startswith("rcluster"):
output.write(scheme_header_template % ("rcluster-percent",
self.cfg.cluster_percent))
output.write(scheme_header_template % ("rcluster-max",
self.cfg.cluster_max))
if self.cfg.search == "kmeans" or self.cfg.search == "krmeans":
output.write(scheme_header_template % ("min-subset-size",
self.cfg.min_subset_size))
output.write(scheme_header_template % ("kmeans based on",
self.cfg.kmeans))
if self.cfg.all_states == True:
output.write(scheme_header_template % ("--all_states setting used",
self.cfg.all_states))
output.write('\n\nBest partitioning scheme\n\n')
self.output_scheme(result.best_scheme, result.best_result, output)
log.info("Information on best scheme is here: %s", pth)
citation_text = write_citation_text(self)
# now we write subset summaries for all the subsets in the best scheme
for s in result.best_scheme:
self.write_subset_summary(s)
log.info("\n")
log.info("\n")
for c in citation_text:
log.info("%s", c)
output.write(c)
def write_raxml_partitions(sch, output, sorted_subsets, use_lg = False):
"""Print out partition definitions in RaxML-like format, might be
useful to some people
"""
subset_number = 1
for sub in sorted_subsets:
if the_config.search in _odd_searches:
sites = [x + 1 for x in sub.columns]
partition_sites = str(sites).strip('[]')
else:
partition_sites = sub.site_description
if the_config.datatype == "DNA":
model = 'DNA'
elif the_config.datatype == "protein":
if use_lg == False:
model = get_raxml_protein_modelstring(sub.best_model)
else:
model = get_raxml_protein_modelstring("LG+G")
elif the_config.datatype == "morphology":
model = get_raxml_morphology_modelstring(sub.best_model)
else:
raise RuntimeError
output.write("%s, Subset%s = %s" % (model, subset_number, partition_sites))
output.write("\n")
subset_number += 1
def write_citation_text(self):
"""Tell users which papers to cite"""
citation_text = []
ref_PF2 = ("Lanfear, R., Frandsen, P., Wright, A., Senfeldova, T., Calcott, B. (2016) Forthcoming. "
"PartitionFinder 2: new methods for selecting partitioned models of evolution for"
"molecular and morphological phylogenetic analyses. "
"Submitted to MBE.")
ref_PF1 = ("Lanfear, R., Calcott, B., Ho, S. Y., & Guindon, S. (2012). "
"PartitionFinder: combined selection of partitioning schemes "
"and substitution models for phylogenetic analyses. "
"Molecular biology and evolution, 29(6), 1695-1701.")
ref_rcluster = ("Lanfear, R., Calcott, B., Kainer, D., Mayer, C., "
"& Stamatakis, A. (2014). Selecting optimal "
"partitioning schemes for phylogenomic datasets. "
"BMC evolutionary biology, 14(1), 82.")
ref_rclusterf = ref_rcluster
ref_kmeans = ("Frandsen, P. B., Calcott, B., Mayer, C., & Lanfear, R. "
"(2015). Automatic selection of partitioning schemes for "
"phylogenetic analyses using iterative k-means clustering "
"of site rates. BMC Evolutionary Biology, 15(1), 13.")
ref_phyml = ("Guindon, S., Dufayard, J. F., Lefort, V., Anisimova, M., "
"Hordijk, W., & Gascuel, O. (2010). New algorithms and "
"methods to estimate maximum-likelihood phylogenies: "
"assessing the performance of PhyML 3.0. "
"Systematic biology, 59(3), 307-321.")
ref_raxml = ("Stamatakis, A. (2014). RAxML version 8: a tool for "
"phylogenetic analysis and post-analysis of large phylogenies. "
"Bioinformatics, 30(9), 1312-1313.")
ref_morph = ("Lewis, P. O. (2001). A likelihood approach to estimating "
"phylogeny from discrete morphological character data. "
"Systematic biology, 50(6), 913-925.")
citation_text.append("\n\n\n*Citations for this analysis*\n")
citation_text.append("-----------------------------")
citation_text.append("\n")
citation_text.append("If you use this analysis in your published "
"work, please cite "
"the following papers on which your analysis relied.\n")
citation_text.append("\n")
citation_text.append("For the version of PartitionFinder you used, "
"please cite:\n")
citation_text.append("%s\n" % ref_PF2)
citation_text.append("\n")
citation_text.append("For the %s algorithm you used, please cite:\n"
% (self.cfg.search))
if self.cfg.search == "rcluster" or self.cfg.search == "hcluster":
citation_text.append("%s\n" % ref_rcluster)
elif self.cfg.search == "kmeans" or self.cfg.search == "krmeans":
citation_text.append("%s\n" % ref_kmeans)
elif self.cfg.search == "greedy":
citation_text.append("%s\n" % ref_PF1)
elif self.cfg.search == "rclusterf":
citation_text.append("%s\n" % ref_rclusterf)
citation_text.append("\n")
if self.cfg.phylogeny_program == 'phyml':
citation_text.append("Your analysis also used PhyML, so please cite:\n")
citation_text.append("%s\n" % ref_phyml)
elif self.cfg.phylogeny_program == 'raxml':
citation_text.append("Your analysis also used RAxML, so please cite:\n")
citation_text.append("%s\n" % ref_raxml)
citation_text.append("\n")
if self.cfg.datatype == 'morphology':
citation_text.append("For the model of morphological evolution you used, please cite:\n")
citation_text.append("%s\n" % ref_morph)
return citation_text