results_utils.py
#!/usr/bin/env python
import os
import shutil
import zipfile
from itertools import izip
from collections import defaultdict
from functools import partial
from biom.parse import parse_biom_table
from americangut.util import check_file

# These are the data files in the American-Gut repository that are used for
# results processing
_data_files = [
    # (directory, filename)
    ('PGP', 'PGP_100nt.biom.gz'),
    ('PGP', 'PGP_100nt.txt'),
    ('HMP', 'HMPv35_100nt.biom.gz'),
    ('HMP', 'HMPv35_100nt.txt'),
    ('GG', 'GG_100nt.biom.gz'),
    ('GG', 'GG_100nt.txt'),
    ('AG', 'pgp_agp_barcodes.txt'),
    ('AG', 'BLOOM.fasta')
]

# These are the Latex templates for the different results types
_templates = {
    'fecal': 'template_gut.tex',
    'oral': 'template_oralskin.tex',
    'skin': 'template_oralskin.tex'
}

_mod1_bits = ['metadata_charts.json',
              'mod1_main.tex',
              'stacked_plots_key_taxa.txt',
              'fecal_identified.txt',
              'skin_identified.txt',
              'oral_identified.txt']


def get_path(d, f):
    """Check and get a path, or throw IOError"""
    path = os.path.join(d, f)
    check_file(path)
    return path


def get_repository_dir():
    """Get the root of the American-Gut repository"""
    expected = os.path.abspath(__file__).rsplit('/', 2)[0]
    # get_path verifies the existence of these directories
    get_path(expected, 'data')
    get_path(expected, 'latex')
    return expected


def get_repository_data():
    """Get the path to the data"""
    return get_path(get_repository_dir(), 'data')


def get_repository_latex():
    """Get the path to the latex directory"""
    return get_path(get_repository_dir(), 'latex')


def get_repository_latex_pdfs(sample_type):
    """Get the Latex static PDFs directory"""
    latex_dir = get_repository_latex()
    sample_type = sample_type.lower()
    if sample_type == 'skin':
        pdfs_dir = get_path(latex_dir, 'pdfs-oralskin')
    elif sample_type == 'oral':
        pdfs_dir = get_path(latex_dir, 'pdfs-oralskin')
    elif sample_type == 'fecal':
        pdfs_dir = get_path(latex_dir, 'pdfs-gut')
    elif sample_type == 'mod1':
        pdfs_dir = get_path(latex_dir, 'pdfs-mod1')
    else:
        raise ValueError("Unknown sample type: %s" % sample_type)
    check_file(pdfs_dir)
    return pdfs_dir


def stage_static_latex(sample_type, working_dir):
    """Copy the Latex template for sample_type into working_dir"""
    latex_dir = get_repository_latex()
    item = _templates[sample_type.lower()]
    src = get_path(latex_dir, item)
    shutil.copy(src, working_dir)


def stage_static_pdfs(sample_type, working_dir):
    """Copy the static PDFs for sample_type into working_dir"""
    pdfs_dir = get_repository_latex_pdfs(sample_type)
    for f in os.listdir(pdfs_dir):
        if f.endswith('.pdf'):
            src = get_path(pdfs_dir, f)
            shutil.copy(src, working_dir)


def _stage_static_data(working_dir, debug):
    """Copy the static data files into working_dir

    When debug is True, the *_debug variants of the data directories are used.
    """
    data_dir = get_repository_data()
    for d, f in _data_files:
        if debug:
            d = d + '_debug'
        src = get_path(get_path(data_dir, d), f)
        shutil.copy(src, working_dir)


def _stage_static_mod1(working_dir):
    """Copy the mod1 Latex bits and static PDFs into working_dir"""
    latex_dir = get_repository_latex()
    statics = get_repository_latex_pdfs('mod1')
    for f in _mod1_bits:
        src = get_path(latex_dir, f)
        shutil.copy(src, working_dir)
    for f in os.listdir(statics):
        src = get_path(statics, f)
        shutil.copy(src, working_dir)


def stage_static_files(sample_type, working_dir, debug=False):
    """Stage static files in the current working directory"""
    # stage the repository data files as well; debug selects the *_debug copies
    _stage_static_data(working_dir, debug)
    stage_static_latex(sample_type, working_dir)
    stage_static_pdfs(sample_type, working_dir)


# use participant names only if the data are available.
# NOTE: these data are _not_ part of the github repository for
# privacy reasons.
def parse_identifying_data(path, passwd, embedded_file='participants.txt'):
    """Process identifying data if available

    The expected format of the file is a password-protected zipfile that
    contains an embedded, tab-delimited file. Each line of the embedded file
    is expected to be: barcode TAB participant name.
    """
    if path is not None:
        zf = zipfile.ZipFile(path)
        zf.setpassword(passwd)
        participants = {}
        for l in zf.read(embedded_file).splitlines():
            if l.startswith('#'):
                continue
            bc, name = l.strip().split('\t')[:2]
            participants[bc] = name.replace(",", "")
        print "Using identified data!"
    else:
        participants = None
    return participants
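
# A minimal usage sketch (the zip file name, password, and barcode/name pairs
# are illustrative only; the real file is not part of the repository):
#
#     participants = parse_identifying_data('participants.zip', 'secret')
#     # participants -> {'000001111': 'Jane Doe', ...}
#
# Passing path=None returns None, in which case only unidentified results are
# generated downstream.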


def parse_previously_printed(path):
    """Returns the set of previously printed barcodes

    The format of the file to be parsed is a single column of sample barcodes
    """
    if path is not None:
        prev_printed = set([l.strip() for l in open(path)])
    else:
        prev_printed = set([])
    return prev_printed


def filter_mapping_file(in_fp, out_fp, columns_to_keep):
    """Keep only the requested columns (and rows) of a mapping file

    in_fp : the input file-like object
    out_fp : the output file-like object
    columns_to_keep : a dict keyed by column name. If the value for a column
        is None, the column is kept and all rows pass. If the value is a
        callable, the column is kept and a row is retained only when the
        callable returns True for that row's value in the column.
    """
    lines = [l.strip().split('\t') for l in in_fp]
    header = lines[0][:]
    header_lower = [x.lower() for x in header]
    # ensure SampleID is always first
    new_header = ["#SampleID"]
    indices = [0]  # always keep SampleID
    for c in columns_to_keep:
        if c.lower() not in header_lower:
            raise ValueError("Cannot find %s!" % c)
        indices.append(header_lower.index(c.lower()))
        new_header.append(c)
    columns_to_keep['#SampleID'] = None  # add for consistency
    new_lines = [new_header]
    for l in lines[1:]:
        new_line = []
        keep = True
        # fetch values from specific columns
        for column, index in zip(new_header, indices):
            value = l[index]
            if columns_to_keep[column] is None:
                new_line.append(value)
            elif not columns_to_keep[column](value):
                keep = False
                break
            else:
                new_line.append(value)
        if keep:
            new_lines.append(new_line)
    out_fp.write('\n'.join(['\t'.join(l) for l in new_lines]))
    out_fp.write('\n')
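
# A hedged usage sketch of the columns_to_keep contract: None keeps the column
# unconditionally, a callable keeps only rows for which it returns True. The
# file names, column names, and predicate below are illustrative.
#
#     with open('map.txt') as in_fp, open('map.filtered.txt', 'w') as out_fp:
#         filter_mapping_file(in_fp, out_fp,
#                             {'AGE': None,
#                              'BODY_SITE': lambda v: v == 'UBERON:feces'})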


def construct_svg_smash_commands(files, ids, cmd_format, cmd_args):
    """Format the SVG smashing commands

    files : list of files
    ids : set of ids
    cmd_format : a string to format
    cmd_args : a dict of strings that can get filled into cmd_format
    """
    commands = []
    for f in files:
        if not f.startswith('Figure'):
            continue
        prefix, remainder = f.split('.', 1)
        try:
            id_, remainder = remainder.rsplit('_', 1)
        except ValueError:
            # GLOBAL SVG for each figure
            assert remainder == 'GLOBAL'
            continue
        # ignore svgs for non-AG points
        if id_ not in ids:
            continue
        args = cmd_args.copy()
        args['sample_id'] = id_
        args['prefix'] = prefix
        commands.append(cmd_format % args)
    return commands
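
# Sketch of the cmd_format / cmd_args contract (the script name and the
# 'output_dir' key are hypothetical): cmd_format is %-formatted with cmd_args
# plus the 'sample_id' and 'prefix' keys that this function fills in per file.
#
#     fmt = ("svg_smash.py --prefix %(prefix)s --sample-id %(sample_id)s "
#            "--output %(output_dir)s")
#     cmds = construct_svg_smash_commands(os.listdir('figures'), ag_ids, fmt,
#                                         {'output_dir': 'smashed'})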


def chunk_list(items, chunk_size=25):
    """Chunk up a list of items"""
    start = 0
    for end in range(chunk_size, len(items) + chunk_size, chunk_size):
        chunk = items[start:end]
        start = end
        yield chunk
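
# For instance (illustrative values only), chunk_list yields successive slices
# of at most chunk_size items:
#
#     list(chunk_list(['a', 'b', 'c', 'd', 'e'], chunk_size=2))
#     # -> [['a', 'b'], ['c', 'd'], ['e']]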


def construct_phyla_plots_cmds(sample_ids, cmd_format, cmd_args):
    """Construct the phyla plots commands"""
    commands = []
    for chunk in chunk_list(sample_ids):
        args = cmd_args.copy()
        args['samples'] = ','.join(chunk)
        commands.append(cmd_format % args)
    return commands


def per_sample_taxa_summaries(open_table, output_format):
    """Write out per-sample taxonomy summaries

    open_table : an open file-like biom table
    output_format : a path that supports a string format, eg:
        foo/bar_%s.txt
    """
    t = parse_biom_table(open_table)
    header = "#taxon\trelative_abundance\n"
    for v, id_, md in t.iter():
        with open(output_format % id_, 'w') as f:
            f.write(header)
            for sorted_v, taxa in \
                    sorted(zip(v, t.ids(axis='observation')))[::-1]:
                if sorted_v:
                    f.write("%s\t%f\n" % (taxa, sorted_v))


class MissingFigure(Exception):
    pass


def bootstrap_result(rel_existing_path, static_paths, base_cmd_fmt,
                     to_pdf_fmt, sample_id, name):
    """Stage the results for a single sample

    sample_id : an id
    name : None or str
    rel_existing_path : a function that gets an existing path
    static_paths : a dict of paths
    base_cmd_fmt : base format for the commands to execute
    to_pdf_fmt : base format for the call to construct the latex PDF
    """
    if name is None:
        unidentified = rel_existing_path('unidentified')

        def bootstrap_path(x):
            return os.path.join(unidentified, x)
    else:
        identified = rel_existing_path('identified')

        def bootstrap_path(x):
            return os.path.join(identified, x)
    template_path = rel_existing_path('template_files')
    indiv_dir = bootstrap_path(sample_id)
    pdf_dir = os.path.join(indiv_dir, 'pdfs-gut')

    def tex_path(x):
        return os.path.join(indiv_dir, x)

    def fig_pdf_path(x):
        return os.path.join(pdf_dir, x)

    def template_files_path(x):
        return os.path.join(template_path, x)

    fig1_src = template_files_path("Figure_1.%s_huge.pdf" % sample_id)
    fig2_src = template_files_path("Figure_2.%s_huge.pdf" % sample_id)
    fig3_src = template_files_path("Figure_3.%s_huge.pdf" % sample_id)
    fig4_src = template_files_path("Figure_4_%s.pdf" % sample_id)
    fig6_src = template_files_path("Figure_6_%s.txt" % sample_id)
    macros_src = template_files_path("macros_%s.tex" % sample_id)
    fig1_dst = fig_pdf_path("figure1.pdf")
    fig2_dst = fig_pdf_path("figure2.pdf")
    fig3_dst = fig_pdf_path("figure3.pdf")
    fig4_dst = fig_pdf_path("figure4.pdf")
    fig6_dst = tex_path("%s_taxa.txt" % sample_id)
    macros_dst = tex_path("macros_gut.tex")
    template_dst = tex_path('%s.tex' % sample_id)
    check_file(fig1_src, e=MissingFigure)
    check_file(fig2_src, e=MissingFigure)
    check_file(fig3_src, e=MissingFigure)
    check_file(fig4_src, e=MissingFigure)
    check_file(fig6_src, e=MissingFigure)
    check_file(macros_src, e=MissingFigure)
    cmds = []
    cmds.append('mkdir -p %s' % pdf_dir)
    cmds.append('cp %s %s' % (fig1_src, fig1_dst))
    cmds.append('cp %s %s' % (fig2_src, fig2_dst))
    cmds.append('cp %s %s' % (fig3_src, fig3_dst))
    cmds.append('cp %s %s' % (fig4_src, fig4_dst))
    cmds.append('cp %s %s' % (fig6_src, fig6_dst))
    cmds.append('cp %s %s' % (macros_src, macros_dst))
    cmds.append('cp %s %s' % (static_paths['template'], template_dst))
    cmds.append('cp %s %s' % (static_paths['aglogo'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig1_legend'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig2_legend'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig2_2ndlegend'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig3_legend'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig4_overlay'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig1_ovals'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['fig2_ovals'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['ball_legend'], pdf_dir))
    cmds.append('cp %s %s' % (static_paths['title'], pdf_dir))
    # append the \yourname LaTeX macro (the participant's name) to the copied
    # macros file
    name_fmt = "echo '\n\\def\\yourname{%s}\n' >> %s"
    if name is None:
        cmds.append(name_fmt % ("unidentified", macros_dst))
    else:
        cmds.append(name_fmt % (name, macros_dst))
    indiv_cmd = base_cmd_fmt % (static_paths['working_dir'], '; '.join(cmds))
    latex_cmd = to_pdf_fmt % {'path': indiv_dir, 'input': template_dst}
    return (indiv_cmd, latex_cmd)


def construct_bootstrap_and_latex_commands(ids, participants,
                                           rel_existing_path,
                                           static_paths, base_cmd_fmt,
                                           to_pdf_fmt):
    """Construct the commands to bootstrap results and latex generation

    ids : an iterable of ids
    participants : None or a dict mapping barcodes to participant names
    rel_existing_path : a function that gets an existing path
    static_paths : a dict of paths
    base_cmd_fmt : base format for the commands to execute
    to_pdf_fmt : base format for the call to construct the latex PDF
    """
    bs_f = partial(bootstrap_result, rel_existing_path, static_paths,
                   base_cmd_fmt, to_pdf_fmt)
    indiv_cmds = []
    latex_cmds = []
    missing = []
    for sample_id in ids:
        name = None
        if participants is not None:
            bc = sample_id.split('.')[0]
            if bc in participants:
                name = participants[bc]
        # unidentified
        try:
            indiv_cmd, latex_cmd = bs_f(sample_id, None)
        except MissingFigure:
            missing.append(sample_id)
            continue
        indiv_cmds.append(indiv_cmd)
        latex_cmds.append(latex_cmd)
        # identified
        if name:
            indiv_cmd, latex_cmd = bs_f(sample_id, name)
            indiv_cmds.append(indiv_cmd)
            latex_cmds.append(latex_cmd)
    return (indiv_cmds, latex_cmds, missing)


def harvest(path):
    """harvest PDFs and taxa summaries"""
    harvest_path = os.path.join(path, 'harvested')
    if not os.path.exists(harvest_path):
        os.mkdir(harvest_path)
    for name in os.listdir(path):
        if 'harvested' in name:
            continue
        if 'pdfs' in name:
            continue
        try:
            float(name)  # should work for 000001111.123123 and 000001111
        except ValueError:
            continue
        dst_name = name.split('.')[0] if '.' in name else name
        src = os.path.join(path, name, "%s.pdf" % name)
        dst = os.path.join(harvest_path, "%s.pdf" % dst_name)
        if os.path.exists(src):
            yield "mv %s %s" % (src, dst)
        src = os.path.join(path, name, "%s_taxa.txt" % name)
        dst = os.path.join(harvest_path, "%s.txt" % dst_name)
        if os.path.exists(src):
            yield "mv %s %s" % (src, dst)


def pdf_smash(path, tag, pdf_smash_fmt, n_per_result=30,
              previously_printed=None):
    """Combine sets of PDFs into single documents

    path : a path to where the PDFs are
    tag : some tag to put on the file names
    pdf_smash_fmt : command format to use
    n_per_result : number of PDFs to smash together
    previously_printed : set of previously printed barcodes or None
    """
    if previously_printed is None:
        previously_printed = set([])
    files = []
    for f in os.listdir(path):
        bc, extension = os.path.splitext(f)
        if extension != '.pdf':
            continue
        if bc in previously_printed:
            continue
        files.append(os.path.join(path, f))
    result_path = os.path.join(path, 'pdf_smash')
    if not os.path.exists(result_path):
        os.mkdir(result_path)

    # order the files by barcode; the key strips the leading path and anything
    # after the first '.'
    def sort_key(x):
        return int(x.rsplit('/')[-1].split('.')[0])

    files_ordered = sorted(files, key=sort_key)
    smash_set = []
    barcode_set = []

    def bc_f(ch):
        return '\n'.join([f.rsplit('/')[-1].split('.')[0] for f in ch])

    for chunk in chunk_list(files_ordered, n_per_result):
        smash_set.append(' '.join(chunk))
        barcode_set.append(bc_f(chunk))
    smash_cmds = []
    smash_basename = os.path.join(result_path, "%s_smashset_%d")
    for set_number, (pdfs, barcodes) in enumerate(zip(smash_set, barcode_set)):
        filename_base = smash_basename % (tag, set_number)
        filename_pdf = filename_base + '.pdf'
        filename_txt = filename_base + '.txt'
        with open(filename_txt, 'w') as f:
            f.write(barcodes)
            f.write('\n')
        smash_cmds.append(pdf_smash_fmt % {'output': filename_pdf,
                                           'pdfs': pdfs})
    ordered_barcodes_path = os.path.join(result_path, 'ordered_barcodes.txt')
    with open(ordered_barcodes_path, 'w') as ordered_barcodes:
        ordered_barcodes.write('\n'.join(barcode_set))
        ordered_barcodes.write('\n')
    return smash_cmds
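
# pdf_smash_fmt is %-formatted with the 'output' and 'pdfs' keys. The
# ghostscript invocation below is just one way to do the merging and is not
# dictated by this module:
#
#     fmt = ("gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite "
#            "-sOutputFile=%(output)s %(pdfs)s")
#     cmds = pdf_smash('harvested', 'run1', fmt)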


def count_unique_sequences_per_otu(otu_ids, otu_map_file, input_seqs_file):
    """Counts unique sequences per-OTU for a given set of OTUs

    otu_ids: a set of OTU IDs
    otu_map_file: file-like object in the format of an OTU map
    input_seqs_file: FASTA containing sequences that were used to generate
        the otu_map_file

    Returns a nested dict structure: {otu_id: {sequence: count}}
    """
    # This will hold the OTU map for the OTUs in otu_ids
    otu_map = {x: set() for x in otu_ids}
    # go through the otu map and save the lines of interest to the otu_map
    # data structure above
    print "Reading OTU map..."
    for line in otu_map_file:
        otu_id, seq_ids = line.strip().split('\t', 1)
        if otu_id in otu_ids:
            otu_map[otu_id] = set(seq_ids.split('\t'))
    # this will hold, for each OTU in otus, counts of each unique sequence
    # observed in that OTU
    unique_counts = {x: defaultdict(int) for x in otu_ids}
    # go through input fasta file TWO LINES AT A TIME, counting unique
    # sequences in each OTU of interest
    print "Reading FASTA file and counting unique sequences..."
    for header, sequence in izip(input_seqs_file, input_seqs_file):
        header = header.strip()
        sequence = sequence.strip()
        seq_id = header.split(' ', 1)[0][1:]
        for otu_id in otu_ids:
            if seq_id in otu_map[otu_id]:
                unique_counts[otu_id][sequence] += 1
                break
    return unique_counts


def write_bloom_fasta(unique_counts, output_file, abundance_threshold):
    """Writes a FASTA file of sequences determined to be the result of a bloom

    If a single unique sequence composes more than abundance_threshold of an
    OTU, that sequence is marked as a bloom sequence and written to
    output_file.

    unique_counts: a nested dict of the form {otu_id: {sequence: count}}
        E.g., the output of count_unique_sequences_per_otu
    output_file: a file-like object ready for writing
    abundance_threshold: if a sequence composes more than this fraction of an
        OTU, it is marked as a bloom sequence
    """
    for otu_id, otu_counts in unique_counts.iteritems():
        otu_total_count = sum([count for seq, count in otu_counts.iteritems()])
        counter = 0
        for seq, count in sorted(otu_counts.items(), key=lambda x: x[1],
                                 reverse=True):
            counter += 1
            if 1.0 * count / otu_total_count > abundance_threshold:
                output_file.write('>%s_%d\n%s\n' % (otu_id, counter, seq))
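
# The two functions above are intended to be used together. A hedged
# end-to-end sketch (the file names, OTU id, and 0.95 threshold are
# illustrative); note the FASTA is read two lines at a time, so sequences
# must not be line-wrapped:
#
#     with open('seqs_otus.txt') as otu_map, open('seqs.fna') as seqs:
#         counts = count_unique_sequences_per_otu({'4479989'}, otu_map, seqs)
#     with open('BLOOM.fasta', 'w') as out:
#         write_bloom_fasta(counts, out, 0.95)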