This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
filter.py
571 lines (491 loc) · 24.8 KB
/
filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
#!/usr/bin/env python
# File created on 18 May 2010
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Greg Caporaso", "Will Van Treuren", "Daniel McDonald",
"Jai Ram Rideout", "Yoshiki Vazquez Baeza"]
__license__ = "GPL"
__version__ = "1.8.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
from collections import defaultdict
from random import shuffle, sample
from numpy import array, inf
from cogent.parse.fasta import MinimalFastaParser
from qiime.parse import parse_distmat, parse_mapping_file, parse_metadata_state_descriptions
from qiime.format import format_otu_table, format_distance_matrix, format_mapping_file
from qiime.util import MetadataMap
from biom.parse import parse_biom_table
def get_otu_ids_from_taxonomy_f(positive_taxa=None,
negative_taxa=None,
metadata_field="taxonomy"):
""" return function to pass to Table.filterObservations for taxon-based filtering
positive_taxa : a list of strings that will be compared to each
taxonomy level in an observation's (i.e., OTU's) metadata_field. If
one of the levels matches exactly (except for case) to an item in
positive_taxa, that OTU will be marked for retention. Default: All
OTUs are retained.
negative_taxa : a list of strings that will be compared to each
taxonomy level in an observation's (i.e., OTU's) metadata_field. If
one of the levels matches exactly (except for case) to an item in
negative_taxa, that OTU will be marked for removal. Default: All
OTUs are retained.
metadata_field : the metadata field to look up in the
observation metadata
Note: string matches are case insensitive.
"""
# define a positive screening function - if the user doesn't pass
# positive_taxa, all OTUs will pass this filter
# (i.e., be marked for retention)
if positive_taxa == None:
positive_taxa = set()
def positive_screen(e):
return True
else:
positive_taxa = set([t.strip().lower() for t in positive_taxa])
def positive_screen(e):
return e in positive_taxa
# define a negative screening function - if the user doesn't pass
# negative_taxa, all OTUs will pass this filter
# (i.e., be marked for retention)
if negative_taxa == None:
negative_taxa = set()
def negative_screen(e):
return False
else:
negative_taxa = set([t.strip().lower() for t in negative_taxa])
def negative_screen(e):
return e in negative_taxa
# The positive_taxa and negative_taxa lists must be mutually exclusive.
if len(positive_taxa & negative_taxa) != 0:
raise ValueError, \
("Your positive and negative taxa lists contain overlapping values. "
"These lists must be mutually exclusive.\n"
"Offending values are: %s" % ' '.join(positive_taxa & negative_taxa))
# Define the function that can be passed to Table.filterObservations
def result(v,oid,md):
positive_hit = False
negative_hit = False
for e in md[metadata_field]:
if positive_screen(e.strip().lower()):
# Note that we don't want to just do
# positive_hit = positive_screen(e.strip())
# we're checking whether any e hits the positive taxa
# and doing that be the same as
# positive_hit = md[metadata_field][-1]
positive_hit = True
if negative_screen(e.strip().lower()):
# see note in previous if statement for why we don't use
# negative_hit = negative_screen(e.strip())
negative_hit = True
return positive_hit and not negative_hit
return result
def sample_ids_from_metadata_description(mapping_f,valid_states_str):
""" Given a description of metadata, return the corresponding sample ids
"""
map_data, map_header, map_comments = parse_mapping_file(mapping_f)
valid_states = parse_metadata_state_descriptions(valid_states_str)
sample_ids = get_sample_ids(map_data, map_header, valid_states)
if len(sample_ids)<1:
raise ValueError,"All samples have been filtered out for the criteria"+\
" described in the valid states"
return sample_ids
def get_sample_ids(map_data, map_header, states):
"""Takes col states in {col:[vals]} format.
If val starts with !, exclude rather than include.
Combines cols with and, states with or.
For example, Study:Dog,Hand will return rows where Study is Dog or Hand;
Study:Dog,Hand;BodySite:Palm,Stool will return rows where Study is Dog
or Hand _and_ BodySite is Palm or Stool; Study:*,!Dog;BodySite:*,!Stool
will return all rows except the ones where the Study is Dog or the BodySite
is Stool.
"""
name_to_col = dict([(s,map_header.index(s)) for s in states])
good_ids = []
for row in map_data: #remember to exclude header
include = True
for s, vals in states.items():
curr_state = row[name_to_col[s]]
include = include and (curr_state in vals or '*' in vals) \
and not '!'+curr_state in vals
if include:
good_ids.append(row[0])
return good_ids
def sample_ids_from_category_state_coverage(mapping_f,
coverage_category,
subject_category,
min_num_states=None,
required_states=None,
considered_states=None,
splitter_category=None):
"""Filter sample IDs based on subject's coverage of a category.
Given a category that groups samples by subject (subject_category), samples
are filtered by how well a subject covers (i.e. has at least one sample
for) the category states in coverage_category.
Two filtering criteria are provided (min_num_states and required_states). At
least one must be provided. If both are provided, the subject must meet
both criteria to pass the filter (i.e. providing both filters is an AND,
not an OR, operation).
A common use case is to provide a 'time' category for coverage_category and
an 'individual' category for subject_category in order to filter out
individuals from a study that do not have samples for some minimum number
of timepoints (min_num_states) and that do not have samples for certain
timepoints (required_states). For example, this could be the first and last
timepoints in the study.
Returns a set of sample IDs to keep, the number of subjects that were
kept, and a set of the unique category states in coverage_category that
were kept. The set of sample IDs is not guaranteed to be in any specific
order relative to the order of sample IDs or subjects in the mapping file.
Arguments:
mapping_f - metadata mapping file (file-like object)
coverage_category - category to test subjects' coverage (string)
subject_category - category to group samples by subject (string)
min_num_states - minimum number of category states in coverage_category
that a subject must cover (i.e. have at least one sample for) to be
included in results (integer)
required_states - category states in coverage_category that must be
covered by a subject's samples in order to be included in results
(list of strings or items that can be converted to strings)
considered_states - category states that are counted toward the
min_num_states (list of strings or items that can be converted to
strings)
splitter_category - category to split input mapping file on prior to
processing. If not supplied, the mapping file will not be split. If
supplied, a dictionary mapping splitter_category state to results
will be returned instead of the three-element tuple. The supplied
filtering criteria will apply to each split piece of the mapping
file independently (e.g. if an individual passes the filters for
the tongue samples, his/her tongue samples will be included for
the tongue results, even if he/she doesn't pass the filters for the
palm samples)
"""
metadata_map = MetadataMap.parseMetadataMap(mapping_f)
# Make sure our input looks sane.
categories_to_test = [coverage_category, subject_category]
if splitter_category is not None:
categories_to_test.append(splitter_category)
if 'SampleID' in categories_to_test:
raise ValueError("The 'SampleID' category is not suitable for use in "
"this function. Please choose a different category "
"from the metadata mapping file.")
for category in categories_to_test:
if category not in metadata_map.CategoryNames:
raise ValueError("The category '%s' is not in the metadata "
"mapping file." % category)
if len(set(categories_to_test)) < len(categories_to_test):
raise ValueError("The coverage, subject, and (optional) splitter "
"categories must all be unique.")
if required_states is not None:
# required_states must be in coverage_category's states in the mapping
# file.
required_states = set(map(str,required_states))
valid_coverage_states = set(metadata_map.getCategoryValues(
metadata_map.SampleIds, coverage_category))
invalid_coverage_states = required_states - valid_coverage_states
if invalid_coverage_states:
raise ValueError("The category state(s) '%s' are not in the '%s' "
"category in the metadata mapping file." %
(', '.join(invalid_coverage_states),
coverage_category))
if considered_states is not None:
# considered_states is not as restrictive as required_states - we don't
# require that these are present, so it's OK if some of the states
# listed here don't actually show up in the mapping file (allowing
# the user to pass something like range(100) to consider only states
# that fall in some range)
considered_states = set(map(str,considered_states))
# define a function to determine if a state should be considered
consider_state = lambda s: s in considered_states
else:
# define a dummy function to consider all states (the default
# if the user does not provide a list of considered_states)
consider_state = lambda s: True
if min_num_states is None and required_states is None:
raise ValueError("You must specify either the minimum number of "
"category states the subject must have samples for "
"(min_num_states), or the minimal category states "
"the subject must have samples for (required_states), "
"or both. Supplying neither filtering criteria is "
"not supported.")
if splitter_category is None:
results = _filter_sample_ids_from_category_state_coverage(
metadata_map, metadata_map.SampleIds, coverage_category,
subject_category, consider_state, min_num_states,
required_states)
else:
# "Split" the metadata mapping file by extracting only sample IDs that
# match the current splitter category state and using those for the
# actual filtering.
splitter_category_states = defaultdict(list)
for samp_id in metadata_map.SampleIds:
splitter_category_state = \
metadata_map.getCategoryValue(samp_id, splitter_category)
splitter_category_states[splitter_category_state].append(samp_id)
results = {}
for splitter_category_state, sample_ids in \
splitter_category_states.items():
results[splitter_category_state] = \
_filter_sample_ids_from_category_state_coverage(
metadata_map, sample_ids, coverage_category,
subject_category, consider_state, min_num_states,
required_states)
return results
def _filter_sample_ids_from_category_state_coverage(metadata_map,
sample_ids,
coverage_category,
subject_category,
consider_state_fn,
min_num_states=None,
required_states=None):
"""Helper function to perform filtering based on category state coverage.
Not explicitly unit-tested because it is implicitly tested by
sample_ids_from_category_state_coverage's unit tests.
"""
# Build mapping from subject to sample IDs.
subjects = defaultdict(list)
for samp_id in sample_ids:
subject = metadata_map.getCategoryValue(samp_id, subject_category)
subjects[subject].append(samp_id)
# Perform filtering.
samp_ids_to_keep = []
num_subjects_kept = 0
states_kept = []
for subject, samp_ids in subjects.items():
subject_covered_states = set(
metadata_map.getCategoryValues(samp_ids, coverage_category))
# Short-circuit evaluation of ANDing filters.
keep_subject = True
if min_num_states is not None:
# note: when summing a list of boolean values, True == 1 and
# False == 0
if sum([consider_state_fn(s) for s in subject_covered_states]) < \
min_num_states:
keep_subject = False
if keep_subject and required_states is not None:
if len(subject_covered_states & required_states) != \
len(required_states):
keep_subject = False
if keep_subject:
samp_ids_to_keep.extend(samp_ids)
states_kept.extend(subject_covered_states)
num_subjects_kept += 1
return set(samp_ids_to_keep), num_subjects_kept, set(states_kept)
def filter_fasta(input_seqs,output_seqs_f,seqs_to_keep,negate=False):
""" Write filtered input_seqs to output_seqs_f which contains only seqs_to_keep
input_seqs can be the output of MinimalFastaParser or MinimalFastqParser
"""
seqs_to_keep_lookup = {}.fromkeys([seq_id.split()[0]
for seq_id in seqs_to_keep])
# Define a function based on the value of negate
if not negate:
def keep_seq(seq_id):
return seq_id.split()[0] in seqs_to_keep_lookup
else:
def keep_seq(seq_id):
return seq_id.split()[0] not in seqs_to_keep_lookup
for seq_id,seq in input_seqs:
if keep_seq(seq_id):
output_seqs_f.write('>%s\n%s\n' % (seq_id, seq))
output_seqs_f.close()
def filter_fastq(input_seqs,output_seqs_f,seqs_to_keep,negate=False):
""" Write filtered input_seqs to output_seqs_f which contains only seqs_to_keep
input_seqs can be the output of MinimalFastaParser or MinimalFastqParser
"""
seqs_to_keep_lookup = {}.fromkeys([seq_id.split()[0]
for seq_id in seqs_to_keep])
# Define a function based on the value of negate
if not negate:
def keep_seq(seq_id):
return seq_id.split()[0] in seqs_to_keep_lookup
else:
def keep_seq(seq_id):
return seq_id.split()[0] not in seqs_to_keep_lookup
for seq_id,seq,qual in input_seqs:
if keep_seq(seq_id):
output_seqs_f.write('@%s\n%s\n+\n%s\n' % (seq_id, seq, qual))
output_seqs_f.close()
def filter_mapping_file(map_data, map_header, good_sample_ids,
include_repeat_cols=False, column_rename_ids=None):
"""Filters map according to several criteria.
- keep only sample ids in good_sample_ids
- drop cols that are different in every sample (except id)
- drop cols that are the same in every sample
"""
# keeping samples
to_keep = []
to_keep.extend([i for i in map_data if i[0] in good_sample_ids])
# keeping columns
headers = []
to_keep = zip(*to_keep)
headers.append(map_header[0])
result = [to_keep[0]]
if column_rename_ids:
# reduce in 1 as we are not using the first colum (SampleID)
column_rename_ids = column_rename_ids-1
for i,l in enumerate(to_keep[1:-1]):
if i==column_rename_ids:
if len(set(l))!=len(result[0]):
raise ValueError, "The column to rename the samples is not unique."
result.append(result[0])
result[0] = l
headers.append('SampleID_was_' + map_header[i+1])
elif include_repeat_cols or len(set(l))>1:
headers.append(map_header[i+1])
result.append(l)
else:
for i,l in enumerate(to_keep[1:-1]):
if include_repeat_cols or len(set(l))>1:
headers.append(map_header[i+1])
result.append(l)
headers.append(map_header[-1])
result.append(to_keep[-1])
result = map(list,zip(*result))
return headers, result
def filter_mapping_file_from_mapping_f(mapping_f,sample_ids_to_keep,negate=False):
""" Filter rows from a metadata mapping file """
mapping_data, header, comments = parse_mapping_file(mapping_f)
filtered_mapping_data = []
sample_ids_to_keep = {}.fromkeys(sample_ids_to_keep)
for mapping_datum in mapping_data:
hit = mapping_datum[0] in sample_ids_to_keep
if hit and not negate:
filtered_mapping_data.append(mapping_datum)
elif not hit and negate:
filtered_mapping_data.append(mapping_datum)
else:
pass
return format_mapping_file(header,filtered_mapping_data)
def filter_mapping_file_by_metadata_states(mapping_f,valid_states_str):
sample_ids_to_keep = sample_ids_from_metadata_description(mapping_f,valid_states_str)
mapping_f.seek(0)
return filter_mapping_file_from_mapping_f(mapping_f,sample_ids_to_keep)
def filter_samples_from_distance_matrix(dm,samples_to_discard,negate=False):
""" Remove specified samples from distance matrix
dm: (sample_ids, dm_data) tuple, as returned from
qiime.parse.parse_distmat; or a file handle that can be passed
to qiime.parse.parse_distmat
"""
try:
sample_ids, dm_data = dm
except ValueError:
# input was provide as a file handle
sample_ids, dm_data = parse_distmat(dm)
sample_lookup = {}.fromkeys([e.split()[0] for e in samples_to_discard])
temp_dm_data = []
new_dm_data = []
new_sample_ids = []
if negate:
def keep_sample(s):
return s in sample_lookup
else:
def keep_sample(s):
return s not in sample_lookup
for row,sample_id in zip(dm_data,sample_ids):
if keep_sample(sample_id):
temp_dm_data.append(row)
new_sample_ids.append(sample_id)
temp_dm_data = array(temp_dm_data).transpose()
for col,sample_id in zip(temp_dm_data,sample_ids):
if keep_sample(sample_id):
new_dm_data.append(col)
new_dm_data = array(new_dm_data).transpose()
return format_distance_matrix(new_sample_ids, new_dm_data)
def negate_tips_to_keep(tips_to_keep, tree):
""" Return the list of tips in the tree that are not in tips_to_keep"""
tips_to_keep = set(tips_to_keep)
# trees can return node names in ways that have multiple quotes, e.g.
# '"node_1"' or ''node_1''. remove them or it can cause problems with
# tips_to_keep not matching
tmp_tips = set([tip.Name for tip in tree.tips()])
tips = set([t.strip('\'').strip('\"') for t in tmp_tips])
return tips - tips_to_keep
def get_seqs_to_keep_lookup_from_biom(biom_f):
otu_table = parse_biom_table(biom_f)
return {}.fromkeys(otu_table.ObservationIds)
def get_seqs_to_keep_lookup_from_seq_id_file(id_to_keep_f):
"""generate a lookup dict of chimeras in chimera file."""
return set([l.split()[0].strip() for l in id_to_keep_f if not l.startswith('#') and l])
get_seq_ids_from_seq_id_file = get_seqs_to_keep_lookup_from_seq_id_file
def get_seqs_to_keep_lookup_from_fasta_file(fasta_f):
"""return the sequence ids within the fasta file"""
return set([seq_id.split()[0] for seq_id,seq in MinimalFastaParser(fasta_f)])
get_seq_ids_from_fasta_file = get_seqs_to_keep_lookup_from_fasta_file
# start functions used by filter_samples_from_otu_table.py and filter_otus_from_otu_table.py
def get_filter_function(ids_to_keep,min_count,max_count,min_nonzero,max_nonzero,negate_ids_to_keep=False):
if negate_ids_to_keep:
def f(data_vector, id_, metadata):
return (id_ not in ids_to_keep) and \
(min_count <= data_vector.sum() <= max_count) and \
(min_nonzero <= (data_vector > 0).sum() <= max_nonzero)
else:
def f(data_vector, id_, metadata):
return (id_ in ids_to_keep) and \
(min_count <= data_vector.sum() <= max_count) and \
(min_nonzero <= (data_vector > 0).sum() <= max_nonzero)
return f
def filter_samples_from_otu_table(otu_table,ids_to_keep,min_count,max_count):
filter_f = get_filter_function({}.fromkeys(ids_to_keep),
min_count,
max_count,
0,inf)
return otu_table.filterSamples(filter_f)
def filter_otus_from_otu_table(otu_table,ids_to_keep,min_count,max_count,
min_samples,max_samples,negate_ids_to_keep=False):
filter_f = get_filter_function({}.fromkeys(ids_to_keep),
min_count,
max_count,
min_samples,max_samples,
negate_ids_to_keep)
return otu_table.filterObservations(filter_f)
# end functions used by filter_samples_from_otu_table.py and filter_otus_from_otu_table.py
def filter_otu_table_to_n_samples(otu_table,n):
""" Filter OTU table to n random samples.
If n is greater than the number of samples or less than zero a
ValueError will be raised.
"""
try:
ids_to_keep = sample(otu_table.SampleIds,n)
except ValueError:
raise ValueError,\
"Number of samples to filter must be between 0 and the number of samples."
return filter_samples_from_otu_table(otu_table, ids_to_keep, 0, inf)
def filter_otus_from_otu_map(input_otu_map_fp,
output_otu_map_fp,
min_count,
min_sample_count=1):
""" Filter otus with fewer than min_count sequences from input_otu_map_fp
With very large data sets the number of singletons can be very large, and it becomes
more efficent to filter them at the otu map stage than the otu table stage.
There are two outputs from this function: the output file (which is the
filtered otu map) and the list of retained otu ids as a set. Since I need
to return the retained ids for pick_open_reference_otus,
this takes filepaths instead of file handles (since it can't be a generator
and return something).
"""
results = set()
output_otu_map_f = open(output_otu_map_fp,'w')
for line in open(input_otu_map_fp,'U'):
fields = line.strip().split('\t')
sample_ids = set([e.split('_')[0] for e in fields[1:]])
# only write this line if the otu has more than n sequences (so
# greater than n tab-separated fields including the otu identifier)
if (len(fields) > min_count) and (len(sample_ids) >= min_sample_count):
output_otu_map_f.write(line)
results.add(fields[0].split('\t')[0])
output_otu_map_f.close()
return results
def filter_tree(tree,tips_to_keep):
result = tree.copy()
## don't use this, it doesn't eliminate tips!
# result = tree.getSubTree(tips_to_keep,ignore_missing=True)
def f(node):
if node.istip() and\
node.Name != None and\
node.Name not in tips_to_keep and\
node.Name.strip().strip('"').strip("'") not in tips_to_keep:
return True
return False
result.removeDeleted(f)
result.prune()
return result