This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
filter_by_metadata.py
144 lines (124 loc) · 5.69 KB
/
filter_by_metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
#filter_by_metadata: reads otu table and map, returns only allowed states
from string import strip
from sys import argv, stdout, stderr
from numpy import array
from StringIO import StringIO
from qiime.parse import (parse_mapping_file,
parse_metadata_state_descriptions)
__author__ = "Rob Knight"
__copyright__ = "Copyright 2010, The QIIME Project"
__credits__ = ["Rob Knight", "Antonio Gonzalez Pena"] #remember to add yourself if you make changes
__license__ = "GPL"
__version__ = "1.2.0"
__maintainer__ = "Tony Walters"
__email__ = "William.A.Walters@colorado.edu"
__status__ = "Release"
"""This filter allows for the removal of sequences and OTUs that either do or don't match specified
metadata, for instance, isolating samples from a specific set of studies or body sites. This script
identifies samples matching the specified metadata criteria, and outputs a filtered mapping file
and OTU table containing only the specified samples."""
def get_sample_ids(map_data, map_header, states):
    """Return ids of samples whose metadata matches the given states.

    states maps column name -> list of allowed values ({col:[vals]} format).
    A value of '*' matches any state; a value starting with '!' excludes
    rows whose state equals the remainder. Values within a column combine
    with OR; different columns combine with AND.

    For example, Study:Dog,Hand will return rows where Study is Dog or Hand;
    Study:Dog,Hand;BodySite:Palm,Stool will return rows where Study is Dog
    or Hand _and_ BodySite is Palm or Stool; Study:*,!Dog;BodySite:*,!Stool
    will return all rows except the ones where the Study is Dog or the
    BodySite is Stool.
    """
    col_index = dict([(name, map_header.index(name)) for name in states])
    matching_ids = []
    for row in map_data:  # caller must pass data rows only, not the header
        keep = True
        for name, allowed in states.items():
            state = row[col_index[name]]
            if '!' + state in allowed:
                keep = False
            elif state not in allowed and '*' not in allowed:
                keep = False
        if keep:
            matching_ids.append(row[0])  # first col is the sample id
    return matching_ids
def find_good_cols(header_line, good_ids):
    """Return column indices in the OTU-table header for the good sample ids.

    header_line: tab-delimited OTU-table header line of sample ids.
    good_ids: sample ids to keep; ids absent from the header are skipped.

    Always includes index 0 (the OTU id); appends -1 if the last column is
    'Consensus Lineage' (taxonomy).
    """
    # List comprehension instead of map(strip, ...): string.strip is
    # Python 2 only, and on Python 3 map() returns an iterator, which
    # would break fields[-1] below. Behavior is otherwise identical.
    fields = [f.strip() for f in header_line.split('\t')]
    result = [fields.index(i) for i in good_ids if i in fields]
    # include consensus lineage if present
    if fields[-1] == 'Consensus Lineage':
        result.append(-1)
    return [0] + result  # always include first field with id
def filter_line(line, good_fields, min_count=None, outfile=stdout):
    """Write a filtered OTU-table line, keeping only the good sample columns.

    line: line of the OTU table.
    good_fields: list of field indices to include (as from find_good_cols).
    min_count: minimum total count of the OTU to accept; if None, no count
        check is done (use this to pass header lines through).
    outfile: file handle to write the result into (we stream rather than
        keep results in memory because OTU tables are often very large).

    Lines below min_count are silently dropped (nothing is written).
    """
    # The numeric counts exclude the leading OTU id and, when present,
    # the trailing taxonomy column (marked by a -1 index).
    if good_fields[-1] == -1:  # includes taxonomy
        num_slice = slice(1, -1)
    else:
        num_slice = slice(1, None)
    # List comprehension instead of map(strip, ...): works on Python 3 too,
    # where string.strip is gone and map() returns a non-indexable iterator.
    fields = [f.strip() for f in line.split('\t')]
    result = [fields[i] for i in good_fields]
    if min_count is not None:
        # builtin sum replaces the numpy array round-trip; same total
        if sum(float(x) for x in result[num_slice]) < min_count:
            return
    # reuse result instead of re-indexing fields a second time
    outfile.write('\t'.join(result) + '\n')
def filter_map(map_data, map_header, good_sample_ids):
    """Filter mapping data down to the good samples and informative columns.

    - keep only rows whose sample id (first column) is in good_sample_ids,
      in the order the ids appear in good_sample_ids
    - drop cols that are different in every sample (e.g. per-sample fields)
    - drop cols that are the same in every sample (uninformative)
    The sample id (first) and description (last) columns are always kept.

    Returns (header, data) where data is a list of row lists.
    """
    d = array(map_data)  # note, rows still carry the sample id in column 0
    first_col = list(d[:, 0])  # assume contains sample ids
    good_row_indices = [first_col.index(i) for i in good_sample_ids
                        if i in first_col]
    d = d[good_row_indices]
    cols = d.T
    # Determine the good column indices
    # always include the sample id col
    good_col_indices = [0]
    # for the cols between the first and last, keep a col only if it has
    # at least 2 states but fewer states than there are samples
    good_col_indices += [i + 1 for (i, col) in enumerate(cols[1:-1])
                         if 2 <= len(set(col)) < len(col)]
    # always include the description col
    good_col_indices += [len(cols) - 1]
    h = [map_header[i] for i in good_col_indices]
    d = d[:, good_col_indices]
    # explicit list comprehension so callers get a real list on Python 3
    # as well (map() there returns an iterator, breaking list concatenation)
    return h, [list(row) for row in d]
def filter_otus_and_map(map_infile, otu_infile, map_outfile, otu_outfile,
                        valid_states_str, num_seqs_per_otu):
    """Filter OTU table and mapping file to samples matching valid_states_str.

    map_infile / otu_infile: open input handles (map_infile is closed here).
    map_outfile / otu_outfile: open output handles; closed on completion
        unless they are StringIO objects (left open for inspection).
    valid_states_str: metadata state description, e.g. 'Study:Dog,Hand'.
    num_seqs_per_otu: minimum total count for an OTU row to be kept.
    """
    map_data, map_header, map_comments = parse_mapping_file(map_infile)
    map_infile.close()
    valid_states = parse_metadata_state_descriptions(valid_states_str)
    sample_ids = get_sample_ids(map_data, map_header, valid_states)
    # write out the filtered mapping file
    out_headers, out_data = filter_map(map_data, map_header, sample_ids)
    header_line = '#' + '\t'.join(out_headers)
    # explicit comprehension: list + map() concatenation breaks on Python 3
    map_outfile.write('\n'.join([header_line] +
                                ['\t'.join(row) for row in out_data]))
    if not isinstance(map_outfile, StringIO):
        map_outfile.close()
    # write out the filtered OTU file; cols is captured from the '#OTU ID'
    # header line and reused for every subsequent data line
    for line in otu_infile:
        if line.startswith('#OTU ID'):
            # (dead `fields = map(strip, ...)` removed: it was never used)
            cols = find_good_cols(line, sample_ids)
            filter_line(line, cols, min_count=None, outfile=otu_outfile)
        elif line.startswith('#'):
            otu_outfile.write(line)  # pass other comment lines through as-is
        else:
            filter_line(line, cols, min_count=num_seqs_per_otu,
                        outfile=otu_outfile)
    if not isinstance(otu_outfile, StringIO):
        otu_outfile.close()