This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 269
/
summarize_taxa.py
102 lines (81 loc) · 3.41 KB
/
summarize_taxa.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
__author__ = "Rob Knight"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Rob Knight", "Catherine Lozupone", "Justin Kuczynski","Julia Goodrich", \
"Antonio Gonzalez Pena"]
__license__ = "GPL"
__version__ = "1.3.0"
__maintainer__ = "Daniel McDonald"
__email__ = "wasade@gmail.com"
__status__ = "Release"
"""Contains code for summarizing OTU table with taxa in last field.
"""
from collections import defaultdict
from sys import stdout, stderr
from optparse import OptionParser
from string import strip
from numpy import array
from qiime.parse import parse_otu_table, parse_mapping_file
def make_summary(otu_table, level, upper_percentage, lower_percentage):
    """Returns taxonomy summary data

    otu_table: indexable OTU table; element 0 holds the sample ids and the
        whole table is handed to sum_counts_by_consensus
    level: number of taxonomic ranks to summarize at
    upper_percentage: if not None, taxa whose fraction of the total counts
        is less than this value are left out of the summary
    lower_percentage: if not None, taxa whose fraction of the total counts
        is greater than this value are left out of the summary

    NOTE(review): the comparison directions above are exactly what the code
    has always done; the parameter names may suggest the opposite, so
    confirm against the calling script before changing them.

    Returns (taxonomy_summary, header), in that order:

    header is a list of:
    [(Taxon),sample1,sample2,...]

    taxonomy_summary is a list of lists, sorted by taxon, of:
    [[(taxon1),count,count,...],[(taxon2),count,count,...]...]
    """
    header = ['Taxon']
    header.extend(otu_table[0])  # sample ids

    # Only the per-taxon counts are needed here; the sample map is dropped.
    counts_by_consensus = sum_counts_by_consensus(otu_table, level)[0]
    total_counts = float(sum([sum(i) for i in counts_by_consensus.values()]))

    # The abundance fraction is only computed when a threshold was given, so
    # an unfiltered summary never divides by the total.
    filtering = lower_percentage is not None or upper_percentage is not None

    taxonomy_summary = []
    for consensus, otu_counts in sorted(counts_by_consensus.items()):
        if filtering:
            fraction = otu_counts.sum() / total_counts
            if lower_percentage is not None and fraction > lower_percentage:
                continue
            if upper_percentage is not None and fraction < upper_percentage:
                continue

        new_row = [consensus]
        new_row.extend(otu_counts)
        taxonomy_summary.append(new_row)

    return taxonomy_summary, header
def sum_counts_by_consensus(otu_table, level, missing_name='Other'):
    """Returns a dict keyed by consensus, valued by otu counts

    otu counts are summed together if they have the same consensus

    if the consensus string doesn't reach to level, missing_name is appended on
    until the taxonomy string is of length level; longer lineages are
    truncated to level

    otu_table: indexable table where element 0 is the sample ids, element 2
        is the per-OTU counts and element 3 is the per-OTU lineage (a
        sequence of rank names). Each counts row must support .copy() and
        in-place addition (presumably numpy arrays from parse_otu_table --
        confirm against the caller).
    level: number of ranks each consensus key should contain
    missing_name: filler used for ranks a lineage does not provide

    Returns (result, sample_map): result maps each consensus tuple to the
    summed counts, sample_map maps each sample id to its column index.

    The input table is never modified: lineages are padded/truncated on a
    fresh list and counts are copied before being accumulated.
    """
    result = {}
    sample_map = dict((s, i) for i, s in enumerate(otu_table[0]))

    for counts, consensus in zip(otu_table[2], otu_table[3]):
        # Work on a new list so the caller's lineage is left untouched
        # (extending it in place would leak missing_name into the table).
        ranks = list(consensus[:level])
        ranks.extend([missing_name] * (level - len(ranks)))
        key = tuple(ranks)

        if key in result:
            result[key] += counts
        else:
            # copy so the in-place additions above never alias the table row
            result[key] = counts.copy()

    return result, sample_map
def add_summary_mapping(otu_table, mapping, level):
    """Returns sample summary of sample counts by taxon

    otu_table: OTU table passed straight to sum_counts_by_consensus
    mapping: iterable of rows whose first element is a sample id
    level: number of taxonomic ranks to summarize at

    Returns (summary, taxon_order):

    Summary is keyed by sample_id, valued by otu counts for each taxon

    Taxon order is a list of taxons where idx n corresponds to otu count idx n
    """
    counts_by_consensus, sample_map = sum_counts_by_consensus(otu_table, level)

    # Sort the taxa once; the same order is used for every sample's counts
    # and is what gets returned as taxon_order.
    taxon_order = sorted(counts_by_consensus.keys())
    ordered_counts = [counts_by_consensus[taxon] for taxon in taxon_order]

    summary = defaultdict(list)
    for row in mapping:
        # grab otu idx if the sample exists, otherwise ignore it
        sample_id = row[0]
        if sample_id not in sample_map:
            continue
        otu_idx = sample_map[sample_id]

        for counts in ordered_counts:
            summary[sample_id].append(counts[otu_idx])

    return summary, taxon_order