This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 269
/
collate_alpha.py
executable file
·125 lines (105 loc) · 5.72 KB
/
collate_alpha.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
# File created on 09 Feb 2010
from __future__ import division
import operator
import numpy
import os
import sys
from qiime.collate_alpha import write_output_file, make_output_row
from qiime.parse import (filter_otus_by_lineage, parse_matrix,
parse_rarefaction_fname)
from qiime.util import FunctionWithParams
from qiime.util import parse_command_line_parameters, make_option
# Module-level authorship / release metadata. `__version__` is also exposed
# to the command-line framework through script_info['version'] further down.
__author__ = "Justin Kuczynski"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Justin Kuczynski"]
__license__ = "GPL"
__version__ = "1.7.0"
__maintainer__ = "Justin Kuczynski"
__email__ = "justinak@gmail.com"
__status__ = "Release"
#collate_alpha.py
# Descriptive metadata for this script: a one-line summary, the long
# description, a usage example and a description of the output.  The whole
# dict is later expanded into parse_command_line_parameters(**script_info).
# Entries are declared in the same order as before; option lists and the
# version are added to this dict further down the file.
script_info = {
    'brief_description': """Collate alpha diversity results""",
    'script_description': """When performing batch analyses on the OTU table (e.g. rarefaction followed by alpha diversity), the result of alpha_diversity.py comprises many files, which need to be concatenated into a single file for generating rarefaction curves. This script joins those files.
Input files are:
each file represents one (rarefied) otu table
each row in a file represents one sample
each column in a file represents one diversity metric
Output files are:
each file represents one diversity metric
each row in a file represents one (rarefied) otu table
each column in a file represents one sample
The input directory should contain only otu tables. The output directory should be empty or nonexistant and the example file is optional.
If you have a set of rarefied OTU tables, make sure the example file contains every sample present in the otu talbes. You should typically choose the file with the fewest sequences per sample, to avoid files with sparse samples omitted.
""",
    # Each usage entry is a (title, explanation, command line) triple.
    'script_usage': [
        ("""Example:""",
         """The user inputs the results from batch alpha diversity (e.g. alpha_div/) and the location where the results should be written (e.g. collated_alpha/), as shown by the following command:""",
         """%prog -i alpha_div/ -o collated_alpha/"""),
    ],
    'output_description': """This script takes the resulting files from batch alpha diversity and collates them into (one file for each metric used).
This script transforms a series of files, named (e.g. alpha_rarefaction_20_0.txt, alpha_rarefaction_20_1.txt, etc.) into a (usually much smaller) set of files named (e.g. chao1.txt, PD_whole_tree.txt, etc.), where the columns correspond to samples and rows to the rarefaction files inputted, as shown by the following:
========================== ==================== ========= ====== ======
\ sequences per sample iteration PC.354 PC.355
========================== ==================== ========= ====== ======
alpha_rarefaction_20_0.txt 20 0 0.925 0.915
alpha_rarefaction_20_1.txt 20 1 0.9 0.89
alpha_rarefaction_20_2.txt 20 2 0.88 0.915
alpha_rarefaction_20_3.txt 20 3 0.91 0.93
... ... ... ... ...
========================== ==================== ========= ====== ======
""",
}
# Options the user must always supply: the directory holding the per-table
# alpha diversity results and the directory the collated files go to.
script_info['required_options'] = [
    make_option('-i', '--input_path', type='existing_path',
                help='input path (a directory)'),
    make_option('-o', '--output_path', type='new_dirpath',
                help='output path (a directory). will be created if needed'),
]

# Optional: an explicit example file naming the samples and metrics to
# collate.  When it is omitted, main() picks one from the input directory.
script_info['optional_options'] = [
    make_option('-e', '--example_path', type='existing_filepath',
                help=('example alpha_diversity analysis file, containing all samples'
                      ' and all metrics to be included in the collated result'
                      '[Default: chosen automatically (see usage string)]')),
]

# Version reported by the command-line framework.
script_info['version'] = __version__
def main():
    """Collate per-table alpha diversity files into one file per metric.

    Command-line options are parsed according to ``script_info``.  Every
    non-hidden file in the input directory is treated as one alpha
    diversity result.  For each metric listed in the example file, one
    output row is built per input file with ``make_output_row`` and the
    rows are handed to ``write_output_file`` together with the output
    directory.

    When no example file is given, the input file with the smallest
    sequences-per-sample value (field 1 of ``parse_rarefaction_fname``)
    is used as the example.

    Exits through ``option_parser.error`` if positional arguments are
    supplied, or if an example file has to be chosen automatically and the
    input directory contains no usable files.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    if args:
        # The original called `parser.error`, but no name `parser` exists in
        # this scope, so this path died with NameError instead of reporting
        # the usage problem.
        option_parser.error("Positional argument detected. make sure all"
                            ' parameters are identified.'
                            '\ne.g.: include the \"-m\" in \"-m MINIMUM_LENGTH\"')

    input_dir = opts.input_path
    output_dir = opts.output_path
    example_filepath = opts.example_path

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Hidden files (e.g. editor or filesystem droppings) are not results.
    file_names = [fname for fname in os.listdir(input_dir)
                  if not fname.startswith('.')]

    if example_filepath is None:
        if not file_names:
            # Previously this surfaced as a bare IndexError.
            option_parser.error("No input files found in %s; cannot choose"
                                " an example file automatically." % input_dir)
        # parse_rarefaction_fname yields (base_name, seqs_per_sam, iters, ext).
        # min() returns the first file with the smallest seqs_per_sam, the
        # same one the former stable sort + index lookup selected.
        example_fname = min(
            file_names,
            key=lambda fname: parse_rarefaction_fname(fname)[1])
        example_filepath = os.path.join(input_dir, example_fname)

    # The example file defines which metrics and samples appear in the output.
    with open(example_filepath, 'U') as example_file:
        all_metrics, all_samples, _ = parse_matrix(example_file)
    num_cols = len(all_samples)

    # Build a (rarefaction file x sample) table for each metric from the
    # (sample x metric) matrices stored in the input files; each metric
    # becomes one output file.
    # NOTE(review): every input file is re-parsed once per metric; caching
    # the parsed matrices would avoid that if make_output_row is known not
    # to modify its arguments.
    for metric in all_metrics:
        metric_file_data = []
        for fname in file_names:
            with open(os.path.join(input_dir, fname), 'U') as in_file:
                f_metrics, f_samples, f_data = parse_matrix(in_file)
            metric_file_data.append(
                make_output_row(f_metrics, metric, f_samples,
                                f_data, fname, num_cols, all_samples))
        write_output_file(metric_file_data, output_dir, metric, all_samples)
if __name__ == "__main__":
main()