This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
split.py
121 lines (99 loc) · 4.97 KB
/
split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python
# File created on 24 Feb 2012
from __future__ import division
__author__ = "Greg Caporaso"
__copyright__ = "Copyright 2011, The QIIME project"
__credits__ = ["Greg Caporaso"]
__license__ = "GPL"
__version__ = "1.6.0"
__maintainer__ = "Greg Caporaso"
__email__ = "gregcaporaso@gmail.com"
__status__ = "Release"
from cogent.parse.fasta import MinimalFastaParser
from cogent.util.misc import create_dir
from qiime.parse import parse_mapping_file
from qiime.filter import filter_mapping_file, sample_ids_from_metadata_description
from qiime.format import format_mapping_file, format_biom_table
from biom.parse import parse_biom_table
from biom.table import TableException
def get_mapping_values(mapping_f, mapping_field):
    """ Return the set of values observed for mapping_field in mapping_f

        mapping_f: open file handle or list of lines in QIIME mapping
         file format
        mapping_field: header to collect values for (match is case- and
         white-space sensitive)

        Raises ValueError if mapping_field is not one of the mapping
        file's headers.
    """
    mapping_data, mapping_headers, comments = parse_mapping_file(mapping_f)
    try:
        field_index = mapping_headers.index(mapping_field)
    except ValueError:
        # Bug fix: the original called option_parser.error() here, but
        # option_parser is not defined anywhere in this module, so a bad
        # field name crashed with a NameError instead of reporting the
        # problem. Raise ValueError so library callers can handle it.
        raise ValueError("Field is not in mapping file (search is case "
                         "and white-space sensitive). \n\tProvided field: "
                         "%s. \n\tValid fields: %s"
                         % (mapping_field, ' '.join(mapping_headers)))
    return set([e[field_index] for e in mapping_data])
def split_mapping_file_on_field(mapping_f,
                                mapping_field,
                                column_rename_ids=None,
                                include_repeat_cols=True):
    """ Split mapping file based on value in field

        Yields (value_str, mapping_file_str) pairs, one per distinct value
        observed in mapping_field. Spaces in the value are replaced with
        underscores in value_str so it is safe to use in a filepath.

        mapping_f: open file handle or list of lines in QIIME mapping
         file format
        mapping_field: header whose values define the split (match is
         case- and white-space sensitive)
        column_rename_ids: optional header name; if provided, sample ids
         in each output mapping file are renamed from that column
        include_repeat_cols: whether columns with identical values across
         all remaining samples are retained in the output

        Raises ValueError if mapping_field or column_rename_ids is not
        one of the mapping file's headers.
    """
    mapping_f = list(mapping_f)
    mapping_values = get_mapping_values(mapping_f, mapping_field)
    mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
    if column_rename_ids:
        try:
            column_rename_ids = mapping_headers.index(column_rename_ids)
        except ValueError:
            # Bug fix: the original called option_parser.error(), but
            # option_parser is undefined in this module (NameError at
            # runtime). It also reported mapping_field in the message even
            # though the lookup that failed was for column_rename_ids;
            # report the field that is actually missing.
            raise ValueError("Field is not in mapping file (search is case "
                             "and white-space sensitive). \n\tProvided field: "
                             "%s. \n\tValid fields: %s"
                             % (column_rename_ids, ' '.join(mapping_headers)))
    for v in mapping_values:
        v_fp_str = v.replace(' ', '_')
        sample_ids_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, v))
        # parse the mapping file each time through the loop because
        # filter_mapping_file operates on (and would otherwise accumulate
        # changes to) the parsed values
        mapping_data, mapping_headers, _ = parse_mapping_file(mapping_f)
        mapping_headers, mapping_data = filter_mapping_file(
            mapping_data,
            mapping_headers,
            sample_ids_to_keep,
            include_repeat_cols=include_repeat_cols,
            column_rename_ids=column_rename_ids)
        yield v_fp_str, format_mapping_file(mapping_headers, mapping_data)
def split_otu_table_on_sample_metadata(otu_table_f, mapping_f, mapping_field):
    """ Split an OTU table by the value of mapping_field for each sample

        Yields (value_str, biom_table_str) pairs, one per distinct value
        of mapping_field; each table contains only the samples with that
        value. Spaces in the value are replaced with underscores in
        value_str so it is safe to use in a filepath. Values for which
        every sample is filtered out of the OTU table are skipped.
    """
    mapping_f = list(mapping_f)
    field_states = get_mapping_values(mapping_f, mapping_field)
    full_table = parse_biom_table(otu_table_f)
    for state in field_states:
        fp_safe_state = state.replace(' ', '_')
        samples_to_keep = sample_ids_from_metadata_description(
            mapping_f, valid_states_str="%s:%s" % (mapping_field, state))
        try:
            sub_table = full_table.filterSamples(
                lambda values, id_, metadata: id_ in samples_to_keep)
        except TableException:
            # no samples with this state remain in the OTU table, so
            # there is nothing to write for it
            continue
        yield fp_safe_state, format_biom_table(sub_table)
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each

        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        outfile_prefix: string used to create output filepaths - output
         filepaths are <working_dir><outfile_prefix>.<i>.fasta where i runs
         from 0 to the number of output files
        working_dir: directory to prepend to output filepaths (defaults to
         empty string -- files written to cwd)

        List of output filepaths is returned.
    """
    seq_counter = 0
    out_files = []
    current_out_file = None
    if working_dir:
        # normalize to a trailing slash so the filepath join below works,
        # and make sure the output directory exists
        if not working_dir.endswith('/'):
            working_dir += '/'
        create_dir(working_dir)
    for seq_id, seq in MinimalFastaParser(infile):
        if seq_counter == 0:
            # start a new output file
            current_out_fp = '%s%s.%d.fasta' \
                % (working_dir, outfile_prefix, len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1
        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0
    # Bug fix: the original never closed the last output file when the
    # total number of sequences was not an exact multiple of
    # seqs_per_file, leaking the handle and risking unflushed writes.
    if current_out_file is not None and not current_out_file.closed:
        current_out_file.close()
    return out_files