#!/usr/bin/env python
"""Split SFF files into per-library SFF files, based on lists of read IDs."""
from __future__ import division

__author__ = "Kyle Bittinger"
__copyright__ = "Copyright 2011, The QIIME Project"
__credits__ = ["Kyle Bittinger"]
__license__ = "GPL"
__version__ = "1.3.0"
__maintainer__ = "Kyle Bittinger"
__email__ = "kylebittinger@gmail.com"
__status__ = "Release"

import itertools
import os
import subprocess

from qiime.process_sff import check_sfffile
from cogent.parse.binary_sff import (
    parse_binary_sff, write_binary_sff,
)


def filter_sff_reads(sff_data, ids_to_keep=None, ids_to_remove=None):
    """Retain reads whose ID is in ids_to_keep but not in ids_to_remove.

    This function reproduces the behavior of the -i and -e options in
    Roche's sfffile program.
    """
    # TODO: Move to PyCogent
    header, reads = sff_data
    # Force evaluation of all reads.  We have no choice, since the total
    # number of reads must be returned with the header.  Another design
    # choice would be to go back and correct the header when the binary
    # SFF file is written to disk -- maybe we'll switch to that strategy
    # in the future.
    if ids_to_keep is not None:
        reads = [r for r in reads if r['Name'] in ids_to_keep]
    if ids_to_remove is not None:
        reads = [r for r in reads if r['Name'] not in ids_to_remove]
    if not isinstance(reads, list):
        # Neither filter was applied; materialize the reads so they can
        # still be counted below.
        reads = list(reads)
    header['number_of_reads'] = len(reads)
    return header, reads
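
# Usage sketch (read names and file name below are hypothetical): keep two
# reads from a parsed SFF file, mirroring `sfffile -i ids.txt in.sff`.
#
#   sff_data = parse_binary_sff(open('in.sff', 'rb'), True)
#   header, reads = filter_sff_reads(
#       sff_data, ids_to_keep=set(['FLP3FBN01ELBSX', 'FLP3FBN01EG8AX']))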


def parse_id_list(id_list_file):
    """Parse a set of read IDs from an open file, one ID per line.

    Only the first whitespace-separated word on each line is kept, and
    a leading FASTA-style '>' is stripped, so FASTA label lines are
    also accepted.
    """
    ids = set()
    for line in id_list_file:
        words = line.split()
        if words:
            ids.add(words[0].lstrip('>'))
    return ids
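
# Example (read names are hypothetical): plain ID lists and FASTA label
# lines are both accepted, since the leading '>' is stripped and only the
# first word of each line is kept.
#
#   >>> from StringIO import StringIO
#   >>> sorted(parse_id_list(StringIO('>read1 some description\nread2\n')))
#   ['read1', 'read2']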


def combine_sff_data(*sff_datasets):
    """Combine parsed SFF datasets into a single (header, reads) pair.

    The header of the last dataset is reused, with its read count
    updated to the combined total.  The reads are chained lazily, so
    the returned sequence may be consumed only once.
    """
    combined_header = {'number_of_reads': 0}
    combined_reads = []
    for header, reads in sff_datasets:
        prev_num_reads = combined_header['number_of_reads']
        combined_header = header.copy()
        combined_header['number_of_reads'] += prev_num_reads
        combined_reads = itertools.chain(combined_reads, reads)
    # The combined data carries no read index, so zero out the index fields.
    combined_header['index_offset'] = 0
    combined_header['index_length'] = 0
    return combined_header, combined_reads
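
# Sketch (hypothetical file names): merge two runs into one lazy stream of
# reads before filtering.
#
#   data1 = parse_binary_sff(open('run1.sff', 'rb'), True)
#   data2 = parse_binary_sff(open('run2.sff', 'rb'), True)
#   header, reads = combine_sff_data(data1, data2)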


def make_per_library_sff(sff_fps, id_list_fp, debug=False):
    """Write a per-library SFF file next to the given ID list file."""
    id_list_basepath, _ = os.path.splitext(id_list_fp)
    output_fp = id_list_basepath + '.sff'
    # SFF is a binary format, so open input and output files in binary mode.
    sff_datasets = [parse_binary_sff(open(fp, 'rb'), True) for fp in sff_fps]
    sff_data = combine_sff_data(*sff_datasets)
    ids = parse_id_list(open(id_list_fp))
    filtered_sff_data = filter_sff_reads(sff_data, ids_to_keep=ids)
    if debug:
        print 'Creating SFF file for %s' % id_list_fp
    write_binary_sff(open(output_fp, 'wb'), *filtered_sff_data)
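
# Usage sketch (hypothetical paths): given runs run1.sff and run2.sff and an
# ID list libA.txt, this writes the filtered reads to libA.sff alongside it.
#
#   make_per_library_sff(['run1.sff', 'run2.sff'], 'libA.txt')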


def make_per_library_sff_with_sfffile(
        sff_fps, id_list_fp, sfffile_path=None, debug=False):
    """Write a per-library SFF file using Roche's sfffile program."""
    id_list_basepath, _ = os.path.splitext(id_list_fp)
    output_fp = id_list_basepath + '.sff'
    if sfffile_path is None:
        # No explicit path given; verify that sfffile is on the PATH.
        check_sfffile()
        sfffile_path = 'sfffile'
    args = [sfffile_path, '-i', id_list_fp, '-o', output_fp] + sff_fps
    if debug:
        print args
    subprocess.check_call(
        args, stdout=open(os.devnull, 'w'), stderr=open(os.devnull, 'w'))


def make_per_library_sffs(
        sff_fps, id_list_dir, use_sfftools=False, sfffile_path=None,
        debug=False):
    """Write per-library SFF files for every ID list under id_list_dir.

    Hidden files (names starting with '.') are skipped.
    """
    for dirpath, dirnames, filenames in os.walk(id_list_dir):
        for filename in filenames:
            if filename.startswith('.'):
                continue
            id_list_fp = os.path.join(dirpath, filename)
            if use_sfftools:
                make_per_library_sff_with_sfffile(
                    sff_fps, id_list_fp, sfffile_path, debug)
            else:
                make_per_library_sff(sff_fps, id_list_fp, debug)
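

if __name__ == '__main__':
    # Minimal command-line sketch, added here for illustration; the QIIME
    # release provides its own script wrapper for this module.  Paths are
    # hypothetical:
    #   python make_per_library_sff.py run1.sff [run2.sff ...] id_list_dir
    import sys
    if len(sys.argv) < 3:
        sys.exit('Usage: make_per_library_sff.py <sff_file> [...] <id_list_dir>')
    make_per_library_sffs(sys.argv[1:-1], sys.argv[-1], debug=True)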