This repository has been archived by the owner on Nov 9, 2023. It is now read-only.
/
trim_sff_primers.py
169 lines (141 loc) · 5.91 KB
/
trim_sff_primers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python
#file trim_sff_primers.py: resets trim values in sff file based on primers.
__author__ = "Rob Knight"
__copyright__ = "Copyright 2010, The QIIME Project"
__credits__ = ["Rob Knight", 'Kyle Bittinger']
__license__ = "GPL"
__version__ = "1.2.0"
__maintainer__ = "Kyle Bittinger"
__email__ = "kylebittinger@gmail.com"
__status__ = "Release"
"""Finds the technical read regions for each library, and resets the left trim.
Replaces the sff files with the trimmed versions.
"""
import itertools
import os
import shutil
import subprocess
import sys
import tempfile
from cogent.util.misc import app_path
from cogent.app.util import ApplicationNotFoundError
from qiime.parse import parse_mapping_file
from cogent.parse.binary_sff import (
parse_binary_sff, write_binary_sff
)
def get_technical_lengths(input_map, debug=False):
"""Returns per-sample info on technical lengths.
Note: KEY_SEQ, BARCODE and PRIMER fields are required. The LINKER
field is optional.
"""
if debug:
print "Making debug output"
body, header, comments = parse_mapping_file(input_map)
if debug:
print "HEADER:", header
key_index = header.index('KEY_SEQ')
bc_index = header.index('BARCODE')
if 'LINKER' in header:
linker_index = header.index('LINKER')
else:
linker_index = None
primer_index = header.index('PRIMER')
technical_lengths = {}
for fields in body:
curr_tech_len = len(fields[key_index]) + len(fields[bc_index]) + \
len(fields[primer_index])
if linker_index is not None:
curr_tech_len += len(fields[linker_index])
technical_lengths[fields[0]] = curr_tech_len
if debug:
print "Technical lengths:"
print technical_lengths
return technical_lengths
def get_per_lib_sff_fps(sff_dir):
"""Return a dict mapping library ID's to SFF filepaths in a directory.
"""
for dirpath, dirnames, fnames in os.walk(sff_dir):
for fname in fnames:
if fname.endswith('.sff'):
libname, _ = os.path.splitext(fname)
yield libname, os.path.join(dirpath, fname)
def set_clip_qual_left(sff_data, technical_read_length):
"""Resets the value of clip_qual_left for each read in the SFF data.
For efficiency, the original read objects are not copied, and are
modified as a side effect of this function.
"""
# TODO: Move to PyCogent
header, reads = sff_data
# sfftools use 1-based indexing
clip_idx = technical_read_length + 1
def adjust_read(read):
read['clip_qual_left'] = clip_idx
return read
return header, itertools.imap(adjust_read, reads)
def set_sff_trimpoints(sff_dir, technical_lengths):
"""Set trimpoints to end of technical read for all SFF files in directory.
"""
for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
try:
readlength = technical_lengths[lib_id]
except KeyError:
continue
sff_data = parse_binary_sff(open(sff_fp), True)
clipped_header, clipped_reads = set_clip_qual_left(sff_data, readlength)
_, temp_fp = tempfile.mkstemp(dir=sff_dir)
with open(temp_fp, 'w') as f:
write_binary_sff(f, clipped_header, clipped_reads)
shutil.move(temp_fp, sff_fp)
def set_sff_trimpoints_with_sfftools(
sff_dir, technical_lengths, sffinfo_path='sffinfo', sfffile_path='sfffile',
debug=False):
"""Set trimpoints to end of technical read for all SFF files in directory.
This function essentially provides the reference implementation.
It uses the official sfftools from Roche to process the SFF files.
"""
if not (os.path.exists(sffinfo_path) or app_path(sffinfo_path)):
raise ApplicationNotFoundError(
'sffinfo executable not found. Is it installed and in your $PATH?')
if not (os.path.exists(sfffile_path) or app_path(sfffile_path)):
raise ApplicationNotFoundError(
'sfffile executable not found. Is it installed and in your $PATH?')
for lib_id, sff_fp in get_per_lib_sff_fps(sff_dir):
try:
readlength = technical_lengths[lib_id]
except KeyError:
continue
sffinfo_args = [sffinfo_path, '-s', sff_fp]
if debug:
print "Running sffinfo command %s" % sffinfo_args
sffinfo_output_file = tempfile.TemporaryFile()
subprocess.check_call(sffinfo_args, stdout=sffinfo_output_file)
sffinfo_output_file.seek(0)
seqlengths = {}
for line in sffinfo_output_file:
if line.startswith('>'):
fields = line[1:].split()
seq_len = fields[1].split('=')[1]
seqlengths[fields[0]] = seq_len
trim_fp = sff_fp + '.trim'
trim_file = open(trim_fp, 'w')
for id_, length in seqlengths.items():
curr_length = int(seqlengths[id_])
# Sfftools use 1-based index
left_trim = readlength + 1
# Key sequence not included in FASTA length
right_trim = curr_length + 4
if curr_length > left_trim:
trim_file.write(
"%s\t%s\t%s\n" % (id_, left_trim, right_trim))
else:
sys.stderr.write(
'Rejected read %s with trim points %s and %s (orig '
'length %s)' % (id_, left_trim, curr_length, length))
trim_file.close()
trimmed_sff_fp = sff_fp + '.trimmed'
sfffile_args = [
sfffile_path, '-t', trim_fp, '-o', trimmed_sff_fp, sff_fp]
if debug:
print "Running sfffile command:", sfffile_args
subprocess.check_call(sfffile_args, stdout=open(os.devnull, 'w'))
subprocess.check_call(['mv', trimmed_sff_fp, sff_fp])