Skip to content

Commit

Permalink
Optimise FASTQ to QUAL for Bio.SeqIO.convert, see mailing list
Browse files Browse the repository at this point in the history
  • Loading branch information
peterjc committed Jan 17, 2011
1 parent 1eeed11 commit e26030a
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 0 deletions.
60 changes: 60 additions & 0 deletions Bio/SeqIO/_convert.py
Expand Up @@ -35,6 +35,7 @@ def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
#For FASTA output we can ignore the alphabet too
return SeqIO.write(records, out_handle, "fasta")


def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
"""Fast EMBL to FASTA (PRIVATE)."""
#We don't need to parse the features...
Expand All @@ -43,6 +44,7 @@ def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
#For FASTA output we can ignore the alphabet too
return SeqIO.write(records, out_handle, "fasta")


def _fastq_generic(in_handle, out_handle, mapping):
"""FASTQ helper function where can't have data loss by truncation (PRIVATE)."""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
Expand All @@ -57,6 +59,7 @@ def _fastq_generic(in_handle, out_handle, mapping):
raise ValueError("Invalid character in quality string")
out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
return count


def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
"""FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
Expand All @@ -77,6 +80,7 @@ def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg)
out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
return count


def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
Expand All @@ -93,6 +97,7 @@ def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
assert len(mapping)==256
return _fastq_generic(in_handle, out_handle, mapping)


def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
Expand All @@ -108,6 +113,7 @@ def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
assert len(mapping)==256
return _fastq_generic(in_handle, out_handle, mapping)


def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
Expand All @@ -123,6 +129,7 @@ def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None)
assert len(mapping)==256
return _fastq_generic(in_handle, out_handle, mapping)


def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
Expand All @@ -136,6 +143,7 @@ def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
assert len(mapping)==256
return _fastq_generic(in_handle, out_handle, mapping)


def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
Expand All @@ -153,6 +161,7 @@ def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
"Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")


def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE).
Expand Down Expand Up @@ -203,6 +212,7 @@ def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
assert len(mapping)==256
return _fastq_generic(in_handle, out_handle, mapping)


def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE).
Expand Down Expand Up @@ -257,6 +267,52 @@ def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
return count

def _fastq_convert_qual(in_handle, out_handle, mapping):
"""FASTQ helper function for QUAL output (PRIVATE).
Mapping should be a dictionary mapping expected ASCII characters from the
FASTQ quality string to PHRED quality scores (as strings).
"""
from Bio.SeqIO.QualityIO import FastqGeneralIterator
#For real speed, don't even make SeqRecord and Seq objects!
count = 0
for title, seq, qual in FastqGeneralIterator(in_handle):
count += 1
out_handle.write(">%s\n" % title)
#map the qual...
try:
qualities_strs = [mapping[ascii] for ascii in qual]
except KeyError:
raise ValueError("Invalid character in quality string")
while qualities_strs:
line = qualities_strs.pop(0)
while qualities_strs \
and len(line) + 1 + len(qualities_strs[0]) < 60:
line += " " + qualities_strs.pop(0)
out_handle.write(line + "\n")
return count


def _fastq_sanger_convert_qual(in_handle, out_handle, alphabet=None):
"""Fast Sanger FASTQ to QUAL conversion (PRIVATE)."""
mapping = dict((chr(q+33), str(q)) for q in range(0,93+1))
return _fastq_convert_qual(in_handle, out_handle, mapping)


def _fastq_solexa_convert_qual(in_handle, out_handle, alphabet=None):
"""Fast Solexa FASTQ to QUAL conversion (PRIVATE)."""
from Bio.SeqIO.QualityIO import phred_quality_from_solexa
mapping = dict((chr(q+64), str(int(round(phred_quality_from_solexa(q))))) \
for q in range(-5,62+1))
return _fastq_convert_qual(in_handle, out_handle, mapping)


def _fastq_illumina_convert_qual(in_handle, out_handle, alphabet=None):
"""Fast Illumina 1.3+ FASTQ to QUAL conversion (PRIVATE)."""
mapping = dict((chr(q+64), str(q)) for q in range(0,62+1))
return _fastq_convert_qual(in_handle, out_handle, mapping)


#TODO? - Handling aliases explicitly would let us shorten this list:
_converter = {
("genbank", "fasta") : _genbank_convert_fasta,
Expand Down Expand Up @@ -286,6 +342,10 @@ def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina,
("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina,
("fastq", "qual") : _fastq_sanger_convert_qual,
("fastq-sanger", "qual") : _fastq_sanger_convert_qual,
("fastq-solexa", "qual") : _fastq_solexa_convert_qual,
("fastq-illumina", "qual") : _fastq_illumina_convert_qual,
}

def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
Expand Down
3 changes: 3 additions & 0 deletions NEWS
Expand Up @@ -27,6 +27,9 @@ The SeqRecord object now as a reverse_complement method (similar to that of
the Seq object). This is most useful to reversing per-letter-annotation (such
as quality scores from FASTQ) or features (such as annotation from GenBank).

FASTQ to QUAL conversion Using Bio.SeqIO.convert(...) has been optimised, and
is now about three times faster.

(At least) 11 people have contributed to this release, including 3 new people:

Brad Chapman
Expand Down

0 comments on commit e26030a

Please sign in to comment.