Skip to content

Commit

Permalink
Adding low level FASTA parser under Bio.SeqIO.FastaIO
Browse files Browse the repository at this point in the history
  • Loading branch information
peterjc committed Oct 22, 2012
1 parent 126c944 commit 751fe39
Showing 1 changed file with 41 additions and 31 deletions.
72 changes: 41 additions & 31 deletions Bio/SeqIO/FastaIO.py
Expand Up @@ -16,21 +16,13 @@
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.Interfaces import SequentialSequenceWriter

def SimpleFastaParser(handle):
"""Generator function to iterator over Fasta records (as string tuples).
def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
"""Generator function to iterate over Fasta records (as SeqRecord objects).
handle - input file
alphabet - optional alphabet
title2ids - A function that, when given the title of the FASTA
file (without the beginning >), will return the id, name and
description (in that order) for the record as a tuple of strings.
If this is not given, then the entire title line will be used
as the description, and the first word as the id and name.
Note that use of title2ids matches that of Bio.Fasta.SequenceParser
but the defaults are slightly different.
For each record a tuple of two strings is returned, the FASTA title
line (without the leading '>' character), and the sequence (with any
whitespace removed). The title line is not divided up into an
identifier (the first word) and comment or description.
"""
#Skip any text before the first record (e.g. blank lines, comments)
while True:
Expand All @@ -44,18 +36,7 @@ def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
if line[0] != ">":
raise ValueError(
"Records in Fasta files should start with '>' character")
if title2ids:
id, name, descr = title2ids(line[1:].rstrip())
else:
descr = line[1:].rstrip()
try:
id = descr.split()[0]
except IndexError:
assert not descr, repr(line)
#Should we use SeqRecord default for no ID?
id = ""
name = id

title = line[1:].rstrip()
lines = []
line = handle.readline()
while True:
Expand All @@ -69,16 +50,45 @@ def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
#Remove trailing whitespace, and any internal spaces
#(and any embedded \r which are possible in mangled files
#when not opened in universal read lines mode)
result = "".join(lines).replace(" ", "").replace("\r", "")

#Return the record and then continue...
yield SeqRecord(Seq(result, alphabet),
id=id, name=name, description=descr)
yield title, "".join(lines).replace(" ", "").replace("\r", "")

if not line:
return # StopIteration

assert False, "Should not reach this line"

def FastaIterator(handle, alphabet=single_letter_alphabet, title2ids=None):
    """Generator function to iterate over Fasta records (as SeqRecord objects).

    Arguments:
     - handle - input file
     - alphabet - optional alphabet
     - title2ids - A function that, when given the title of the FASTA
       file (without the beginning >), will return the id, name and
       description (in that order) for the record as a tuple of strings.

    If title2ids is not given, then the entire title line will be used
    as the description, and the first word as the id and name.

    Note that use of title2ids matches that of Bio.Fasta.SequenceParser
    but the defaults are slightly different.

    The low level parsing is done by SimpleFastaParser, which supplies
    each record as a (title, sequence) pair of strings; this function
    only turns those pairs into SeqRecord objects.
    """
    for title, sequence in SimpleFastaParser(handle):
        if title2ids:
            record_id, record_name, descr = title2ids(title)
        else:
            descr = title
            try:
                record_id = descr.split()[0]
            except IndexError:
                #split() gives an empty list only when there is no word
                #in the title at all. Report the title on failure (the
                #raw input line is not available in this function).
                assert not descr, repr(title)
                #Should we use SeqRecord default for no ID?
                record_id = ""
            record_name = record_id

        #Return the record and then continue...
        yield SeqRecord(Seq(sequence, alphabet),
                        id=record_id, name=record_name, description=descr)


class FastaWriter(SequentialSequenceWriter):
"""Class to write Fasta format files."""
Expand Down

1 comment on commit 751fe39

@peterjc
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The diff isn't as clear as it could be: this commit splits the function FastaIterator in two. The new low-level function SimpleFastaParser does the parsing and is now called by the high-level FastaIterator, which keeps the same API as before.

Please sign in to comment.