Browse files

Parse COMMENT blocks correctly (see bug #2446), add a 'read' function to

replace ACEParser, and deprecate the previous parsers.
  • Loading branch information...
1 parent 9606868 commit df16d52ab11cb8ba9b33d715c946811458d8a3df mdehoon committed Aug 1, 2008
Showing with 133 additions and 76 deletions.
  1. +133 −76 Bio/Sequencing/Ace.py
View
209 Bio/Sequencing/Ace.py
@@ -3,25 +3,23 @@
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""
-Parser for (new) ACE files output by PHRAP.
+Parser for ACE files output by PHRAP.
-version 1.3, 05/06/2004
Written by Frank Kauff (fkauff@duke.edu) and
Cymon J. Cox (cymon@duke.edu)
Uses the Biopython Parser interface: ParserSupport.py
Usage:
-There are two ways of reading an ace file: The ACEParser() reads
-the whole file at once and the RecordParser() reads contig after
-contig.
+There are two ways of reading an ace file:
+1) The function 'read' reads the whole file at once;
+2) The function 'parse' reads the file contig after contig.
1) Parse whole ace file at once:
from Bio.Sequencing import Ace
- aceparser=Ace.ACEParser()
- acefilerecord=aceparser.parse(open('my_ace_file.ace','r'))
+ acefilerecord=Ace.read(open('my_ace_file.ace'))
This gives you:
acefilerecord.ncontigs (the number of contigs in the ace file)
@@ -44,9 +42,8 @@
2) Or you can iterate over the contigs of an ace file one by one in the usual way:
from Bio.Sequencing import Ace
- recordparser=Ace.RecordParser()
- iterator=Ace.Iterator(open('my_ace_file.ace','r'),recordparser)
- for contig in iterator :
+ contigs=Ace.parse(open('my_ace_file.ace'))
+ for contig in contigs:
print contig.name
...
@@ -56,11 +53,8 @@
Because the parser doesn't see this data until the final record, it cannot be added to
the appropriate records. Instead these tags will be returned with the last contig record.
Thus an ace file does not entirely suit the concept of iterating. If WA, CT, RT, WR tags
-are needed, the ACEParser instead of the RecordParser might be appropriate.
+are needed, the 'read' function rather than the 'parse' function might be more appropriate.
"""
-import os
-from Bio import File
-from Bio.ParserSupport import *
class rd:
@@ -161,6 +155,7 @@ def __init__(self, line=None):
self.date=''
self.notrans=''
self.info=[]
+ self.comment=[]
if line:
header=line.split()
self.name = header[0]
@@ -239,6 +234,19 @@ def __init__(self, line=None):
self.uorc = header[5]
def parse(handle):
+ """parse(handle)
+
+ where handle is a file-like object.
+
+ This function returns an iterator that allows you to iterate
+ over the ACE file record by record:
+
+ records = parse(handle)
+ for record in records:
+ # do something with the record
+
+ where each record is a Contig object.
+ """
handle = iter(handle)
@@ -399,8 +407,16 @@ def parse(handle):
record.ct.append(ct(line))
for line in handle:
line=line.strip()
- if line=='}': break
- record.ct[-1].info.append(line)
+ if line=="COMMENT{":
+ for line in handle:
+ line = line.strip()
+ if line.endswith("C}"):
+ break
+ record.ct[-1].comment.append(line)
+ elif line=='}':
+ break
+ else:
+ record.ct[-1].info.append(line)
line = ""
else:
break
@@ -409,7 +425,102 @@ def parse(handle):
break
yield record
+
+class ACEFileRecord:
+ """Holds data of an ACE file.
+ """
+ def __init__(self):
+ self.ncontigs=None
+ self.nreads=None
+ self.contigs=[]
+ self.wa=None # none or many
+
+ def sort(self):
+ """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
+
+ ct=[]
+ rt=[]
+ wr=[]
+ # search for tags that aren't in the right position
+ for i in range(len(self.contigs)):
+ c = self.contigs[i]
+ if c.wa:
+ if not self.wa:
+ self.wa=[]
+ self.wa.extend(c.wa)
+ if c.ct:
+ newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
+ map(self.contigs[i].ct.remove,newcts)
+ ct.extend(newcts)
+ for j in range(len(c.reads)):
+ r = c.reads[j]
+ if r.rt:
+ newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
+ map(self.contigs[i].reads[j].rt.remove,newrts)
+ rt.extend(newrts)
+ if r.wr:
+ newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
+ map(self.contigs[i].reads[j].wr.remove,newwrs)
+ wr.extend(newwrs)
+ # now sort them into their proper place
+ for i in range(len(self.contigs)):
+ c = self.contigs[i]
+ for ct_tag in ct:
+ if ct_tag.name==c.name:
+ if self.contigs[i].ct is None:
+ self.contigs[i].ct=[]
+ self.contigs[i].ct.append(ct_tag)
+ if rt or wr:
+ for j in range(len(c.reads)):
+ r = c.reads[j]
+ for rt_tag in rt:
+ if rt_tag.name==r.rd.name:
+ if self.contigs[i].reads[j].rt is None:
+ self.contigs[i].reads[j].rt=[]
+ self.contigs[i].reads[j].rt.append(rt_tag)
+ for wr_tag in wr:
+ if wr_tag.name==r.rd.name:
+ if self.contigs[i].reads[j].wr is None:
+ self.contigs[i].reads[j].wr=[]
+ self.contigs[i].reads[j].wr.append(wr_tag)
+
+def read(handle):
+ """Parses the full ACE file into an ACEFileRecord holding a list of contigs.
+
+ """
+
+ handle = iter(handle)
+
+ record=ACEFileRecord()
+
+ try:
+ line = handle.next()
+ except StopIteration:
+ raise ValueError, "Premature end of file"
+
+ # check if the file starts correctly
+ if not line.startswith('AS'):
+ raise ValueError, "File does not start with 'AS'."
+
+ words = line.split()
+ record.ncontigs, record.nreads = map(int, words[1:3])
+
+ # now read all the records
+ record.contigs = list(parse(handle))
+ # wa, ct, rt tags are usually at the end of the file, but not necessarily (correct?).
+ # If the iterator is used, the tags are returned with the contig or the read after which they appear,
+ # if all tags are at the end, they are read with the last contig. The concept of an
+ # iterator leaves no other choice. But if the user uses the 'read' function, we can check
+ # them and put them into the appropriate contig/read instance.
+ # Conclusion: An ACE file is not a filetype for which iteration is 100% suitable...
+ record.sort()
+ return record
+
+#### Everything below is deprecated
+from Bio import File
+from Bio.ParserSupport import *
+
class Iterator:
"""Iterates over an ACE-file with multiple contigs.
@@ -424,6 +535,8 @@ def __init__(self, handle, parser=None):
is an optional Parser object to change the results into another form.
If set to None, then the raw contents of the file will be returned.
"""
+ import warnings
+ warnings.warn("Ace.Iterator is deprecated. Instead of Ace.Iterator(handle, Ace.RecordParser()), please use Ace.parse(handle)", DeprecationWarning)
self._uhandle = File.UndoHandle(handle)
self._parser = parser
@@ -479,70 +592,14 @@ def parse(self, handle):
self._scanner.feed(uhandle, self._consumer)
return self._consumer.data
-class ACEFileRecord:
- """Holds data of an ACE file.
- """
- def __init__(self):
- self.ncontigs=None
- self.nreads=None
- self.contigs=[]
- self.wa=None # none or many
-
- def sort(self):
- """Sorts wr, rt and ct tags into the appropriate contig / read instance, if possible. """
-
- ct=[]
- rt=[]
- wr=[]
- # search for tags that aren't in the right position
- for i in range(len(self.contigs)):
- c = self.contigs[i]
- if c.wa:
- if not self.wa:
- self.wa=[]
- self.wa.extend(c.wa)
- if c.ct:
- newcts=[ct_tag for ct_tag in c.ct if ct_tag.name!=c.name]
- map(self.contigs[i].ct.remove,newcts)
- ct.extend(newcts)
- for j in range(len(c.reads)):
- r = c.reads[j]
- if r.rt:
- newrts=[rt_tag for rt_tag in r.rt if rt_tag.name!=r.rd.name]
- map(self.contigs[i].reads[j].rt.remove,newrts)
- rt.extend(newrts)
- if r.wr:
- newwrs=[wr_tag for wr_tag in r.wr if wr_tag.name!=r.rd.name]
- map(self.contigs[i].reads[j].wr.remove,newwrs)
- wr.extend(newwrs)
- # now sort them into their proper place
- for i in range(len(self.contigs)):
- c = self.contigs[i]
- for ct_tag in ct:
- if ct_tag.name==c.name:
- if self.contigs[i].ct is None:
- self.contigs[i].ct=[]
- self.contigs[i].ct.append(ct_tag)
- if rt or wr:
- for j in range(len(c.reads)):
- r = c.reads[j]
- for rt_tag in rt:
- if rt_tag.name==r.rd.name:
- if self.contigs[i].reads[j].rt is None:
- self.contigs[i].reads[j].rt=[]
- self.contigs[i].reads[j].rt.append(rt_tag)
- for wr_tag in wr:
- if wr_tag.name==r.rd.name:
- if self.contigs[i].reads[j].wr is None:
- self.contigs[i].reads[j].wr=[]
- self.contigs[i].reads[j].wr.append(wr_tag)
-
class ACEParser(AbstractParser):
"""Parses full ACE file in list of contigs.
"""
def __init__(self):
+ import warnings
+ warnings.warn("Ace.ACEParser is deprecated. Instead of Ace.ACEParser().parse(handle), please use Ace.read(handle)", DeprecationWarning)
self.data=ACEFileRecord()
def parse(self,handle):
@@ -840,8 +897,8 @@ def wr_data(self,taglines):
print "Quick self test"
#Test the iterator,
handle = open("../../Tests/Ace/contig1.ace")
- iterator = parse(handle)
- for contig in iterator :
+ contigs = parse(handle)
+ for contig in contigs:
print contig.name, len(contig.sequence), len(contig.reads)
handle.close()
print "Done"

0 comments on commit df16d52

Please sign in to comment.