Permalink
Browse files

A more efficient implementation of a Newick parser (linear time vs. quadratic) that makes only a single pass over the text and handles quoted labels correctly.
  • Loading branch information...
1 parent d678376 commit b7cd97a0702295f326fc4f4f2be29371454c4deb @bendmorris bendmorris committed with etal Feb 5, 2013
Showing with 94 additions and 76 deletions.
  1. +94 −76 Bio/Phylo/NewickIO.py
View
@@ -64,92 +64,110 @@ def from_string(cls, treetext):
def parse(self, values_are_confidence=False, rooted=False):
    """Parse the text stream this object was initialized with.

    Lines read from ``self.handle`` are accumulated, with trailing
    whitespace removed, until a line ends with ';'. The accumulated text is
    then handed to ``self._parse_tree`` and the result is yielded. Text
    left over after the last line (a final tree without a terminating ';')
    is parsed the same way.

    :param values_are_confidence: stored on ``self.values_are_confidence``
        before any tree is parsed.
    :param rooted: stored on ``self.rooted`` before any tree is parsed.
    :returns: a generator yielding one ``self._parse_tree(...)`` result per
        tree found in the stream.
    """
    self.values_are_confidence = values_are_confidence
    self.rooted = rooted
    # Collect the pieces of the current tree and join them once per tree,
    # rather than re-copying a growing string for every input line.
    pieces = []
    for line in self.handle:
        piece = line.rstrip()
        if not piece:
            # Blank lines contribute nothing to the tree text.
            continue
        pieces.append(piece)
        if piece.endswith(';'):
            yield self._parse_tree(''.join(pieces))
            pieces = []
    if pieces:
        # Last tree is missing a terminal ';' character -- that's OK
        yield self._parse_tree(''.join(pieces))
def _parse_tree(self, text):
    """Parse the text representation into a Tree object.

    The text is scanned once, one character at a time. Parser state is kept
    in local variables and an explicit stack of enclosing clades, so no
    temporary attributes are attached to the Clade objects and no decision
    depends on the truthiness of a Clade.

    Recognized syntax:

    - ``(`` opens a child of the current clade, ``,`` starts a sibling and
      ``)`` returns to the parent; text following ``)`` labels the parent.
    - ``'...'`` is a quoted label; inside it a backslash makes the next
      character literal (so ``\\'`` does not end the label).
    - ``[...]`` is a comment, stored on the clade's ``comment`` attribute.
    - ``:`` introduces a number, converted with ``float()``. It is stored
      as ``confidence`` when ``self.values_are_confidence`` is true and as
      ``branch_length`` otherwise.
    - ``;`` is ignored.

    :param text: Newick description of a single tree; leading whitespace
        is skipped.
    :returns: ``Newick.Tree`` rooted at the outermost clade, created with
        ``rooted=self.rooted``.
    :raises NewickError: if parentheses are unbalanced, a comma appears
        outside any parentheses, or a quoted label or comment is never
        closed.
    :raises ValueError: from ``float()`` if the text following ``:`` is
        not a valid number.
    """
    text = text.lstrip()
    values_are_confidence = self.values_are_confidence

    def open_node():
        # Scratch state for one clade; 'comment' and 'length' stay None
        # until a '[' or ':' is actually seen for this clade.
        return {'clade': Newick.Clade(name=''),
                'name': [], 'comment': None, 'length': None}

    def finish(node):
        # Transfer the collected text onto the clade and return the clade.
        clade = node['clade']
        clade.name = ''.join(node['name']) or None
        if node['comment'] is not None:
            clade.comment = ''.join(node['comment'])
        if node['length'] is not None:
            value = float(''.join(node['length']))
            if values_are_confidence:
                clade.confidence = value
            else:
                clade.branch_length = value
        return clade

    stack = []  # enclosing clades that are still open, outermost first
    node = open_node()
    in_quote = False
    escaped = False
    in_comment = False
    in_length = False

    for char in text:
        if in_quote:
            # Collect label characters until an unescaped closing quote.
            if escaped:
                node['name'].append(char)
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == "'":
                in_quote = False
            else:
                node['name'].append(char)

        elif in_comment:
            # Collect comment characters until the closing square bracket.
            if char == ']':
                in_comment = False
            else:
                node['comment'].append(char)

        elif char == '(':
            # Start a new clade, which is a child of the current clade.
            stack.append(node)
            node = open_node()
            in_length = False

        elif char == ',':
            # Start a new clade at the same level as the current clade.
            if not stack:
                raise NewickError('Comma found outside of parentheses.')
            stack[-1]['clade'].clades.append(finish(node))
            node = open_node()
            in_length = False

        elif char == ')':
            # Done adding children for the enclosing clade.
            if not stack:
                raise NewickError('Parenthesis mismatch.')
            child = finish(node)
            node = stack.pop()
            node['clade'].clades.append(child)
            in_length = False

        elif char == '[':
            # Start a comment; a later comment on the same clade replaces
            # an earlier one.
            node['comment'] = []
            in_comment = True

        elif char == ':':
            # Start the numeric value; a later ':' on the same clade
            # replaces an earlier value.
            node['length'] = []
            in_length = True

        elif char == "'":
            # Start a quoted label.
            in_quote = True

        elif char == ';':
            pass

        elif in_length:
            node['length'].append(char)

        else:
            # Any other character is part of the (unquoted) node label.
            node['name'].append(char)

    if in_quote:
        raise NewickError('Unterminated quoted label.')
    if in_comment:
        raise NewickError('Unterminated comment.')
    if stack:
        # At least one '(' was never closed.
        raise NewickError('Parenthesis mismatch.')

    return Newick.Tree(root=finish(node), rooted=self.rooted)
# ---------------------------------------------------------

0 comments on commit b7cd97a

Please sign in to comment.