Permalink
Browse files

Switch BLAST XML indexing to be line based.

This now *requires* that the <Iteration> marker be the first tag on a
line (which is how the NCBI BLAST XML files look), although in theory
an edited XML file could remove new line whitespace.

The reason for this change is to facilitate indexing BGZF compressed
BLAST XML files.

In principle we can still use the original approach, but determining
the start offset for a BGZF compressed BLAST XML file is far harder.
  • Loading branch information...
1 parent a6ec303 commit 600b231a1817035141c8de80e5689dcfd31290b5 @peterjc peterjc committed Dec 3, 2012
Showing with 41 additions and 71 deletions.
  1. +37 −67 Bio/SearchIO/BlastIO/blast_xml.py
  2. +4 −4 Tests/test_SearchIO_index.py
@@ -171,7 +171,6 @@
class BlastXmlParser(object):
-
"""Parser for the BLAST XML format"""
def __init__(self, handle):
@@ -485,7 +484,6 @@ def _parse_hsp(self, root_hsp_frag_elem, query_id, hit_id):
class BlastXmlIndexer(SearchIndexer):
-
"""Indexer class for BLAST XML output."""
_parser = BlastXmlParser
@@ -513,77 +511,51 @@ def __iter__(self):
counter = 0
while True:
- # read file content (every block size) and store into block
- block = handle.read(block_size)
- # for each iteration start mark
- for qstart_idx in self._get_offsets(block, qstart_mark):
- # get the id of the query (re.search gets the 1st result)
- regx = re.search(re_desc, block[qstart_idx:])
- try:
- qstart_desc = regx.group(2)
- qstart_id = regx.group(1)
- # handle cases where the iteration query ID tag lies in a block
- # split
- except AttributeError:
- # if we've found the end of the Iteration_query-def element
- # use the fallback values
- if re.search(re_desc_end, block[qstart_idx:]):
- qstart_desc = _as_bytes(self._fallback['description'])
- qstart_id = _as_bytes(self._fallback['id'])
- # otherwise, extend the read block and retrieve the ID and
- # desc from the extended read
- else:
- # extend the cached read
- block_ext = block + handle.read(block_size)
- regx = re.search(re_desc, block_ext[qstart_idx:])
- qstart_desc = regx.group(2)
- qstart_id = regx.group(1)
- # set file pointer to the position before block_ext
- handle.seek((counter + 1) * block_size)
-
- # now for getting the length
- # try finding it in the block
- qlen = block[qstart_idx:].find(qend_mark)
- # if not there, loop until query end is found
- block_ext = block
- while qlen < 0:
- ext = handle.read(block_size)
- if not ext:
- raise ValueError("Query end for %r not found" %
- qstart_id)
- block_ext += ext
- qlen = block_ext[qstart_idx:].find(qend_mark)
-
- # adjust for read blocks
- qstart_idx = counter * block_size + qstart_idx
- qlen = len(qend_mark) + qlen
- # move pointer to original position
- handle.seek((counter + 1) * block_size)
- if qstart_id.startswith(blast_id_mark):
- qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
- # yield key, offset, length
- yield _bytes_to_string(qstart_id), qstart_idx, qlen
-
- counter += 1
-
- if not block:
+ start_offset = handle.tell()
+ line = handle.readline()
+ if not line:
break
-
- def _get_offsets(self, string, sub, offset=0):
- """Retrieves the offsets of a given substring in a string."""
- idx = string.find(sub, offset)
- # yield offset as long as it's present in the string
- while idx >= 0:
- yield idx
- idx = string.find(sub, idx + 1)
+ if qstart_mark not in line:
+ continue
+ # The following requirements are to make supporting BGZF compressed
+ # BLAST XML files simpler (avoids complex offset manipulations):
+ assert line.count(qstart_mark) == 1, "XML without line breaks?"
+ assert line.lstrip().startswith(qstart_mark), line
+ if qend_mark in line:
+ # Should cope with <Iteration>...</Iteration> on one long line
+ block = line
+ else:
+ # Load the rest of this block up to and including </Iteration>
+ block = [line]
+ while line and qend_mark not in line:
+ line = handle.readline()
+ assert qstart_mark not in line, line
+ block.append(line)
+ assert line.rstrip().endswith(qend_mark), line
+ block = "".join(block)
+ assert block.count(qstart_mark) == 1, "XML without line breaks? %r" % block
+ assert block.count(qend_mark) == 1, "XML without line breaks? %r" % block
+ #Now we have a full <Iteration>...</Iteration> block, find the ID
+ regx = re.search(re_desc, block)
+ try:
+ qstart_desc = regx.group(2)
+ qstart_id = regx.group(1)
+ except AttributeError:
+ # use the fallback values
+ assert re.search(re_desc_end, block)
+ qstart_desc = _as_bytes(self._fallback['description'])
+ qstart_id = _as_bytes(self._fallback['id'])
+ if qstart_id.startswith(blast_id_mark):
+ qstart_id = qstart_desc.split(_as_bytes(' '), 1)[0]
+ yield _bytes_to_string(qstart_id), start_offset, len(block)
+ counter += 1
def _parse(self, handle):
# overwrites SearchIndexer._parse, since we need to set the meta and
# fallback dictionaries to the parser
generator = self._parser(handle, **self._kwargs)
generator._meta = self._meta
generator._fallback = self._fallback
-
return iter(generator).next()
def get_raw(self, offset):
@@ -612,7 +584,6 @@ def get_raw(self, offset):
class _BlastXmlGenerator(XMLGenerator):
-
"""Event-based XML Generator."""
def __init__(self, out, encoding='utf-8', indent=" ", increment=2):
@@ -704,7 +675,6 @@ def characters(self, content):
class BlastXmlWriter(object):
-
"""Stream-based BLAST+ XML Writer."""
def __init__(self, handle):
@@ -30,7 +30,7 @@ def test_blastxml_2226_multiple_first(self):
"""Test blast-xml raw string retrieval, BLAST 2.2.26+, multiple queries, first (xml_2226_blastp_001.xml)"""
filename = 'Blast/xml_2226_blastp_001.xml'
idx = SearchIO.index(filename, self.fmt)
- raw = """<Iteration>
+ raw = """ <Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>random_s00</Iteration_query-def>
@@ -56,7 +56,7 @@ def test_blastxml_2226_multiple_middle(self):
"""Test blast-xml raw string retrieval, BLAST 2.2.26+, multiple queries, middle (xml_2226_blastp_001.xml)"""
filename = 'Blast/xml_2226_blastp_001.xml'
idx = SearchIO.index(filename, self.fmt)
- raw = """<Iteration>
+ raw = """ <Iteration>
<Iteration_iter-num>2</Iteration_iter-num>
<Iteration_query-ID>Query_2</Iteration_query-ID>
<Iteration_query-def>gi|16080617|ref|NP_391444.1| membrane bound lipoprotein [Bacillus subtilis subsp. subtilis str. 168]</Iteration_query-def>
@@ -222,7 +222,7 @@ def test_blastxml_2226_multiple_last(self):
"""Test blast-xml raw string retrieval, BLAST 2.2.26+, multiple queries, last (xml_2226_blastp_001.xml)"""
filename = 'Blast/xml_2226_blastp_001.xml'
idx = SearchIO.index(filename, self.fmt)
- raw = """<Iteration>
+ raw = """ <Iteration>
<Iteration_iter-num>3</Iteration_iter-num>
<Iteration_query-ID>Query_3</Iteration_query-ID>
<Iteration_query-def>gi|11464971:4-101 pleckstrin [Mus musculus]</Iteration_query-def>
@@ -483,7 +483,7 @@ def test_blastxml_2226_single(self):
"""Test blast-xml raw string retrieval, BLAST 2.2.26+, single query (xml_2226_blastp_004.xml)"""
filename = 'Blast/xml_2226_blastp_004.xml'
idx = SearchIO.index(filename, self.fmt)
- raw = """<Iteration>
+ raw = """ <Iteration>
<Iteration_iter-num>1</Iteration_iter-num>
<Iteration_query-ID>Query_1</Iteration_query-ID>
<Iteration_query-def>gi|11464971:4-101 pleckstrin [Mus musculus]</Iteration_query-def>

0 comments on commit 600b231

Please sign in to comment.