Permalink
Browse files

Refactor hmmer*-text indexers

Done because HMMER2 hmmsearch seems to be limited to a single query without any
clear query delimiter.
  • Loading branch information...
1 parent c6717f1 commit f7af04b52a5075e373d0ed9a89fe82f549f71b49 @bow bow committed Dec 9, 2012
Showing with 58 additions and 29 deletions.
  1. +2 −26 Bio/SearchIO/HmmerIO/_base.py
  2. +35 −2 Bio/SearchIO/HmmerIO/hmmer2_text.py
  3. +21 −1 Bio/SearchIO/HmmerIO/hmmer3_text.py
@@ -5,38 +5,14 @@
"""Bio.SearchIO base classes for HMMER-related code."""
-import re
-
-from Bio._py3k import _as_bytes, _bytes_to_string
+from Bio._py3k import _as_bytes
from Bio.SearchIO._index import SearchIndexer
-from Bio.SearchIO._utils import read_forward
class _BaseHmmerTextIndexer(SearchIndexer):
"""Base indexer class for HMMER plain text output."""
- def __iter__(self):
- handle = self._handle
- handle.seek(0)
- start_offset = handle.tell()
-
- while True:
- line = read_forward(handle)
- end_offset = handle.tell()
-
- if line.startswith(self.qresult_start):
- regx = re.search(self.regex_id, line)
- qresult_key = regx.group(1).strip()
- # qresult start offset is the offset of this line
- # (starts with the start mark)
- start_offset = end_offset - len(line)
- elif line.startswith(self.qresult_end):
- yield _bytes_to_string(qresult_key), start_offset, 0
- start_offset = end_offset
- elif not line:
- break
-
def get_raw(self, offset):
handle = self._handle
qresult_raw = _as_bytes('')
@@ -57,7 +33,7 @@ def get_raw(self, offset):
qresult_raw += line
# break when we've reached qresult end
- if line.startswith(self.qresult_end):
+ if line.startswith(self.qresult_end) or not line:
break
return qresult_raw
@@ -7,9 +7,10 @@
import re
-from Bio._py3k import _as_bytes
+from Bio._py3k import _as_bytes, _bytes_to_string
from Bio.Alphabet import generic_protein
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
+from Bio.SearchIO._utils import read_forward
from _base import _BaseHmmerTextIndexer
@@ -289,9 +290,41 @@ class Hmmer2TextIndexer(_BaseHmmerTextIndexer):
_parser = Hmmer2TextParser
qresult_start = _as_bytes('Query ')
+ # qresult end marks for hmmpfam and hmmsearch
+ # need to anticipate both since hmmsearch has a different query end mark
qresult_end = _as_bytes('//')
- regex_id = re.compile(_as_bytes(r'Query (?:sequence|HMM):\s*(.*)'))
def __iter__(self):
    """Iterate over the handle, yielding one tuple per query result.

    Each yielded item is ``(query key, start offset, 0)``: the key is
    group 1 of the ``Query sequence:`` / ``Query HMM:`` line, decoded
    with ``_bytes_to_string``; the start offset is the position of that
    line in the handle. The third element is always the literal 0.
    # NOTE(review): presumably a length placeholder -- confirm against
    # SearchIndexer.
    """
    handle = self._handle
    handle.seek(0)
    start_offset = handle.tell()
    regex_id = re.compile(_as_bytes(r'Query (?:sequence|HMM):\s*(.*)'))

    # Decide from the first line read whether this is hmmsearch output.
    # Every other comparison in this method treats the line as bytes
    # (qresult_start / qresult_end are built with _as_bytes and the key
    # is decoded with _bytes_to_string), so the program name has to be
    # bytes as well: bytes.startswith() raises TypeError when given a
    # str prefix on Python 3.
    line = read_forward(handle)
    is_hmmsearch = line.startswith(_as_bytes('hmmsearch'))

    while True:
        end_offset = handle.tell()

        if line.startswith(self.qresult_start):
            regx = re.search(regex_id, line)
            qresult_key = regx.group(1).strip()
            # qresult start offset is the offset of this line
            # (starts with the start mark)
            start_offset = end_offset - len(line)
        elif line.startswith(self.qresult_end):
            yield _bytes_to_string(qresult_key), start_offset, 0
            start_offset = end_offset
        elif not line:
            # HACK: hmmsearch output holds a single query result and
            # has no explicit end mark, so it is emitted once the
            # handle is exhausted (empty line returned by read_forward).
            if is_hmmsearch:
                yield _bytes_to_string(qresult_key), start_offset, 0
            break

        line = read_forward(handle)
# if not used as a module, run the doctest
if __name__ == "__main__":
@@ -374,8 +374,28 @@ class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
_parser = Hmmer3TextParser
qresult_start = _as_bytes('Query: ')
qresult_end = _as_bytes('//')
- regex_id = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
def __iter__(self):
    """Walk the handle and yield ``(query key, start offset, 0)`` tuples.

    A tuple is produced every time a line starting with ``qresult_end``
    is met; the key comes from group 1 of the most recent line starting
    with ``qresult_start``, decoded with ``_bytes_to_string``. The third
    element is always the literal 0.
    # NOTE(review): presumably a length placeholder -- confirm against
    # SearchIndexer.
    """
    source = self._handle
    source.seek(0)
    block_start = source.tell()
    id_pattern = re.compile(_as_bytes(_QRE_ID_LEN_PTN))

    while True:
        current = read_forward(source)
        position = source.tell()

        if current.startswith(self.qresult_start):
            query_key = id_pattern.search(current).group(1).strip()
            # the handle now sits just past this line, so stepping back
            # by its length gives the offset where the query result starts
            block_start = position - len(current)
            continue

        if current.startswith(self.qresult_end):
            yield _bytes_to_string(query_key), block_start, 0
            # whatever follows the end mark belongs to the next result
            block_start = position
            continue

        if not current:
            # an empty line from read_forward is treated as end of input
            return
# if not used as a module, run the doctest
if __name__ == "__main__":

0 comments on commit f7af04b

Please sign in to comment.