Permalink
Browse files

Refactor hmmer3-text indexer to a new base class

  • Loading branch information...
1 parent ef9f50d commit be6b144ae79a0c76d55ef4f206cb85b7a9cb74f5 @bow bow committed Dec 9, 2012
Showing with 67 additions and 50 deletions.
  1. +63 −0 Bio/SearchIO/HmmerIO/_base.py
  2. +4 −50 Bio/SearchIO/HmmerIO/hmmer3_text.py
@@ -0,0 +1,63 @@
+# Copyright 2012 by Wibowo Arindrarto. All rights reserved.
+# This code is part of the Biopython distribution and governed by its
+# license. Please see the LICENSE file that should have been included
+# as part of this package.
+
+"""Bio.SearchIO base classes for HMMER-related code."""
+
+import re
+
+from Bio._py3k import _as_bytes, _bytes_to_string
+from Bio.SearchIO._index import SearchIndexer
+from Bio.SearchIO._utils import read_forward
+
+
+class _BaseHmmerTextIndexer(SearchIndexer):
+
+ """Base indexer class for HMMER plain text output."""
+
+ def __iter__(self):
+ handle = self._handle
+ handle.seek(0)
+ start_offset = handle.tell()
+
+ while True:
+ line = read_forward(handle)
+ end_offset = handle.tell()
+
+ if line.startswith(self.qresult_start):
+ regx = re.search(self.regex_id, line)
+ qresult_key = regx.group(1).strip()
+ # qresult start offset is the offset of this line
+ # (starts with the start mark)
+ start_offset = end_offset - len(line)
+ elif line.startswith(self.qresult_end):
+ yield _bytes_to_string(qresult_key), start_offset, 0
+ start_offset = end_offset
+ elif not line:
+ break
+
+ def get_raw(self, offset):
+ handle = self._handle
+ qresult_raw = _as_bytes('')
+
+ # read header first
+ handle.seek(0)
+ while True:
+ line = handle.readline()
+ if line.startswith(self.qresult_start):
+ break
+ qresult_raw += line
+
+ # and read the qresult raw string
+ handle.seek(offset)
+ while True:
+ # preserve whitespace, don't use read_forward
+ line = handle.readline()
+ qresult_raw += line
+
+ # break when we've reached qresult end
+ if line.startswith(self.qresult_end):
+ break
+
+ return qresult_raw
@@ -9,10 +9,10 @@
from Bio._py3k import _as_bytes, _bytes_to_string
from Bio.Alphabet import generic_protein
-from Bio.SearchIO._index import SearchIndexer
from Bio.SearchIO._model import QueryResult, Hit, HSP, HSPFragment
from Bio.SearchIO._utils import read_forward
+from _base import _BaseHmmerTextIndexer
__all__ = ['Hmmer3TextParser', 'Hmmer3TextIndexer']
@@ -24,10 +24,9 @@
_RE_VERSION = re.compile(r'# \w+ ([\w+\.]+) .*; http.*$')
# regex for option string capture
_RE_OPT = re.compile(r'^# (.+):\s+(.+)$')
-# regex for parsing query id and length, for parsing and indexing
+# regex for parsing query id and length, for parsing
_QRE_ID_LEN_PTN = r'^Query:\s*(.*)\s+\[\w=(\d+)\]'
_QRE_ID_LEN = re.compile(_QRE_ID_LEN_PTN)
-_QRE_ID_LEN_IDX = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
# regex for hsp validation
_HRE_VALIDATE = re.compile(r'score:\s(-?\d+\.?\d+)\sbits.*value:\s(.*)')
# regexes for parsing hsp alignment blocks
@@ -368,59 +367,14 @@ def _parse_aln_block(self, hid, hsp_list):
self.line = self.handle.readline()
-class Hmmer3TextIndexer(SearchIndexer):
+class Hmmer3TextIndexer(_BaseHmmerTextIndexer):
"""Indexer class for HMMER plain text output."""
_parser = Hmmer3TextParser
qresult_start = _as_bytes('Query: ')
qresult_end = _as_bytes('//')
-
- def __iter__(self):
- handle = self._handle
- handle.seek(0)
- start_offset = handle.tell()
-
- while True:
- line = read_forward(handle)
- end_offset = handle.tell()
-
- if line.startswith(self.qresult_start):
- regx = re.search(_QRE_ID_LEN_IDX, line)
- qresult_key = regx.group(1).strip()
- # qresult start offset is the offset of this line
- # (starts with the start mark)
- start_offset = end_offset - len(line)
- elif line.startswith(self.qresult_end):
- yield _bytes_to_string(qresult_key), start_offset, 0
- start_offset = end_offset
- elif not line:
- break
-
- def get_raw(self, offset):
- handle = self._handle
- qresult_raw = _as_bytes('')
-
- # read header first
- handle.seek(0)
- while True:
- line = handle.readline()
- if line.startswith(self.qresult_start):
- break
- qresult_raw += line
-
- # and read the qresult raw string
- handle.seek(offset)
- while True:
- # preserve whitespace, don't use read_forward
- line = handle.readline()
- qresult_raw += line
-
- # break when we've reached qresult end
- if line.startswith(self.qresult_end):
- break
-
- return qresult_raw
+ regex_id = re.compile(_as_bytes(_QRE_ID_LEN_PTN))
# if not used as a module, run the doctest

0 comments on commit be6b144

Please sign in to comment.