Skip to content

Commit

Permalink
Add hmmer-domtab indexing and its tests
Browse files Browse the repository at this point in the history
  • Loading branch information
bow committed Jun 21, 2012
1 parent fcf0781 commit 4e7aefd
Show file tree
Hide file tree
Showing 3 changed files with 97 additions and 36 deletions.
45 changes: 13 additions & 32 deletions Bio/SearchIO/HmmerIO/hmmerdomtab.py
Expand Up @@ -10,6 +10,7 @@
from Bio._py3k import _as_bytes, _bytes_to_string
from Bio.SearchIO._objects import QueryResult, Hit, HSP
from Bio.SearchIO._index import SearchIndexer
from hmmertab import HmmerTabIndexer


def hmmer_domtab_hmmhit_iterator(handle):
Expand Down Expand Up @@ -183,48 +184,28 @@ def parse_qresult(self):
self.line = read_forward(self.handle)


class HmmerDomtabHmmhitIndexer(SearchIndexer):
class HmmerDomtabHmmhitIndexer(HmmerTabIndexer):

"""Indexer class for HMMER domain table output that assumes HMM profile
coordinates are hit coordinates."""

def __init__(self, *args, **kwargs):
SearchIndexer.__init__(self, *args, **kwargs)
HmmerTabIndexer.__init__(self, *args, **kwargs)
# set parser for on-the-fly parsing
self._parser = hmmer_domtab_iterator
self._handle.seek(0)
self._parser = hmmer_domtab_hmmhit_iterator
self._query_id_idx = 3

def __iter__(self):
"""Iterates over the file handle; yields key, start offset, and length."""
handle = self._handle
handle.seek(0)
split_char = _as_bytes(' ')
qresult_key = None

# read through header
while True:
start_offset = handle.tell()
line = read_forward(handle, strip=False)
if not line.startswith('#'):
break

# and index the qresults
#while True:
# end_offset = handle.tell()
class HmmerDomtabHmmqueryIndexer(HmmerTabIndexer):

# if not line:
# break
# if qresult_key is None:

def get_raw(self, offset):
"""Returns the raw string of a QueryResult object from the given offset."""
handle = self._handle
handle.seek(offset)
split_char = _as_bytes(' ')
qresult_key = None
qresult_raw = ''
"""Indexer class for HMMER domain table output that assumes HMM profile
coordinates are query coordinates."""

return qresult_raw
def __init__(self, *args, **kwargs):
HmmerTabIndexer.__init__(self, *args, **kwargs)
# set parser for on-the-fly parsing
self._parser = hmmer_domtab_hmmquery_iterator
self._query_id_idx = 3


class HmmerDomtabHmmhitWriter(object):
Expand Down
16 changes: 12 additions & 4 deletions Bio/SearchIO/HmmerIO/hmmertab.py
Expand Up @@ -152,12 +152,15 @@ def __init__(self, *args, **kwargs):
# set parser for on-the-fly parsing
self._parser = hmmer_tab_iterator
self._handle.seek(0)
# denotes column location for query identifier
self._query_id_idx = 2

def __iter__(self):
"""Iterates over the file handle; yields key, start offset, and length."""
handle = self._handle
handle.seek(0)
split_char = _as_bytes(' ')
query_id_idx = self._query_id_idx
qresult_key = None

# read through header
Expand All @@ -174,9 +177,11 @@ def __iter__(self):
if not line:
break
if qresult_key is None:
qresult_key = filter(None, line.strip().split(split_char))[2]
qresult_key = filter(None, \
line.strip().split(split_char))[query_id_idx]
else:
curr_key = filter(None, line.strip().split(split_char))[2]
curr_key = filter(None, \
line.strip().split(split_char))[query_id_idx]

if curr_key != qresult_key:
yield _bytes_to_string(qresult_key), start_offset, \
Expand All @@ -195,6 +200,7 @@ def get_raw(self, offset):
handle = self._handle
handle.seek(offset)
split_char = _as_bytes(' ')
query_id_idx = self._query_id_idx
qresult_key = None
qresult_raw = ''

Expand All @@ -203,9 +209,11 @@ def get_raw(self, offset):
if not line:
break
if qresult_key is None:
qresult_key = filter(None, line.strip().split(split_char))[2]
qresult_key = filter(None, \
line.strip().split(split_char))[query_id_idx]
else:
curr_key = filter(None, line.strip().split(split_char))[2]
curr_key = filter(None, \
line.strip().split(split_char))[query_id_idx]
if curr_key != qresult_key:
break
qresult_raw += line
Expand Down
72 changes: 72 additions & 0 deletions Tests/test_SearchIO_index.py
Expand Up @@ -1189,6 +1189,50 @@ def test_hmmertab_30_single(self):
self.assertEqual(raw, idx.get_raw('gi|126362951:116-221'))


class HmmerDomtabRawCases(unittest.TestCase):
    """Raw-string retrieval checks for indexed hmmscan domain table files.

    Each test indexes one file under ``Hmmer`` using the class-level
    format name and compares the text returned by ``get_raw`` for a
    single query ID against the expected rows.
    """

    fmt = 'hmmscan-domtab'

    def _raw_for(self, basename, query_id):
        """Index ``Hmmer/<basename>`` and return the raw text for query_id."""
        index = SearchIO.index(os.path.join('Hmmer', basename), self.fmt)
        return index.get_raw(query_id)

    def test_hmmerdomtab_30_multiple_first(self):
        """Test hmmscan-domtab raw string retrieval, HMMER 3.0, multiple queries, first (domtab_30_hmmscan_001.out)"""
        # Expected rows are kept flush-left so the literal carries no indent.
        expected = """Globin PF00042.17 108 gi|4885477|ref|NP_005359.1| - 154 6e-21 74.6 0.3 1 1 6.7e-25 9.2e-21 74.0 0.2 1 107 7 112 7 113 0.97 Globin
"""
        found = self._raw_for('domtab_30_hmmscan_001.out',
                              'gi|4885477|ref|NP_005359.1|')
        self.assertEqual(expected, found)

    def test_hmmerdomtab_30_multiple_middle(self):
        """Test hmmscan-domtab raw string retrieval, HMMER 3.0, multiple queries, middle (domtab_30_hmmscan_001.out)"""
        expected = """Ig_3 PF13927.1 75 gi|126362951:116-221 - 106 1.4e-09 38.2 0.4 1 1 3e-13 2.1e-09 37.6 0.3 1 73 9 84 9 88 0.94 Immunoglobulin domain
Ig_2 PF13895.1 80 gi|126362951:116-221 - 106 3.5e-05 23.7 0.1 1 1 6.2e-09 4.3e-05 23.4 0.1 1 80 9 104 9 104 0.71 Immunoglobulin domain
"""
        found = self._raw_for('domtab_30_hmmscan_001.out',
                              'gi|126362951:116-221')
        self.assertEqual(expected, found)

    def test_hmmerdomtab_30_multiple_last(self):
        """Test hmmscan-domtab raw string retrieval, HMMER 3.0, multiple queries, last (domtab_30_hmmscan_001.out)"""
        expected = """Pou PF00157.12 75 gi|125490392|ref|NP_038661.2| - 352 7e-37 124.8 0.5 1 1 5e-40 1.4e-36 123.9 0.3 3 75 133 205 131 205 0.97 Pou domain - N-terminal to homeobox domain
Homeobox PF00046.24 57 gi|125490392|ref|NP_038661.2| - 352 2.1e-18 65.5 1.1 1 1 1.5e-21 4.1e-18 64.6 0.7 1 57 224 280 224 280 0.98 Homeobox domain
HTH_31 PF13560.1 64 gi|125490392|ref|NP_038661.2| - 352 0.012 15.6 0.0 1 2 5.7e-05 0.16 12.0 0.0 1 35 141 181 141 184 0.96 Helix-turn-helix domain
HTH_31 PF13560.1 64 gi|125490392|ref|NP_038661.2| - 352 0.012 15.6 0.0 2 2 0.19 5.2e+02 0.8 0.0 39 62 245 268 243 270 0.86 Helix-turn-helix domain
Homeobox_KN PF05920.6 40 gi|125490392|ref|NP_038661.2| - 352 0.039 13.5 0.0 1 1 3.5e-05 0.095 12.3 0.0 7 39 244 276 241 277 0.91 Homeobox KN domain
DUF521 PF04412.8 400 gi|125490392|ref|NP_038661.2| - 352 0.14 10.5 0.1 1 1 9.4e-05 0.26 9.6 0.1 273 334 221 280 197 294 0.77 Protein of unknown function (DUF521)
"""
        found = self._raw_for('domtab_30_hmmscan_001.out',
                              'gi|125490392|ref|NP_038661.2|')
        self.assertEqual(expected, found)

    def test_hmmerdomtab_30_single(self):
        """Test hmmscan-domtab raw string retrieval, HMMER 3.0, single query (domtab_30_hmmscan_004.out)"""
        expected = """Ig_3 PF13927.1 75 gi|126362951:116-221 - 106 1.4e-09 38.2 0.4 1 1 3e-13 2.1e-09 37.6 0.3 1 73 9 84 9 88 0.94 Immunoglobulin domain
Ig_2 PF13895.1 80 gi|126362951:116-221 - 106 3.5e-05 23.7 0.1 1 1 6.2e-09 4.3e-05 23.4 0.1 1 80 9 104 9 104 0.71 Immunoglobulin domain
"""
        found = self._raw_for('domtab_30_hmmscan_004.out',
                              'gi|126362951:116-221')
        self.assertEqual(expected, found)


class SearchIndexCases(unittest.TestCase):

def check_index(self, filename, format):
Expand Down Expand Up @@ -1359,6 +1403,34 @@ def test_hmmertab_30_hmmscan_004(self):
self.check_index(filename, self.fmt)


class HmmerDomtabIndexCases(SearchIndexCases):
    """Indexing checks for HMMER 3.0 domain table output files.

    Every test passes one file under ``Hmmer`` together with its format
    name to the ``check_index`` method inherited from SearchIndexCases.
    """

    def test_hmmerdomtab_30_hmmscan_001(self):
        """Test hmmscan-domtab indexing, HMMER 3.0, multiple queries"""
        self.check_index(
            os.path.join('Hmmer', 'domtab_30_hmmscan_001.out'),
            'hmmscan-domtab')

    def test_hmmerdomtab_30_hmmscan_002(self):
        """Test hmmscan-domtab indexing, HMMER 3.0, single query, no hits"""
        self.check_index(
            os.path.join('Hmmer', 'domtab_30_hmmscan_002.out'),
            'hmmscan-domtab')

    def test_hmmerdomtab_30_hmmscan_003(self):
        """Test hmmscan-domtab indexing, HMMER 3.0, single query, multiple hits"""
        self.check_index(
            os.path.join('Hmmer', 'domtab_30_hmmscan_003.out'),
            'hmmscan-domtab')

    def test_hmmerdomtab_30_hmmscan_004(self):
        """Test hmmscan-domtab indexing, HMMER 3.0, single query, no alignments"""
        self.check_index(
            os.path.join('Hmmer', 'domtab_30_hmmscan_004.out'),
            'hmmscan-domtab')

    def test_hmmerdomtab_30_hmmsearch_001(self):
        """Test hmmsearch-domtab indexing, HMMER 3.0, single query, no alignments"""
        # The only case here that uses the hmmsearch flavour of the format.
        self.check_index(
            os.path.join('Hmmer', 'domtab_30_hmmsearch_001.out'),
            'hmmsearch-domtab')


if __name__ == "__main__":
    # Run this module's tests through a text runner at verbosity level 2.
    unittest.main(testRunner=unittest.TextTestRunner(verbosity=2))

0 comments on commit 4e7aefd

Please sign in to comment.