Permalink
Browse files

Support BGZF compressed files in Bio.SearchIO.index and index_db

Currently only tested with BLAST XML via a doctest; validation against
the full test suite is still needed.
  • Loading branch information...
1 parent 600b231 commit cf063bf6a2dca4d534d00699310548e43bf2e14f @peterjc peterjc committed Dec 3, 2012
Showing with 25 additions and 2 deletions.
  1. +16 −0 Bio/SearchIO/__init__.py
  2. +9 −2 Bio/SearchIO/_index.py
View
16 Bio/SearchIO/__init__.py
@@ -448,6 +448,16 @@ def index(filename, format=None, key_function=None, **kwargs):
>>> search_idx['gi|195230749:301-1383']
QueryResult(id='gi|195230749:301-1383', 5 hits)
+ Ordinary GZIP files are not supported, but if the file is BGZF compressed
+ this is detected automatically:
+
+ >>> from Bio import SearchIO
+ >>> search_idx = SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml')
+ >>> search_idx
+ SearchIO.index('Blast/wnts.xml.bgz', 'blast-xml', key_function=None)
+ >>> search_idx['gi|195230749:301-1383']
+ QueryResult(id='gi|195230749:301-1383', 5 hits)
+
You can supply a custom callback function to alter the default identifier
string. This function should accept as its input the QueryResult ID string
and return a modified version of it.
@@ -519,6 +529,12 @@ def index_db(index_filename, filenames=None, format=None,
>>> db_idx['33212']
QueryResult(id='33212', 44 hits)
+ Note that ':memory:' rather than an index filename tells SQLite to hold
+ the index database in memory. This is useful for quick tests, but using
+ the Bio.SearchIO.index(...) function instead would use less memory.
+
+ BGZF compressed files are supported, and detected automatically. Ordinary
+ GZIP compressed files are not supported.
"""
# cast filenames to list if it's a string
# (can we check if it's a string or a generator?)
View
11 Bio/SearchIO/_index.py
@@ -10,6 +10,7 @@
from StringIO import StringIO
from Bio._py3k import _bytes_to_string
+from Bio import bgzf
from Bio.File import _IndexedSeqFileProxy
class SearchIndexer(_IndexedSeqFileProxy):
@@ -20,8 +21,14 @@ class SearchIndexer(_IndexedSeqFileProxy):
"""
def __init__(self, filename, **kwargs):
- self._handle = open(filename, 'rb')
- self._handle.seek(0)
+ h = open(filename, 'rb')
+ try:
+ self._handle = bgzf.BgzfReader(mode="rb", fileobj=h)
+ except ValueError, e:
+ assert "BGZF" in str(e)
+ #Not a BGZF file
+ h.seek(0)
+ self._handle = h
self._kwargs = kwargs
def _parse(self, handle):

0 comments on commit cf063bf

Please sign in to comment.