Several changes to how binning works in interval_index_file.
First, the maximum has been increased (nominally to 4096*1024*1024, which is bigger than
the signed 32-bit integer we actually use to store positions, so MAX is effectively 2**31 - 1).

Second, the number of levels of bins to use is determined from the max size passed
when creating the index. Previously, even for small sequences (e.g. contigs) the
smallest possible index still had to include space for all of the high level bins
(512 + 64 + 8 + 1 bins!). Now a small contig (under 128KB) gets exactly one bin. Not
only does this save a ton of space, it also makes finding the bin for an interval much
faster.
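
As a quick illustration (a standalone sketch, not part of the diff; it simply restates
the constants and level-selection logic added below in offsets_for_max_size), the number
of bin offsets an index ends up with follows directly from the sequence size:

    # Sketch of the new level selection, mirroring this commit's constants
    BIN_OFFSETS = [ 4096 + 512 + 64 + 8 + 1, 512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0 ]
    BIN_FIRST_SHIFT = 17   # finest bins span 2**17 = 128KB
    BIN_NEXT_SHIFT = 3     # each coarser level spans 8x more

    # Largest position each level (finest to coarsest) can contain
    level_maxes = [ 1 << ( BIN_FIRST_SHIFT + BIN_NEXT_SHIFT * i )
                    for i in range( len( BIN_OFFSETS ) ) ]

    def offsets_for_max_size( max_size ):
        for i, level_max in enumerate( level_maxes ):
            if max_size < level_max:
                return BIN_OFFSETS[ len( BIN_OFFSETS ) - i - 1 : ]
        raise Exception( "%d is larger than the maximum possible size (%d)" % ( max_size, level_maxes[-1] ) )

    print offsets_for_max_size( 100000 )             # [0] -- a sub-128KB contig gets exactly one bin
    print offsets_for_max_size( 512*1024*1024 - 1 )  # [585, 73, 9, 1, 0] -- the old fixed scheme
    print offsets_for_max_size( 2**31 - 1 )          # [4681, 585, 73, 9, 1, 0] -- the new maximum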

The version number has been incremented; files with version < 2 will always be read
using the old binning scheme.

Impact: it is now tractable to index species such as:

 possum: extremely large chromosomes were not supported before; now an
         additional level of bins is used when creating indexes over large regions.

 platypus: the highly fragmented assembly with many small contigs was taking a
           prohibitively long time to index and wasting a ton of space; it can
           now be indexed rapidly and compactly, e.g.:

		 2906252445 2007-07-11 11:02 ornAna1.maf
		   36021451 2007-07-11 13:22 ornAna1.maf.index
jxtx committed Jul 11, 2007
1 parent e8cbfab commit cc69fcc
Showing 3 changed files with 100 additions and 15 deletions.
60 changes: 46 additions & 14 deletions lib/bx/interval_index_file.py
@@ -101,21 +101,44 @@
__all__ = [ 'Indexes', 'Index' ]

MAGIC=0x2cff800a
VERSION=1
VERSION=2

MIN=0
MAX=512*1024*1024
OLD_MAX=512*1024*1024
DEFAULT_MAX=512*1024*1024
MAX=2147483647

BIN_OFFSETS = [ 512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0 ]
BIN_OFFSETS = [ 4096 + 512 + 64 + 8 + 1, 512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0 ]
BIN_FIRST_SHIFT = 17
BIN_NEXT_SHIFT = 3

def bin_for_range( start, end):
# Size for each level
size = ( 1 << BIN_FIRST_SHIFT )
BIN_OFFSETS_MAX = [ size ]
for i in range( len( BIN_OFFSETS ) - 1 ):
size <<= BIN_NEXT_SHIFT
BIN_OFFSETS_MAX.insert( 0, size )
del size

def offsets_for_max_size( max_size ):
"""
Return the subset of offsets needed to contain intervals over (0,max_size)
"""
for i, max in enumerate( reversed( BIN_OFFSETS_MAX ) ):
if max_size < max:
break
else:
raise Exception( "%d is larger than the maximum possible size (%d)" % ( max_size, BIN_OFFSETS_MAX[0] ) )
return BIN_OFFSETS[ ( len(BIN_OFFSETS) - i - 1 ) : ]

def bin_for_range( start, end, offsets=None ):
"""Find the smallest bin that can contain interval (start,end)"""
if offsets is None:
offsets = BIN_OFFSETS
start_bin, end_bin = start, end - 1
start_bin >>= BIN_FIRST_SHIFT
end_bin >>= BIN_FIRST_SHIFT
for offset in BIN_OFFSETS:
for offset in offsets:
if start_bin == end_bin:
return offset + start_bin
else:
@@ -233,15 +256,15 @@ def __init__( self, filename=None ):
self.indexes = dict()
if filename is not None: self.open( filename )

def add( self, name, start, end, val ):
def add( self, name, start, end, val, max=DEFAULT_MAX ):
if name not in self.indexes:
self.indexes[name] = Index()
self.indexes[name] = Index( max=max )
self.indexes[name].add( start, end, val )

def get( self, name ):
if self.indexes[name] is None:
offset, value_size = self.offsets[name]
self.indexes[name] = Index( filename=self.filename, offset=offset, value_size=value_size )
self.indexes[name] = Index( filename=self.filename, offset=offset, value_size=value_size, version=self.version )
return self.indexes[name]

def find( self, name, start, end ):
@@ -259,6 +282,7 @@ def open( self, filename ):
raise "File does not have expected header"
if version > VERSION:
warn( "File claims version %d, I don't known anything about versions beyond %d. Attempting to continue", version, VERSION )
self.version = version
for i in range( length ):
key_len = read_packed( f, ">I" )
key = f.read( key_len )
@@ -301,13 +325,13 @@ def write( self, f ):

class Index:

def __init__( self, min=MIN, max=MAX, filename=None, offset=0, value_size=None ):
def __init__( self, min=MIN, max=DEFAULT_MAX, filename=None, offset=0, value_size=None, version=None ):
self._value_size = value_size
self.max_val = 1 # (1, rather than 0, to force value_size > 0)
if filename is None:
self.new( min, max )
else:
self.open( filename, offset )
self.open( filename, offset, version )

def get_value_size ( self ):
if self._value_size != None:
@@ -322,12 +346,14 @@ def new( self, min, max ):
assert MIN <= min <= max <= MAX
self.min = min
self.max = max
# Determine offsets to use
self.offsets = offsets_for_max_size( max )
# Determine the largest bin we will actually use
self.bin_count = bin_for_range( max - 1, max ) + 1
self.bin_count = bin_for_range( max - 1, max, offsets = self.offsets ) + 1
# Create empty bins
self.bins = [ [] for i in range( self.bin_count ) ]

def open( self, filename, offset ):
def open( self, filename, offset, version ):
self.filename = filename
self.offset = offset
# Open the file and seek to where we expect our header
@@ -336,6 +362,12 @@ def open( self, filename, offset ):
# Read min/max
min, max = read_packed( f, ">2I" )
self.new( min, max )
# Decide how many levels of bins based on 'max'
if version < 2:
# Prior to version 2 all files used the bins for 512MB
self.offsets = offsets_for_max_size( OLD_MAX - 1 )
else:
self.offsets = offsets_for_max_size( max )
# Read bin indexes
self.bin_offsets = []
self.bin_sizes = []
@@ -348,15 +380,15 @@

def add( self, start, end, val ):
"""Add the interval (start,end) with associated value val to the index"""
insort( self.bins[ bin_for_range( start, end ) ], ( start, end, val ) )
insort( self.bins[ bin_for_range( start, end, offsets=self.offsets ) ], ( start, end, val ) )
assert val >= 0
self.max_val = max(self.max_val,val)

def find( self, start, end ):
rval = []
start_bin = ( max( start, self.min ) ) >> BIN_FIRST_SHIFT
end_bin = ( min( end, self.max ) - 1 ) >> BIN_FIRST_SHIFT
for offset in BIN_OFFSETS:
for offset in self.offsets:
for i in range( start_bin + offset, end_bin + offset + 1 ):
if self.bins[i] is None: self.load_bin( i )
# Iterate over bin and insert any overlapping elements into return value
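To make the bin arithmetic above concrete, a worked example with hypothetical numbers
(not taken from the commit): a 1MB sequence gets offsets [1, 0], i.e. eight 128KB bins
plus one spanning bin, nine in total, versus 593 under the old fixed scheme.

    # offsets_for_max_size( 1000000 )  ->  [1, 0]   (two levels only)
    # bin_for_range( 200000, 250000, offsets=[1, 0] )  ->  1 + 1 = 2
    #     both endpoints fall inside 128KB bin 1, so the finest level is used
    # bin_for_range( 200000, 300000, offsets=[1, 0] )  ->  0 + 0 = 0
    #     the endpoints straddle 128KB bins 1 and 2, so it falls through to the spanning bin
    # bin_count = bin_for_range( 999999, 1000000, offsets=[1, 0] ) + 1  ->  (1 + 7) + 1 = 9
    #     (the same sequence needed 585 + 7 + 1 = 593 bins with the old offsets)
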
53 changes: 53 additions & 0 deletions lib/bx/interval_index_file_tests.py
@@ -0,0 +1,53 @@
import interval_index_file
from interval_index_file import Indexes
from tempfile import mktemp
import random

def test():
ix = Indexes()
chrs = []
for i in range( 5 ):
intervals = []
name = "seq%d" % i
max = random.randint( 0, interval_index_file.MAX )
print name, "size", max
for i in range( 500 ):
start = random.randint( 0, max )
end = random.randint( 0, max )
if end < start:
end, start = start, end
ix.add( name, start, end, i )
intervals.append( ( start, end, i ) )
chrs.append( intervals )
fname = mktemp()
f = open( fname, "w" )
ix.write( f )
f.close()
del ix

ix = Indexes( fname )
for i in range( 5 ):
intervals = chrs[i]
name = "seq%d" % i
for i in range( 100 ):
start = random.randint( 0, max )
end = random.randint( 0, max )
if end < start:
end, start = start, end
query_intervals = set()
for ( s, e, i ) in intervals:
if e > start and s < end:
query_intervals.add( ( s, e, i ) )
result = ix.find( name, start, end )
for inter in result:
assert inter in query_intervals










2 changes: 1 addition & 1 deletion scripts/maf_build_index.py
@@ -74,7 +74,7 @@ def main():
for c in block.components:
if species is not None and c.src.split('.')[0] not in species:
continue
indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos )
indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size )

out = open( index_file, 'w' )
indexes.write( out )
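For context, a usage sketch of the new per-sequence max parameter (a hedged example,
assuming the import path bx.interval_index_file; file and sequence names are purely
illustrative):

    from bx.interval_index_file import Indexes

    indexes = Indexes()
    # Passing the sequence length as `max` lets this sequence's index use only as many
    # bin levels as it needs -- a single bin for a contig under 128KB
    indexes.add( "ornAna1.Contig12345", 100, 900, 0, max=75000 )
    out = open( "example.maf.index", "w" )
    indexes.write( out )
    out.close()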
