Skip to content

Commit

Permalink
implement ChromPosIndex (#239)
Browse files Browse the repository at this point in the history
  • Loading branch information
alimanfoo committed Dec 21, 2018
1 parent 4a89636 commit 20b66ba
Show file tree
Hide file tree
Showing 5 changed files with 300 additions and 3 deletions.
217 changes: 216 additions & 1 deletion allel/model/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@

__all__ = ['Genotypes', 'GenotypeArray', 'GenotypeVector', 'HaplotypeArray', 'AlleleCountsArray',
'GenotypeAlleleCounts', 'GenotypeAlleleCountsArray', 'GenotypeAlleleCountsVector',
'SortedIndex', 'UniqueIndex', 'SortedMultiIndex', 'VariantTable', 'FeatureTable']
'SortedIndex', 'UniqueIndex', 'SortedMultiIndex', 'VariantTable', 'FeatureTable',
'ChromPosIndex']


# noinspection PyTypeChecker
Expand Down Expand Up @@ -4223,6 +4224,220 @@ def locate_range(self, key, start=None, stop=None):
return loc


# noinspection PyMissingConstructor
class ChromPosIndex(DisplayAs1D):
"""Two-level index of variant positions from two or more chromosomes/contigs.
Parameters
----------
chrom : array_like
Chromosome values.
pos : array_like
Position values, in ascending order within each chromosome.
copy : bool, optional
If True, inputs will be copied into new arrays.
Notes
-----
Chromosomes do not need to be sorted, but all values for a given chromosome must be
grouped together, and all positions for a given chromosome must be sorted in ascending
order.
Examples
--------
>>> import allel
>>> chrom = ['chr2', 'chr2', 'chr1', 'chr1', 'chr1', 'chr3']
>>> pos = [1, 4, 2, 5, 5, 3]
>>> idx = allel.ChromPosIndex(chrom, pos)
>>> idx
<ChromPosIndex shape=(6,), dtype=<U4/int64>
chr2:1 chr2:4 chr1:2 chr1:5 chr1:5 chr3:3
>>> len(idx)
6
See Also
--------
SortedIndex, UniqueIndex
"""

def __init__(self, chrom, pos, copy=False):
chrom = np.array(chrom, copy=copy)
pos = np.array(pos, copy=copy)
check_ndim(chrom, 1)
check_ndim(pos, 1)
check_dim0_aligned(chrom, pos)
self.chrom = chrom
self.pos = pos
# cache mapping of chromosomes to regions
self.chrom_ranges = dict()

def __repr__(self):
s = '<ChromPosIndex shape=(%s,), dtype=%s/%s>' % \
(len(self), self.chrom.dtype, self.pos.dtype)
s += '\n' + str(self)
return s

def str_items(self):
return ['%s:%s' % (x, y) for x, y in zip(self.chrom, self.pos)]

def to_str(self, threshold=10, edgeitems=5):
_, items = self.get_display_items(threshold, edgeitems)
s = ' '.join(items)
return s

def __len__(self):
return len(self.chrom)

def __getitem__(self, item):
l1 = self.chrom[item]
l2 = self.pos[item]
if isinstance(item, integer_types):
return l1, l2
else:
return ChromPosIndex(l1, l2, copy=False)

def compress(self, condition, axis=0, out=None):
if out is not None:
raise NotImplementedError('out argument not supported')
l1 = self.chrom.compress(condition, axis=axis)
l2 = self.pos.compress(condition, axis=axis)
return ChromPosIndex(l1, l2, copy=False)

def take(self, indices, axis=0, out=None, mode='raise'):
if out is not None:
raise NotImplementedError('out argument not supported')
l1 = self.chrom.take(indices, axis=axis, mode=mode)
l2 = self.pos.take(indices, axis=axis, mode=mode)
return ChromPosIndex(l1, l2, copy=False)

@property
def shape(self):
return len(self),

def locate_key(self, chrom, pos=None):
"""
Get index location for the requested key.
Parameters
----------
chrom : object
Chromosome or contig.
pos : int, optional
Position within chromosome or contig.
Returns
-------
loc : int or slice
Location of requested key (will be slice if there are duplicate
entries).
Examples
--------
>>> import allel
>>> chrom = ['chr2', 'chr2', 'chr1', 'chr1', 'chr1', 'chr3']
>>> pos = [1, 4, 2, 5, 5, 3]
>>> idx = allel.ChromPosIndex(chrom, pos)
>>> idx.locate_key('chr1')
slice(2, 5, None)
>>> idx.locate_key('chr2', 4)
1
>>> idx.locate_key('chr1', 5)
slice(3, 5, None)
>>> try:
... idx.locate_key('chr3', 4)
... except KeyError as e:
... print(e)
...
('chr3', 4)
"""

if pos is None:
# we just want the region for a chromosome
if chrom in self.chrom_ranges:
# return previously cached result
return self.chrom_ranges[chrom]
else:
loc_chrom = np.nonzero(self.chrom == chrom)[0]
if len(loc_chrom) == 0:
raise KeyError(chrom)
slice_chrom = slice(min(loc_chrom), max(loc_chrom) + 1)
# cache the result
self.chrom_ranges[chrom] = slice_chrom
return slice_chrom

else:
slice_chrom = self.locate_key(chrom)
pos_chrom = SortedIndex(self.pos[slice_chrom])
try:
idx_within_chrom = pos_chrom.locate_key(pos)
except KeyError:
raise KeyError(chrom, pos)
if isinstance(idx_within_chrom, slice):
return slice(slice_chrom.start + idx_within_chrom.start,
slice_chrom.start + idx_within_chrom.stop)
else:
return slice_chrom.start + idx_within_chrom

def locate_range(self, chrom, start=None, stop=None):
"""Locate slice of index containing all entries within the range
`key`:`start`-`stop` **inclusive**.
Parameters
----------
chrom : object
Chromosome or contig.
start : int, optional
Position start value.
stop : int, optional
Position stop value.
Returns
-------
loc : slice
Slice object.
Examples
--------
>>> import allel
>>> chrom = ['chr2', 'chr2', 'chr1', 'chr1', 'chr1', 'chr3']
>>> pos = [1, 4, 2, 5, 5, 3]
>>> idx = allel.ChromPosIndex(chrom, pos)
>>> idx.locate_range('chr1')
slice(2, 5, None)
>>> idx.locate_range('chr2', 1, 4)
slice(0, 2, None)
>>> idx.locate_range('chr1', 3, 7)
slice(3, 5, None)
>>> try:
... idx.locate_range('chr3', 4, 9)
... except KeyError as e:
... print(e)
('chr3', 4, 9)
"""

slice_chrom = self.locate_key(chrom)

if start is None and stop is None:
return slice_chrom

else:

pos_chrom = SortedIndex(self.pos[slice_chrom])
try:
slice_within_chrom = pos_chrom.locate_range(start, stop)
except KeyError:
raise KeyError(chrom, start, stop)
loc = slice(slice_chrom.start + slice_within_chrom.start,
slice_chrom.start + slice_within_chrom.stop)
return loc


class VariantTable(NumpyRecArrayWrapper):
"""Table (catalogue) of variants.
Expand Down
12 changes: 10 additions & 2 deletions allel/test/model/ndarray/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@

# internal imports
from allel.test.tools import assert_array_equal as aeq
from allel import SortedIndex, UniqueIndex, SortedMultiIndex
from allel import SortedIndex, UniqueIndex, SortedMultiIndex, ChromPosIndex
from allel.test.model.test_api import SortedIndexInterface, UniqueIndexInterface, \
SortedMultiIndexInterface
SortedMultiIndexInterface, ChromPosIndexInterface


# noinspection PyMethodMayBeStatic
Expand Down Expand Up @@ -153,3 +153,11 @@ def setup_instance(self, chrom, pos):
return SortedMultiIndex(chrom, pos)

_class = SortedMultiIndex


class ChromPosIndexTests(ChromPosIndexInterface, unittest.TestCase):

def setup_instance(self, chrom, pos):
return ChromPosIndex(chrom, pos)

_class = ChromPosIndex
60 changes: 60 additions & 0 deletions allel/test/model/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2277,6 +2277,66 @@ def test_locate_range(self):
f(3, 2, 4)


class ChromPosIndexInterface(object):

_class = None

def setup_instance(self, chrom, pos):
pass

def test_properties(self):
chrom = np.array([b'4', b'4', b'2', b'2', b'2', b'6'])
pos = np.array([1, 5, 3, 7, 7, 2])
idx = self.setup_instance(chrom, pos)
assert 6 == len(idx)

def test_locate_key(self):
chrom = np.array([b'4', b'4', b'2', b'2', b'2', b'6'])
pos = np.array([1, 5, 3, 7, 7, 2])
idx = self.setup_instance(chrom, pos)
f = idx.locate_key
assert slice(0, 2) == f(b'4')
assert slice(2, 5) == f(b'2')
assert slice(5, 6) == f(b'6')
assert 0 == f(b'4', 1)
assert 2 == f(b'2', 3)
assert 5 == f(b'6', 2)
assert slice(3, 5) == f(b'2', 7)
with pytest.raises(KeyError):
f(b'X')
with pytest.raises(KeyError):
f(b'4', 4)
with pytest.raises(KeyError):
f(b'2', 5)
with pytest.raises(KeyError):
f(b'6', 5)

def test_locate_range(self):
chrom = np.array([b'4', b'4', b'2', b'2', b'2', b'6'])
pos = np.array([1, 5, 3, 7, 7, 2])
idx = self.setup_instance(chrom, pos)
f = idx.locate_range

assert slice(0, 2) == f(b'4')
assert slice(2, 5) == f(b'2')
assert slice(5, 6) == f(b'6')
assert slice(0, 2) == f(b'4', 1, 5)
assert slice(0, 1) == f(b'4', 1, 3)
assert slice(2, 5) == f(b'2', 1, 9)
assert slice(2, 3) == f(b'2', 1, 6)
assert slice(5, 6) == f(b'6', 1, 6)
with pytest.raises(KeyError):
f(b'X')
with pytest.raises(KeyError):
f(b'4', 17, 19)
with pytest.raises(KeyError):
f(b'2', 1, 2)
with pytest.raises(KeyError):
f(b'2', 9, 12)
with pytest.raises(KeyError):
f(b'6', 3, 6)


class VariantTableInterface(object):

_class = None
Expand Down
8 changes: 8 additions & 0 deletions docs/model/ndarray.rst
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,14 @@ SortedIndex
.. automethod:: intersect_ranges


ChromPosIndex
-------------

.. autoclass:: allel.ChromPosIndex

.. automethod:: locate_key
.. automethod:: locate_range

SortedMultiIndex
----------------

Expand Down
6 changes: 6 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@ v1.2.0 (work in progress)
variants are returned. By :user:`Alistair Miles <alimanfoo>`,
:issue:`221`, :issue:`167`, :issue:`213`.

* Added a new index class :class:`allel.ChromPosIndex` for locating
data given chromosome and positio locations. This behaves similarly
to the existing :class:`allel.SortedMultiIndex` but the chromosome
values do not need to be sorted. By :user:`Alistair Miles
<alimanfoo>`, :issue:`201`, :issue:`239`.

* Added new parameters ``exclude_fields`` and ``rename_fields`` to VCF
parsing functions to add greater flexibility when selecting fields
to extract. Also added several measures to protect against name
Expand Down

0 comments on commit 20b66ba

Please sign in to comment.