Skip to content

Commit

Permalink
Merge pull request #63 from cggh/issue_55
Browse files Browse the repository at this point in the history
resolve #55
  • Loading branch information
alimanfoo committed Feb 1, 2016
2 parents 65306d3 + ede8947 commit 09ff6a3
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 22 deletions.
22 changes: 17 additions & 5 deletions allel/chunked/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,29 +357,41 @@ def take_table(tbl, indices, blen=None, storage=None, create='table',
create=create, **kwargs)


def subset(data, sel0, sel1, blen=None, storage=None, create='array',
def subset(data, sel0=None, sel1=None, blen=None, storage=None, create='array',
**kwargs):
"""Return selected rows and columns of an array."""

# setup
storage = _util.get_storage(storage)
blen = _util.get_blen_array(data, blen)
length = len(data)
sel0 = np.asanyarray(sel0)
sel1 = np.asanyarray(sel1)
if sel0 is not None:
sel0 = np.asanyarray(sel0)
if sel1 is not None:
sel1 = np.asanyarray(sel1)

# ensure boolean array for dim 0
if sel0.shape[0] < length:
if sel0 is not None and sel0.dtype.kind != 'b':
# assume indices, convert to boolean condition
tmp = np.zeros(length, dtype=bool)
tmp[sel0] = True
sel0 = tmp

# ensure indices for dim 1
if sel1.shape[0] == data.shape[1]:
if sel1 is not None and sel1.dtype.kind == 'b':
# assume boolean condition, convert to indices
sel1 = np.nonzero(sel1)[0]

# shortcuts
if sel0 is None and sel1 is None:
return copy(data, blen=blen, storage=storage, create=create, **kwargs)
elif sel1 is None:
return compress(data, sel0, axis=0, blen=blen, storage=storage,
create=create, **kwargs)
elif sel0 is None:
return take(data, sel1, axis=1, blen=blen, storage=storage,
create=create, **kwargs)

# build output
sel0_nnz = count_nonzero(sel0)
out = None
Expand Down
20 changes: 14 additions & 6 deletions allel/model/bcolz.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,19 +330,27 @@ def carray_block_subset(carr, sel0, sel1, blen=None, **kwargs):
blen = carr.chunklen

# check inputs
sel0 = asarray_ndim(sel0, 1)
sel1 = asarray_ndim(sel1, 1)
sel0 = asarray_ndim(sel0, 1, allow_none=True)
sel1 = asarray_ndim(sel1, 1, allow_none=True)

# ensure boolean array for dim 0
if sel0.size < carr.shape[0]:
if sel0 is not None and sel0.dtype.kind != 'b':
tmp = np.zeros((carr.shape[0],), dtype=bool)
tmp[sel0] = True
sel0 = tmp

# ensure indices for dim 1
if sel1.size == carr.shape[1]:
if sel1 is not None and sel1.dtype.kind == 'b':
sel1 = np.nonzero(sel1)[0]

# shortcuts
if sel0 is None and sel1 is None:
return carr.copy(**kwargs)
elif sel1 is None:
return carray_block_compress(carr, sel0, axis=0, blen=blen, **kwargs)
elif sel0 is None:
return carray_block_take(carr, sel1, axis=1, blen=blen, **kwargs)

# setup output
kwargs.setdefault('dtype', carr.dtype)
kwargs.setdefault('expectedlen', np.count_nonzero(sel0))
Expand Down Expand Up @@ -1162,7 +1170,7 @@ def fill_masked(self, value=-1, mask=None, copy=True, **kwargs):

return GenotypeCArray(out, copy=False)

def subset(self, sel0, sel1, **kwargs):
def subset(self, sel0=None, sel1=None, **kwargs):
carr = carray_block_subset(self.carr, sel0, sel1, **kwargs)
g = GenotypeCArray(carr, copy=False)
if self.mask is not None:
Expand Down Expand Up @@ -1476,7 +1484,7 @@ def n_haplotypes(self):
"""Number of haplotypes (length of second array dimension)."""
return self.carr.shape[1]

def subset(self, sel0, sel1, **kwargs):
def subset(self, sel0=None, sel1=None, **kwargs):
    # Delegate the row (sel0) / column (sel1) selection on the underlying
    # carray to carray_block_subset, forwarding any extra keyword arguments,
    # then wrap the result as a HaplotypeCArray, passing copy=False.
    selected = carray_block_subset(self.carr, sel0, sel1, **kwargs)
    wrapped = HaplotypeCArray(selected, copy=False)
    return wrapped

Expand Down
9 changes: 8 additions & 1 deletion allel/model/chunked.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def take(self, indices, axis=0, **storage_kwargs):
out.mask = self.mask.take(indices, axis=axis, **storage_kwargs)
return out

def subset(self, sel0, sel1, **storage_kwargs):
def subset(self, sel0=None, sel1=None, **storage_kwargs):
out = super(GenotypeChunkedArray, self).subset(sel0, sel1,
**storage_kwargs)
if self.mask is not None:
Expand Down Expand Up @@ -480,6 +480,11 @@ def f(block, bmapping):
out = _chunked.apply(domain, f, **storage_kwargs)
return HaplotypeChunkedArray(out)

def subset(self, sel0=None, sel1=None, **storage_kwargs):
    # Pure pass-through to the parent class implementation; the result is
    # returned unchanged. No docstring is written here because one is
    # attached to this method via copy_method_doc at module level.
    parent = super(HaplotypeChunkedArray, self)
    return parent.subset(sel0, sel1, **storage_kwargs)


# copy docstrings
copy_method_doc(HaplotypeChunkedArray.to_genotypes,
Expand All @@ -490,6 +495,8 @@ def f(block, bmapping):
_ndarray.HaplotypeArray.count_alleles_subpops)
copy_method_doc(HaplotypeChunkedArray.map_alleles,
_ndarray.HaplotypeArray.map_alleles)
copy_method_doc(HaplotypeChunkedArray.subset,
_ndarray.HaplotypeArray.subset)


class AlleleCountsChunkedArray(_chunked.ChunkedArray):
Expand Down
11 changes: 9 additions & 2 deletions allel/model/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,15 @@ def take(self, indices, axis=None):
out = view_subclass(out, type(self))
return out

def subset(self, sel0, sel1):
out = self[sel0][:, sel1]
def subset(self, sel0=None, sel1=None):
    """Select along the first and/or second dimension.

    `sel0` is applied as an index on the first dimension and `sel1` on
    the second. A value of None leaves that dimension unselected, so
    calling with both None applies no indexing at all. The result is
    always passed through `view_subclass` with ``type(self)``.
    """
    out = self
    # Apply each selection independently; when both are given this is
    # the same two-step indexing as self[sel0][:, sel1].
    if sel0 is not None:
        out = out[sel0]
    if sel1 is not None:
        out = out[:, sel1]
    return view_subclass(out, type(self))

def hstack(self, *others, **kwargs):
Expand Down
29 changes: 22 additions & 7 deletions allel/model/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,24 @@ def subset(data, sel0, sel1):
data = np.asarray(data)
if data.ndim < 2:
raise ValueError('data must have 2 or more dimensions')
sel0 = asarray_ndim(sel0, 1)
sel1 = asarray_ndim(sel1, 1)
sel0 = asarray_ndim(sel0, 1, allow_none=True)
sel1 = asarray_ndim(sel1, 1, allow_none=True)

# ensure indices
if sel0.size == data.shape[0]:
if sel0 is not None and sel0.dtype.kind == 'b':
sel0 = np.nonzero(sel0)[0]
if sel1.size == data.shape[1]:
if sel1 is not None and sel1.dtype.kind == 'b':
sel1 = np.nonzero(sel1)[0]

# ensure leading dimension indices can be broadcast correctly
sel0 = sel0[:, None]
if sel0 is not None and sel1 is not None:
sel0 = sel0[:, None]

# deal with None arguments
if sel0 is None:
sel0 = slice(None)
if sel1 is None:
sel1 = slice(None)

return data[sel0, sel1]

Expand Down Expand Up @@ -519,7 +526,7 @@ def fill_masked(self, value=-1, mask=None, copy=True):

return a.view(GenotypeArray)

def subset(self, sel0, sel1):
def subset(self, sel0=None, sel1=None):
"""Make a sub-selection of variants and samples.
Parameters
Expand Down Expand Up @@ -549,6 +556,10 @@ def subset(self, sel0, sel1):
[[0 1]
[1 2]]]
See Also
--------
GenotypeArray.take, GenotypeArray.compress.
"""

data = subset(self, sel0, sel1)
Expand Down Expand Up @@ -1858,7 +1869,7 @@ def n_haplotypes(self):
"""Number of haplotypes (length of second dimension)."""
return self.shape[1]

def subset(self, sel0, sel1):
def subset(self, sel0=None, sel1=None):
"""Make a sub-selection of variants and haplotypes.
Parameters
Expand All @@ -1874,6 +1885,10 @@ def subset(self, sel0, sel1):
out : HaplotypeArray
See Also
--------
HaplotypeArray.take, HaplotypeArray.compress.
"""

return HaplotypeArray(subset(self, sel0, sel1), copy=False)
Expand Down
15 changes: 14 additions & 1 deletion allel/test/test_model_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,20 @@ def test_subset(self):
.take(sel1, axis=1)
aeq(expect, s)

# check argument type inference
sel0 = list(range(g.shape[0]))
sel1 = None
s = g.subset(sel0, sel1)
expect = np.array(diploid_genotype_data)
aeq(expect, s)

# check argument type inference
sel0 = None
sel1 = list(range(g.shape[1]))
s = g.subset(sel0, sel1)
expect = np.array(diploid_genotype_data)
aeq(expect, s)

# genotype counting methods
###########################

Expand Down Expand Up @@ -757,7 +771,6 @@ def test_count_alleles(self):
eq(3, actual.n_alleles)

# polyploid
print('test polyploid')
g = self.setup_instance(triploid_genotype_data)
expect = np.array([[5, 1, 0],
[1, 5, 0],
Expand Down
4 changes: 4 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Release notes
=============

* Changed behaviour of `subset` method on genotype and haplotype arrays to
better infer argument types and handle None argument values
(`#55 <https://github.com/cggh/scikit-allel/issues/55>`_).

v0.20.2
-------

Expand Down

0 comments on commit 09ff6a3

Please sign in to comment.