Merge pull request #63 from cggh/issue_55

resolve #55
cggh · Feb 1, 2016 · 09ff6a3 · 09ff6a3
2 parents 65306d3 + ede8947
commit 09ff6a3
Show file tree

Hide file tree

Showing 7 changed files with 88 additions and 22 deletions.
diff --git a/allel/chunked/core.py b/allel/chunked/core.py
@@ -357,29 +357,41 @@ def take_table(tbl, indices, blen=None, storage=None, create='table',
                           create=create, **kwargs)
 
 
-def subset(data, sel0, sel1, blen=None, storage=None, create='array',
+def subset(data, sel0=None, sel1=None, blen=None, storage=None, create='array',
            **kwargs):
     """Return selected rows and columns of an array."""
 
     # setup
     storage = _util.get_storage(storage)
     blen = _util.get_blen_array(data, blen)
     length = len(data)
-    sel0 = np.asanyarray(sel0)
-    sel1 = np.asanyarray(sel1)
+    if sel0 is not None:
+        sel0 = np.asanyarray(sel0)
+    if sel1 is not None:
+        sel1 = np.asanyarray(sel1)
 
     # ensure boolean array for dim 0
-    if sel0.shape[0] < length:
+    if sel0 is not None and sel0.dtype.kind != 'b':
         # assume indices, convert to boolean condition
         tmp = np.zeros(length, dtype=bool)
         tmp[sel0] = True
         sel0 = tmp
 
     # ensure indices for dim 1
-    if sel1.shape[0] == data.shape[1]:
+    if sel1 is not None and sel1.dtype.kind == 'b':
         # assume boolean condition, convert to indices
         sel1 = np.nonzero(sel1)[0]
 
+    # shortcuts
+    if sel0 is None and sel1 is None:
+        return copy(data, blen=blen, storage=storage, create=create, **kwargs)
+    elif sel1 is None:
+        return compress(data, sel0, axis=0, blen=blen, storage=storage,
+                        create=create, **kwargs)
+    elif sel0 is None:
+        return take(data, sel1, axis=1, blen=blen, storage=storage,
+                    create=create, **kwargs)
+
     # build output
     sel0_nnz = count_nonzero(sel0)
     out = None

diff --git a/allel/model/bcolz.py b/allel/model/bcolz.py
@@ -330,19 +330,27 @@ def carray_block_subset(carr, sel0, sel1, blen=None, **kwargs):
         blen = carr.chunklen
 
     # check inputs
-    sel0 = asarray_ndim(sel0, 1)
-    sel1 = asarray_ndim(sel1, 1)
+    sel0 = asarray_ndim(sel0, 1, allow_none=True)
+    sel1 = asarray_ndim(sel1, 1, allow_none=True)
 
     # ensure boolean array for dim 0
-    if sel0.size < carr.shape[0]:
+    if sel0 is not None and sel0.dtype.kind != 'b':
         tmp = np.zeros((carr.shape[0],), dtype=bool)
         tmp[sel0] = True
         sel0 = tmp
 
     # ensure indices for dim 1
-    if sel1.size == carr.shape[1]:
+    if sel1 is not None and sel1.dtype.kind == 'b':
         sel1 = np.nonzero(sel1)[0]
 
+    # shortcuts
+    if sel0 is None and sel1 is None:
+        return carr.copy(**kwargs)
+    elif sel1 is None:
+        return carray_block_compress(carr, sel0, axis=0, blen=blen, **kwargs)
+    elif sel0 is None:
+        return carray_block_take(carr, sel1, axis=1, blen=blen, **kwargs)
+
     # setup output
     kwargs.setdefault('dtype', carr.dtype)
     kwargs.setdefault('expectedlen', np.count_nonzero(sel0))
@@ -1162,7 +1170,7 @@ def fill_masked(self, value=-1, mask=None, copy=True, **kwargs):
 
         return GenotypeCArray(out, copy=False)
 
-    def subset(self, sel0, sel1, **kwargs):
+    def subset(self, sel0=None, sel1=None, **kwargs):
         carr = carray_block_subset(self.carr, sel0, sel1, **kwargs)
         g = GenotypeCArray(carr, copy=False)
         if self.mask is not None:
@@ -1476,7 +1484,7 @@ def n_haplotypes(self):
         """Number of haplotypes (length of second array dimension)."""
         return self.carr.shape[1]
 
-    def subset(self, sel0, sel1, **kwargs):
+    def subset(self, sel0=None, sel1=None, **kwargs):
         data = carray_block_subset(self.carr, sel0, sel1, **kwargs)
         return HaplotypeCArray(data, copy=False)
 

diff --git a/allel/model/chunked.py b/allel/model/chunked.py
@@ -168,7 +168,7 @@ def take(self, indices, axis=0, **storage_kwargs):
             out.mask = self.mask.take(indices, axis=axis, **storage_kwargs)
         return out
 
-    def subset(self, sel0, sel1, **storage_kwargs):
+    def subset(self, sel0=None, sel1=None, **storage_kwargs):
         out = super(GenotypeChunkedArray, self).subset(sel0, sel1,
                                                        **storage_kwargs)
         if self.mask is not None:
@@ -480,6 +480,11 @@ def f(block, bmapping):
         out = _chunked.apply(domain, f, **storage_kwargs)
         return HaplotypeChunkedArray(out)
 
+    def subset(self, sel0=None, sel1=None, **storage_kwargs):
+        out = super(HaplotypeChunkedArray, self).subset(sel0, sel1,
+                                                        **storage_kwargs)
+        return out
+
 
 # copy docstrings
 copy_method_doc(HaplotypeChunkedArray.to_genotypes,
@@ -490,6 +495,8 @@ def f(block, bmapping):
                 _ndarray.HaplotypeArray.count_alleles_subpops)
 copy_method_doc(HaplotypeChunkedArray.map_alleles,
                 _ndarray.HaplotypeArray.map_alleles)
+copy_method_doc(HaplotypeChunkedArray.subset,
+                _ndarray.HaplotypeArray.subset)
 
 
 class AlleleCountsChunkedArray(_chunked.ChunkedArray):

diff --git a/allel/model/dask.py b/allel/model/dask.py
@@ -129,8 +129,15 @@ def take(self, indices, axis=None):
             out = view_subclass(out, type(self))
         return out
 
-    def subset(self, sel0, sel1):
-        out = self[sel0][:, sel1]
+    def subset(self, sel0=None, sel1=None):
+        if sel0 is None and sel1 is None:
+            out = self
+        elif sel1 is None:
+            out = self[sel0]
+        elif sel0 is None:
+            out = self[:, sel1]
+        else:
+            out = self[sel0][:, sel1]
         return view_subclass(out, type(self))
 
     def hstack(self, *others, **kwargs):

diff --git a/allel/model/ndarray.py b/allel/model/ndarray.py
@@ -41,17 +41,24 @@ def subset(data, sel0, sel1):
     data = np.asarray(data)
     if data.ndim < 2:
         raise ValueError('data must have 2 or more dimensions')
-    sel0 = asarray_ndim(sel0, 1)
-    sel1 = asarray_ndim(sel1, 1)
+    sel0 = asarray_ndim(sel0, 1, allow_none=True)
+    sel1 = asarray_ndim(sel1, 1, allow_none=True)
 
     # ensure indices
-    if sel0.size == data.shape[0]:
+    if sel0 is not None and sel0.dtype.kind == 'b':
         sel0 = np.nonzero(sel0)[0]
-    if sel1.size == data.shape[1]:
+    if sel1 is not None and sel1.dtype.kind == 'b':
         sel1 = np.nonzero(sel1)[0]
 
     # ensure leading dimension indices can be broadcast correctly
-    sel0 = sel0[:, None]
+    if sel0 is not None and sel1 is not None:
+        sel0 = sel0[:, None]
+
+    # deal with None arguments
+    if sel0 is None:
+        sel0 = slice(None)
+    if sel1 is None:
+        sel1 = slice(None)
 
     return data[sel0, sel1]
 
@@ -519,7 +526,7 @@ def fill_masked(self, value=-1, mask=None, copy=True):
 
         return a.view(GenotypeArray)
 
-    def subset(self, sel0, sel1):
+    def subset(self, sel0=None, sel1=None):
         """Make a sub-selection of variants and samples.
 
         Parameters
@@ -549,6 +556,10 @@ def subset(self, sel0, sel1):
          [[0 1]
           [1 2]]]
 
+        See Also
+        --------
+        GenotypeArray.take, GenotypeArray.compress.
+
         """
 
         data = subset(self, sel0, sel1)
@@ -1858,7 +1869,7 @@ def n_haplotypes(self):
         """Number of haplotypes (length of second dimension)."""
         return self.shape[1]
 
-    def subset(self, sel0, sel1):
+    def subset(self, sel0=None, sel1=None):
         """Make a sub-selection of variants and haplotypes.
 
         Parameters
@@ -1874,6 +1885,10 @@ def subset(self, sel0, sel1):
 
         out : HaplotypeArray
 
+        See Also
+        --------
+        HaplotypeArray.take, HaplotypeArray.compress.
+
         """
 
         return HaplotypeArray(subset(self, sel0, sel1), copy=False)

diff --git a/allel/test/test_model_api.py b/allel/test/test_model_api.py
@@ -272,6 +272,20 @@ def test_subset(self):
             .take(sel1, axis=1)
         aeq(expect, s)
 
+        # check argument type inference
+        sel0 = list(range(g.shape[0]))
+        sel1 = None
+        s = g.subset(sel0, sel1)
+        expect = np.array(diploid_genotype_data)
+        aeq(expect, s)
+
+        # check argument type inference
+        sel0 = None
+        sel1 = list(range(g.shape[1]))
+        s = g.subset(sel0, sel1)
+        expect = np.array(diploid_genotype_data)
+        aeq(expect, s)
+
     # genotype counting methods
     ###########################
 
@@ -757,7 +771,6 @@ def test_count_alleles(self):
         eq(3, actual.n_alleles)
 
         # polyploid
-        print('test polyploid')
         g = self.setup_instance(triploid_genotype_data)
         expect = np.array([[5, 1, 0],
                            [1, 5, 0],

diff --git a/docs/release.rst b/docs/release.rst
@@ -1,6 +1,10 @@
 Release notes
 =============
 
+* The `subset` method on genotype and haplotype arrays now infers argument
+  types from dtype, and treats a None argument as 'select all' on that axis
+  (`#55 <https://github.com/cggh/scikit-allel/issues/55>`_).
+
 v0.20.2
 -------