Skip to content

Commit

Permalink
AlleleCounts map missing alleles (#241)
Browse files Browse the repository at this point in the history
* test and correct reference implementation for allele counts map alleles

* optimise allele counts map alleles

* fix bugs

* release notes [ci skip]
  • Loading branch information
alimanfoo committed Dec 21, 2018
1 parent 481479e commit 3bff7d7
Show file tree
Hide file tree
Showing 8 changed files with 5,586 additions and 1,075 deletions.
8 changes: 6 additions & 2 deletions allel/model/chunked.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,9 +605,13 @@ def count_doubleton(self, allele=1, **kwargs):
return self._count('is_doubleton', kwargs=dict(allele=allele),
**kwargs)

def map_alleles(self, mapping, **kwargs):
def map_alleles(self, mapping, max_allele=None, **kwargs):
if max_allele is None:
max_allele = np.max(mapping)

def f(block, bmapping):
return block.map_alleles(bmapping)
return block.map_alleles(bmapping, max_allele=max_allele)

domain = (self, mapping)
out = _chunked.map_blocks(domain, f, **kwargs)
return AlleleCountsChunkedArray(out)
Expand Down
14 changes: 9 additions & 5 deletions allel/model/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -770,15 +770,19 @@ def count_singleton(self, allele=1):
def count_doubleton(self, allele=1):
return self._count('is_doubleton', allele=allele)

def map_alleles(self, mapping):

def f(block, bmapping):
ac = AlleleCountsArray(block)
return ac.map_alleles(bmapping)
def map_alleles(self, mapping, max_allele=None):

# obtain dask array
mapping = da.from_array(mapping, chunks=(self.chunks[0], None))

# determine output shape
if max_allele is None:
max_allele = mapping.max().compute()

def f(block, bmapping):
ac = AlleleCountsArray(block)
return ac.map_alleles(bmapping, max_allele=max_allele)

# map blocks
out = da.map_blocks(f, self.values, mapping, dtype=mapping.dtype)
return AlleleCountsDaskArray(out)
Expand Down
24 changes: 13 additions & 11 deletions allel/model/ndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

# internal imports
from allel.util import check_integer_dtype, check_shape, check_dtype, ignore_invalid, \
check_dim0_aligned, check_ploidy, check_ndim, asarray_ndim
check_dim0_aligned, check_ploidy, check_ndim, asarray_ndim, check_dim1_aligned
from allel.compat import PY2, copy_method_doc, integer_types, memoryview_safe
from allel.io.vcf_write import write_vcf
from allel.io.gff import gff3_to_recarray
Expand All @@ -20,7 +20,7 @@
genotype_array_count_alleles, genotype_array_count_alleles_masked, \
genotype_array_count_alleles_subpop, genotype_array_count_alleles_subpop_masked, \
haplotype_array_count_alleles, haplotype_array_count_alleles_subpop, \
haplotype_array_map_alleles
haplotype_array_map_alleles, allele_counts_array_map_alleles
from .generic import index_genotype_vector, compress_genotypes, \
take_genotypes, concatenate_genotypes, index_genotype_array, subset_genotype_array, \
index_haplotype_array, compress_haplotype_array, take_haplotype_array, \
Expand Down Expand Up @@ -2968,13 +2968,16 @@ def count_singleton(self, allele=1):
def count_doubleton(self, allele=1):
return np.sum(self.is_doubleton(allele=allele))

def map_alleles(self, mapping):
def map_alleles(self, mapping, max_allele=None):
"""Transform alleles via a mapping.
Parameters
----------
mapping : ndarray, int8, shape (n_variants, max_allele)
An array defining the allele mapping for each variant.
max_allele : int, optional
Highest allele index expected in the output. If not provided
will be determined from maximum value in `mapping`.
Returns
-------
Expand All @@ -3000,7 +3003,7 @@ def map_alleles(self, mapping):
... [2, 1, 0],
... [1, 2, 0]]
>>> ac.map_alleles(mapping)
<AlleleCountsArray shape=(4, 3) dtype=int64>
<AlleleCountsArray shape=(4, 3) dtype=int32>
0 4 0
1 3 0
1 2 1
Expand All @@ -3012,16 +3015,15 @@ def map_alleles(self, mapping):
"""

mapping = asarray_ndim(mapping, 2)
# ensure correct dimensionality and matching dtype
mapping = asarray_ndim(mapping, 2, dtype=self.dtype)
check_dim0_aligned(self, mapping)
check_dim1_aligned(self, mapping)

# setup output array
out = np.empty_like(mapping)

# apply transformation
i = np.arange(self.shape[0]).reshape((-1, 1))
out[i, mapping] = self
# use optimisation
out = allele_counts_array_map_alleles(self.values, mapping, max_allele)

# wrap and return
return type(self)(out)


Expand Down

0 comments on commit 3bff7d7

Please sign in to comment.