Skip to content

Commit

Permalink
allele mapping
Browse files Browse the repository at this point in the history
  • Loading branch information
alimanfoo committed Mar 10, 2015
1 parent cdd3e5f commit 0f29e51
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 30 deletions.
33 changes: 27 additions & 6 deletions allel/bcolz.py
Original file line number Diff line number Diff line change
Expand Up @@ -1106,9 +1106,7 @@ def f(block):
def map_alleles(self, mapping, **kwargs):

# check inputs
if self.dtype.type != np.int8:
raise NotImplementedError('only implemented for dtype int8')
mapping = asarray_ndim(mapping, 2, dtype='i1')
mapping = asarray_ndim(mapping, 2)
check_dim0_aligned(self, mapping)

# setup output
Expand Down Expand Up @@ -1310,9 +1308,7 @@ def count_alleles_subpops(self, subpops, max_allele=None, **kwargs):
def map_alleles(self, mapping, **kwargs):

# check inputs
if self.dtype.type != np.int8:
raise NotImplementedError('only implemented for dtype int8')
mapping = asarray_ndim(mapping, 2, dtype='i1')
mapping = asarray_ndim(mapping, 2)
check_dim0_aligned(self, mapping)

# setup output
Expand Down Expand Up @@ -1476,6 +1472,31 @@ def count_singleton(self, allele=1):
def count_doubleton(self, allele=1):
return carray_block_sum(self.is_doubleton(allele=allele))

def map_alleles(self, mapping, **kwargs):

# check inputs
mapping = asarray_ndim(mapping, 2)
check_dim0_aligned(self, mapping)

# setup output
out = None
blen = kwargs.pop('blen', self.carr.chunklen)
kwargs.setdefault('expectedlen', self.carr.shape[0])
kwargs.setdefault('dtype', self.carr.dtype)

# block-wise iteration
for i in range(0, self.carr.shape[0], blen):
block = self.carr[i:i+blen]
bmapping = mapping[i:i+blen]
ac = AlleleCountsArray(block, copy=False)
acm = ac.map_alleles(bmapping)
if out is None:
out = bcolz.carray(acm, **kwargs)
else:
out.append(acm)

return AlleleCountsCArray(out, copy=False)


class _CTableWrapper(object):

Expand Down
193 changes: 182 additions & 11 deletions allel/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ def count_alleles(self, max_allele=None, subpop=None):
"""

if subpop is not None:
if np.any(subpop >= self.shape[1]):
if np.any(np.array(subpop) >= self.shape[1]):
raise ValueError('index out of bounds')
# convert to a haplotype selection
subpop = sample_to_haplotype_selection(subpop, self.ploidy)
Expand Down Expand Up @@ -1159,7 +1159,9 @@ def map_alleles(self, mapping, copy=True):
mapping : ndarray, int8, shape (n_variants, max_allele)
An array defining the allele mapping for each variant.
copy : bool, optional
If True, return a new array; if False, apply mapping in place.
If True, return a new array; if False, apply mapping in place
(only applies for arrays with dtype int8; all other dtypes
require a copy).
Returns
-------
Expand Down Expand Up @@ -1193,7 +1195,14 @@ def map_alleles(self, mapping, copy=True):
Notes
-----
Currently only implemented for arrays with dtype int8.
For arrays with dtype int8 an optimised implementation is used which is
faster and uses far less memory. It is recommended to convert arrays to
dtype int8 where possible before calling this method.
See Also
--------
create_allele_mapping
"""

Expand Down Expand Up @@ -1678,7 +1687,9 @@ def map_alleles(self, mapping, copy=True):
mapping : ndarray, int8, shape (n_variants, max_allele)
An array defining the allele mapping for each variant.
copy : bool, optional
If True, return a new array; if False, apply mapping in place.
If True, return a new array; if False, apply mapping in place
(only applies for arrays with dtype int8; all other dtypes
require a copy).
Returns
-------
Expand All @@ -1705,19 +1716,32 @@ def map_alleles(self, mapping, copy=True):
Notes
-----
Currently only implemented for arrays with dtype int8.
For arrays with dtype int8 an optimised implementation is used which is
faster and uses far less memory. It is recommended to convert arrays to
dtype int8 where possible before calling this method.
See Also
--------
create_allele_mapping
"""

# check inputs
if self.dtype.type != np.int8:
raise NotImplementedError('only implemented for dtype int8')
mapping = asarray_ndim(mapping, 2, dtype='i1')
mapping = asarray_ndim(mapping, 2)
check_dim0_aligned(self, mapping)

# use optimisaton
from allel.opt.model import haplotype_int8_map_alleles
data = haplotype_int8_map_alleles(self, mapping, copy=copy)
if self.dtype.type == np.int8:
# use optimisation
mapping = np.asarray(mapping, dtype='i1')
from allel.opt.model import haplotype_int8_map_alleles
data = haplotype_int8_map_alleles(self, mapping, copy=copy)

else:
# use numpy indexing
i = np.arange(self.shape[0]).reshape((-1, 1))
data = mapping[i, self]
data[self < 0] = -1

return HaplotypeArray(data, copy=False)

Expand Down Expand Up @@ -2115,6 +2139,65 @@ def count_singleton(self, allele=1):
def count_doubleton(self, allele=1):
return np.sum(self.is_doubleton(allele=allele))

def map_alleles(self, mapping):
"""Transform alleles via a mapping.
Parameters
----------
mapping : ndarray, int8, shape (n_variants, max_allele)
An array defining the allele mapping for each variant.
Returns
-------
ac : AlleleCountsArray
Examples
--------
>>> import allel
>>> g = allel.model.GenotypeArray([[[0, 0], [0, 0]],
... [[0, 0], [0, 1]],
... [[0, 2], [1, 1]],
... [[2, 2], [-1, -1]]])
>>> ac = g.count_alleles()
>>> ac
AlleleCountsArray((4, 3), dtype=int32)
[[4 0 0]
[3 1 0]
[1 2 1]
[0 0 2]]
>>> mapping = [[1, 0, 2],
... [1, 0, 2],
... [2, 1, 0],
... [1, 2, 0]]
>>> ac.map_alleles(mapping)
AlleleCountsArray((4, 3), dtype=int64)
[[0 4 0]
[1 3 0]
[1 2 1]
[2 0 0]]
See Also
--------
create_allele_mapping
"""

mapping = asarray_ndim(mapping, 2)
check_dim0_aligned(self, mapping)

# setup output array
out = np.empty_like(mapping)

# apply transformation
i = np.arange(self.shape[0]).reshape((-1, 1))
out[i, mapping] = self

return AlleleCountsArray(out)


class SortedIndex(np.ndarray):
"""Index of sorted values, e.g., positions from a single chromosome or
Expand Down Expand Up @@ -3678,3 +3761,91 @@ def from_gff3(path, attributes=None, region=None,
a = np.fromiter(recs, dtype=dtype)
ft = FeatureTable(a, copy=False)
return ft


def create_allele_mapping(ref, alt, alleles, dtype='i1'):
"""Create an array mapping variant alleles into a different allele index
system.
Parameters
----------
ref : array_like, S1, shape (n_variants,)
Reference alleles.
alt : array_like, S1, shape (n_variants, n_alt_alleles)
Alternate alleles.
alleles : array_like, S1, shape (n_variants, n_alleles)
Alleles defining the new allele indexing.
Returns
-------
mapping : ndarray, int8, shape (n_variants, n_alt_alleles + 1)
Examples
--------
Example with biallelic variants::
>>> import allel
>>> ref = [b'A', b'C', b'T', b'G']
>>> alt = [b'T', b'G', b'C', b'A']
>>> alleles = [[b'A', b'T'], # no transformation
... [b'G', b'C'], # swap
... [b'T', b'A'], # 1 missing
... [b'A', b'C']] # 1 missing
>>> mapping = allel.model.create_allele_mapping(ref, alt, alleles)
>>> mapping
array([[ 0, 1],
[ 1, 0],
[ 0, -1],
[-1, 0]], dtype=int8)
Example with multiallelic variants::
>>> ref = [b'A', b'C', b'T']
>>> alt = [[b'T', b'G'],
... [b'A', b'T'],
... [b'G', b'.']]
>>> alleles = [[b'A', b'T'],
... [b'C', b'T'],
... [b'G', b'A']]
>>> mapping = allel.model.create_allele_mapping(ref, alt, alleles)
>>> mapping
array([[ 0, 1, -1],
[ 0, -1, 1],
[-1, 0, -1]], dtype=int8)
See Also
--------
GenotypeArray.map_alleles, HaplotypeArray.map_alleles,
AlleleCountsArray.map_alleles
"""

ref = asarray_ndim(ref, 1)
alt = asarray_ndim(alt, 1, 2)
alleles = asarray_ndim(alleles, 1, 2)
check_dim0_aligned(ref, alt)
check_dim0_aligned(ref, alleles)

# reshape for convenience
ref = ref[:, None]
if alt.ndim == 1:
alt = alt[:, None]
if alleles.ndim == 1:
alleles = alleles[:, None]
source_alleles = np.append(ref, alt, axis=1)

# setup output array
out = np.empty(source_alleles.shape, dtype=dtype)
out.fill(-1)

# find matches
for ai in range(source_alleles.shape[1]):
match = source_alleles[:, ai, None] == alleles
match_i, match_j = match.nonzero()
out[match_i, ai] = match_j

return out
59 changes: 57 additions & 2 deletions allel/test/test_model_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1035,19 +1035,28 @@ def test_count_alleles_subpops(self):
eq(3, actual['sub2'].n_alleles)

def test_map_alleles(self):
a = np.array(haplotype_data, dtype=np.int8)

# generalised implementation
a = np.array(haplotype_data)
h = self.setup_instance(a)
mapping = np.array([[0, 1, 2],
[2, 0, 1],
[1, 2, 0],
[2, 1, 0]], dtype=np.int8)
[2, 1, 0]])
expect = [[0, 1, -1],
[0, 0, -1],
[0, -1, -1],
[-1, -1, -1]]
actual = h.map_alleles(mapping)
aeq(expect, actual)

# optimised implementation
a = np.array(haplotype_data, dtype='i1')
h = self.setup_instance(a)
mapping = np.array(mapping, dtype='i1')
actual = h.map_alleles(mapping)
aeq(expect, actual)


class AlleleCountsArrayInterface(object):

Expand Down Expand Up @@ -1206,6 +1215,21 @@ def test_is_count_doubleton(self):
aeq(expect, actual)
eq(np.sum(expect), ac.count_doubleton(allele=2))

def test_map_alleles(self):
ac = self.setup_instance(allele_counts_data)
mapping = np.array([[0, 1, 2],
[2, 0, 1],
[1, 2, 0],
[2, 1, 0],
[2, 0, 1]])
expect = [[3, 1, 0],
[2, 1, 1],
[1, 1, 2],
[2, 0, 0],
[0, 0, 0]]
actual = ac.map_alleles(mapping)
aeq(expect, actual)


class SortedIndexInterface(object):

Expand Down Expand Up @@ -1966,3 +1990,34 @@ def test_query(self):
expr = 'type == b"exon"'
r = ft.query(expr)
aeq(a.take([2, 3]), r)


def test_create_allele_mapping():

# biallelic case
ref = [b'A', b'C', b'T', b'G']
alt = [b'T', b'G', b'C', b'A']
alleles = [[b'A', b'T'], # no transformation
[b'G', b'C'], # swap
[b'T', b'A'], # 1 missing
[b'A', b'C']] # 1 missing
expect = [[0, 1],
[1, 0],
[0, -1],
[-1, 0]]
actual = allel.model.create_allele_mapping(ref, alt, alleles)
aeq(expect, actual)

# multiallelic case
ref = [b'A', b'C', b'T']
alt = [[b'T', b'G'],
[b'A', b'T'],
[b'G', b'.']]
alleles = [[b'A', b'T'],
[b'C', b'T'],
[b'G', b'A']]
expect = [[0, 1, -1],
[0, -1, 1],
[-1, 0, -1]]
actual = allel.model.create_allele_mapping(ref, alt, alleles)
aeq(expect, actual)

0 comments on commit 0f29e51

Please sign in to comment.