Skip to content

Commit

Permalink
weir and cockerham will be the end of me
Browse files Browse the repository at this point in the history
  • Loading branch information
alimanfoo committed Mar 11, 2015
1 parent e782622 commit c5a288e
Show file tree
Hide file tree
Showing 9 changed files with 1,430 additions and 10 deletions.
8 changes: 4 additions & 4 deletions allel/bcolz.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,9 +924,9 @@ def f(block):
return GenotypeArray(block, copy=False).is_hom_alt()
return carray_block_map(self.carr, f, **kwargs)

def is_het(self, **kwargs):
def is_het(self, allele=None, **kwargs):
def f(block):
return GenotypeArray(block, copy=False).is_het()
return GenotypeArray(block, copy=False).is_het(allele=allele)
return carray_block_map(self.carr, f, **kwargs)

def is_call(self, call, **kwargs):
Expand Down Expand Up @@ -960,9 +960,9 @@ def f(block):
return GenotypeArray(block, copy=False).is_hom_alt()
return carray_block_sum(self.carr, axis=axis, transform=f)

def count_het(self, axis=None):
def count_het(self, allele=None, axis=None):
def f(block):
return GenotypeArray(block, copy=False).is_het()
return GenotypeArray(block, copy=False).is_het(allele=allele)
return carray_block_sum(self.carr, axis=axis, transform=f)

def count_call(self, call, axis=None):
Expand Down
17 changes: 14 additions & 3 deletions allel/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,7 +484,7 @@ def is_hom_alt(self):
return out

# noinspection PyUnusedLocal
def is_het(self):
def is_het(self, allele=None):
"""Find genotype calls that are heterozygous.
Returns
Expand All @@ -493,6 +493,8 @@ def is_het(self):
out : ndarray, bool, shape (n_variants, n_samples)
Array where elements are True if the genotype call matches the
condition.
allele : int, optional
Heterozygous allele.
Examples
--------
Expand All @@ -505,6 +507,10 @@ def is_het(self):
array([[False, True],
[ True, False],
[ True, False]], dtype=bool)
>>> g.is_het(2)
array([[False, False],
[False, False],
[ True, False]], dtype=bool)
"""

Expand All @@ -513,6 +519,9 @@ def is_het(self):
allele1 = self[..., 0] # noqa
allele2 = self[..., 1] # noqa
ex = '(allele1 >= 0) & (allele2 >= 0) & (allele1 != allele2)'
if allele is not None:
ex += ' & ((allele1 == {0}) | (allele2 == {0}))' \
.format(allele)
out = ne.evaluate(ex)

# general ploidy case
Expand All @@ -521,6 +530,8 @@ def is_het(self):
other_alleles = self[..., 1:] # noqa
out = np.all(self >= 0, axis=-1) \
& np.any(allele1 != other_alleles, axis=-1)
if allele is not None:
out &= np.any(self == allele, axis=-1)

return out

Expand Down Expand Up @@ -592,8 +603,8 @@ def count_hom_alt(self, axis=None):
b = self.is_hom_alt()
return np.sum(b, axis=axis)

def count_het(self, axis=None):
b = self.is_het()
def count_het(self, allele=None, axis=None):
b = self.is_het(allele=allele)
return np.sum(b, axis=axis)

def count_call(self, call, axis=None):
Expand Down
3 changes: 2 additions & 1 deletion allel/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@

from allel.stats.diversity import mean_pairwise_diversity, \
sequence_diversity, windowed_diversity, mean_pairwise_divergence, \
sequence_divergence, windowed_divergence
sequence_divergence, windowed_divergence, \
weir_cockerham_anova

from allel.stats.distance import pairwise_distance, pairwise_dxy

Expand Down
2 changes: 1 addition & 1 deletion allel/stats/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def f(b):
return pdist(b, metric=metric)

if hasattr(x, 'chunklen'):
# use block-wise implementation
# use chunk-wise implementation
blen = x.chunklen
dist = None
for i in range(0, x.shape[0], blen):
Expand Down
235 changes: 234 additions & 1 deletion allel/stats/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
from __future__ import absolute_import, print_function, division


import logging
logger = logging.getLogger(__name__)
debug = logger.debug


import numpy as np


from allel.model import SortedIndex
from allel.model import SortedIndex, GenotypeArray
from allel.util import asarray_ndim, ignore_invalid, check_dim0_aligned
from allel.stats.window import windowed_statistic, per_base

Expand Down Expand Up @@ -545,3 +550,231 @@ def windowed_divergence(pos, ac1, ac2, size, start=None, stop=None, step=None,
fill=fill)

return dxy, windows, n_bases, counts


def weir_cockerham_anova(g, subpops, max_allele=None):
"""Compute the variance components from the analyses of variance of
allele frequencies according to Weir and Cockerham (1984).
Parameters
----------
g : array_like, int, shape (n_variants, n_samples, ploidy)
Genotype array.
subpops : sequence of sequences of ints
Sample indices for each subpopulation.
max_allele : int, optional
The highest allele index to consider.
Returns
-------
a : ndarray, float, shape (n_variants, n_alleles)
Component of variance between populations.
b : ndarray, float, shape (n_variants, n_alleles)
Component of variance between individuals within populations.
c : ndarray, float, shape (n_variants, n_alleles)
Component of variance between gametes within individuals.
Notes
-----
Please consider this implementation experimental, it has not been
rigorously tested.
Examples
--------
Calculate variance components from some genotype data::
>>> import allel
>>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
... [[0, 1], [0, 1], [0, 1], [0, 1]],
... [[0, 0], [0, 0], [0, 0], [0, 0]],
... [[0, 1], [1, 2], [1, 1], [2, 2]],
... [[0, 0], [1, 1], [0, 1], [-1, -1]]]
>>> subpops = [[0, 1], [2, 3]]
>>> a, b, c = allel.stats.weir_cockerham_anova(g, subpops)
>>> a
array([[ 0.5 , 0.5 , 0. ],
[ 0. , 0. , 0. ],
[ 0. , 0. , 0. ],
[ 0. , -0.125, -0.125],
[-0.375, -0.375, 0. ]])
>>> b
array([[ 0. , 0. , 0. ],
[-0.25 , -0.25 , 0. ],
[ 0. , 0. , 0. ],
[ 0. , 0.125 , 0.25 ],
[ 0.41666667, 0.41666667, 0. ]])
>>> c
array([[ 0. , 0. , 0. ],
[ 0.5 , 0.5 , 0. ],
[ 0. , 0. , 0. ],
[ 0.125 , 0.25 , 0.125 ],
[ 0.16666667, 0.16666667, 0. ]])
Estimate the parameter theta (a.k.a., Fst) for each variant
and each allele individually::
>>> fst = a / (a + b + c)
>>> fst
array([[ 1. , 1. , nan],
[ 0. , 0. , nan],
[ nan, nan, nan],
[ 0. , -0.5, -0.5],
[-1.8, -1.8, nan]])
Estimate Fst for each variant individually (averaging over alleles)::
>>> fst = (np.sum(a, axis=1) /
... (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
>>> fst
array([ 1. , 0. , nan, -0.4, -1.8])
Estimate Fst averaging over all variants and alleles::
>>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
>>> fst
-4.3680905886891398e-17
"""

# check inputs
if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
g = GenotypeArray(g, copy=False)
if g.ndim != 3:
raise ValueError('g must have three dimensions')
if g.shape[2] != 2:
raise NotImplementedError('only diploid genotypes are supported')

# determine highest allele index
if max_allele is None:
max_allele = g.max()

if hasattr(g, 'chunklen'):
# use a chunk-wise implementation
blen = g.chunklen
n_variants = g.shape[0]
shape = (n_variants, max_allele + 1)
a = np.zeros(shape, dtype='f8')
b = np.zeros(shape, dtype='f8')
c = np.zeros(shape, dtype='f8')
for i in range(0, n_variants, blen):
gb = g[i:i+blen]
ab, bb, cb = _weir_cockerham_variance_components(gb, subpops,
max_allele)
a[i:i+blen] = ab
b[i:i+blen] = bb
c[i:i+blen] = cb

else:
a, b, c = _weir_cockerham_variance_components(g, subpops, max_allele)

return a, b, c


# noinspection PyPep8Naming
def _weir_cockerham_variance_components(g, subpops, max_allele):

# check inputs
g = GenotypeArray(g, copy=False)
n_variants, n_samples, ploidy = g.shape
n_alleles = max_allele + 1

# number of populations sampled
r = len(subpops)
n_populations = r
debug('r: %r', r)

# count alleles within each subpopulation
ac = [g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops]

# stack allele counts from each sub-population into a single array
ac = np.dstack(ac)
assert ac.shape == (n_variants, n_alleles, n_populations)
debug('ac: %s, %r', ac.shape, ac)

# count number of alleles called within each population by summing
# allele counts along the alleles dimension
an = np.sum(ac, axis=1)
assert an.shape == (n_variants, n_populations)
debug('an: %s, %r', an.shape, an)

# compute number of individuals sampled from each population
n = an // 2
assert n.shape == (n_variants, n_populations)
debug('n: %s, %r', n.shape, n)

# compute the total number of individuals sampled across all populations
n_total = np.sum(n, axis=1)
assert n_total.shape == (n_variants,)
debug('n_total: %s, %r', n_total.shape, n_total)

# compute the average sample size across populations
n_bar = np.mean(n, axis=1)
assert n_bar.shape == (n_variants,)
debug('n_bar: %s, %r', n_bar.shape, n_bar)

# compute the term n sub C incorporating the coefficient of variation in
# sample sizes
n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
assert n_C.shape == (n_variants,)
debug('n_C: %s, %r', n_C.shape, n_C)

# compute allele frequencies within each population
p = ac / an[:, np.newaxis, :]
assert p.shape == (n_variants, n_alleles, n_populations)
debug('p: %s, %r', p.shape, p)

# compute the average sample frequency of each allele
ac_total = np.sum(ac, axis=2)
an_total = np.sum(an, axis=1)
p_bar = ac_total / an_total[:, np.newaxis]
assert p_bar.shape == (n_variants, n_alleles)
debug('p_bar: %s, %r', p_bar.shape, p_bar)

# add in some extra dimensions to enable broadcasting
n_bar = n_bar[:, np.newaxis]
n_C = n_C[:, np.newaxis]

# compute the sample variance of allele frequencies over populations
s_squared = (
np.sum(n[:, np.newaxis, :] * ((p - p_bar[:, :, np.newaxis]) ** 2),
axis=2)
/ (n_bar * (r - 1))
)
assert s_squared.shape == (n_variants, n_alleles)
debug('s_squared: %s, %r', s_squared.shape, s_squared)

# compute the average heterozygosity over all populations
h_bar = [g.count_het(allele=allele, axis=1) / n_total
for allele in range(n_alleles)]
h_bar = np.column_stack(h_bar)
assert h_bar.shape == (n_variants, n_alleles)
debug('h_bar: %s, %r', h_bar.shape, h_bar)

# now comes the tricky bit...

# component of variance between populations
p_bar = p_bar[:, :]
a = ((n_bar / n_C)
* (s_squared -
((1 / (n_bar - 1))
* ((p_bar * (1 - p_bar))
- ((r - 1) * s_squared / r)
- (h_bar / 4)))))
assert a.shape == (n_variants, n_alleles)

# component of variance between individuals within populations
b = ((n_bar / (n_bar - 1))
* ((p_bar * (1 - p_bar))
- ((r - 1) * s_squared / r)
- (((2 * n_bar) - 1) * h_bar / (4 * n_bar))))
assert b.shape == (n_variants, n_alleles)

# component of variance between gametes within individuals
c = h_bar / 2
assert c.shape == (n_variants, n_alleles)

return a, b, c
1 change: 1 addition & 0 deletions docs/stats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Diversity & divergence
.. autofunction:: mean_pairwise_divergence
.. autofunction:: sequence_divergence
.. autofunction:: windowed_divergence
.. autofunction:: weir_cockerham_anova

Pairwise distance
-----------------
Expand Down

0 comments on commit c5a288e

Please sign in to comment.