Skip to content

Commit

Permalink
Tajimas D minimum number of segregating sites (#237)
Browse files Browse the repository at this point in the history
* test and fix for tajima d min sites

* release notes;
  • Loading branch information
alimanfoo committed Dec 21, 2018
1 parent fa6fde3 commit 9ca937c
Show file tree
Hide file tree
Showing 4 changed files with 130 additions and 23 deletions.
50 changes: 29 additions & 21 deletions allel/stats/diversity.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ def windowed_watterson_theta(pos, ac, size=None, start=None, stop=None,


# noinspection PyPep8Naming
def tajima_d(ac, pos=None, start=None, stop=None):
def tajima_d(ac, pos=None, start=None, stop=None, min_sites=3):
"""Calculate the value of Tajima's D over a given region.
Parameters
Expand All @@ -873,6 +873,9 @@ def tajima_d(ac, pos=None, start=None, stop=None):
The position at which to start (1-based). Defaults to the first position.
stop : int, optional
The position at which to stop (1-based). Defaults to the last position.
min_sites : int, optional
Minimum number of segregating sites for which to calculate a value. If
there are fewer, np.nan is returned. Defaults to 3.
Returns
-------
Expand Down Expand Up @@ -911,11 +914,13 @@ def tajima_d(ac, pos=None, start=None, stop=None):
loc = pos.locate_range(start, stop)
ac = ac[loc]

# assume number of chromosomes sampled is constant for all variants
n = ac.sum(axis=1).max()

# count segregating variants
S = ac.count_segregating()
if S < min_sites:
return np.nan

# assume number of chromosomes sampled is constant for all variants
n = ac.sum(axis=1).max()

# (n-1)th harmonic number
a1 = np.sum(1 / np.arange(1, n))
Expand Down Expand Up @@ -951,7 +956,7 @@ def tajima_d(ac, pos=None, start=None, stop=None):

# noinspection PyPep8Naming
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None,
step=None, windows=None, fill=np.nan):
step=None, windows=None, min_sites=3):
"""Calculate the value of Tajima's D in windows over a single
chromosome/contig.
Expand All @@ -974,8 +979,9 @@ def windowed_tajima_d(pos, ac, size=None, start=None, stop=None,
Manually specify the windows to use as a sequence of (window_start,
window_stop) positions, using 1-based coordinates. Overrides the
size/start/stop/step parameters.
fill : object, optional
The value to use where a window is completely inaccessible.
min_sites : int, optional
Minimum number of segregating sites for which to calculate a value. If
there are fewer, np.nan is returned. Defaults to 3.
Returns
-------
Expand All @@ -1001,18 +1007,15 @@ def windowed_tajima_d(pos, ac, size=None, start=None, stop=None,
... [[0, 1], [-1, -1]],
... [[-1, -1], [-1, -1]]])
>>> ac = g.count_alleles()
>>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
>>> D, windows, counts = allel.windowed_tajima_d(
... pos, ac, size=10, start=1, stop=31
... )
>>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27]
>>> D, windows, counts = allel.windowed_tajima_d(pos, ac, size=20, step=10, start=1, stop=31)
>>> D
array([0.59158014, 2.93397641, 6.12372436])
array([1.36521524, 4.22566622])
>>> windows
array([[ 1, 10],
[11, 20],
[21, 31]])
array([[ 1, 20],
[11, 31]])
>>> counts
array([3, 4, 2])
array([6, 6])
"""

Expand Down Expand Up @@ -1045,6 +1048,8 @@ def windowed_tajima_d(pos, ac, size=None, start=None, stop=None,
# noinspection PyPep8Naming
def statistic(w_is_seg, w_mpd):
S = np.count_nonzero(w_is_seg)
if S < min_sites:
return np.nan
pi = np.sum(w_mpd)
d = pi - (S / a1)
d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))
Expand All @@ -1054,12 +1059,12 @@ def statistic(w_is_seg, w_mpd):
D, windows, counts = windowed_statistic(pos, values=(is_seg, mpd),
statistic=statistic, size=size,
start=start, stop=stop, step=step,
windows=windows, fill=fill)
windows=windows, fill=np.nan)

return D, windows, counts


def moving_tajima_d(ac, size, start=0, stop=None, step=None):
def moving_tajima_d(ac, size, start=0, stop=None, step=None, min_sites=3):
"""Calculate the value of Tajima's D in moving windows of `size` variants.
Expand All @@ -1076,6 +1081,9 @@ def moving_tajima_d(ac, size, start=0, stop=None, step=None):
step : int, optional
The number of variants between start positions of windows. If not
given, defaults to the window size, i.e., non-overlapping windows.
min_sites : int, optional
Minimum number of segregating sites for which to calculate a value. If
there are fewer, np.nan is returned. Defaults to 3.
Returns
-------
Expand All @@ -1096,12 +1104,12 @@ def moving_tajima_d(ac, size, start=0, stop=None, step=None):
... [[0, 1], [-1, -1]],
... [[-1, -1], [-1, -1]]])
>>> ac = g.count_alleles()
>>> D = allel.moving_tajima_d(ac, size=3)
>>> D = allel.moving_tajima_d(ac, size=4, step=2)
>>> D
array([0.59158014, 1.89305645, 5.79748537])
array([0.1676558 , 2.01186954, 5.70029703])
"""

d = moving_statistic(values=ac, statistic=tajima_d, size=size, start=start, stop=stop,
step=step)
step=step, min_sites=min_sites)
return d
7 changes: 5 additions & 2 deletions allel/stats/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from allel.util import asarray_ndim, ignore_invalid, check_equal_length


def moving_statistic(values, statistic, size, start=0, stop=None, step=None):
def moving_statistic(values, statistic, size, start=0, stop=None, step=None, **kwargs):
"""Calculate a statistic in a moving window over `values`.
Parameters
Expand All @@ -28,6 +28,9 @@ def moving_statistic(values, statistic, size, start=0, stop=None, step=None):
step : int, optional
The distance between start positions of windows. If not given,
defaults to the window size, i.e., non-overlapping windows.
kwargs
Additional keyword arguments are passed through to the `statistic`
function.
Returns
-------
Expand All @@ -49,7 +52,7 @@ def moving_statistic(values, statistic, size, start=0, stop=None, step=None):
windows = index_windows(values, size, start, stop, step)

# setup output
out = np.array([statistic(values[i:j]) for i, j in windows])
out = np.array([statistic(values[i:j], **kwargs) for i, j in windows])

return out

Expand Down
91 changes: 91 additions & 0 deletions allel/test/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import numpy as np
import pytest
from pytest import approx


import allel
Expand Down Expand Up @@ -294,6 +295,96 @@ def test_windowed_divergence(self):
)
assert_array_almost_equal(expect, actual)

def test_tajima_d(self):
from allel import tajima_d

# example with calculable value
ac = AlleleCountsArray([[1, 3],
[2, 2],
[3, 1]])
expect = approx(0.168, 0.01)
actual = tajima_d(ac)
assert expect == actual

# too few sites
ac = AlleleCountsArray([[2, 2],
[3, 1]])
assert np.nan is tajima_d(ac)

# too few segregating sites
ac = AlleleCountsArray([[4, 0],
[2, 2],
[3, 1]])
assert np.nan is tajima_d(ac)
# allow people to override if they really want to
assert approx(0.592, 0.01) == tajima_d(ac, min_sites=2)

def test_moving_tajima_d(self):
from allel import moving_tajima_d

# example with calculable value
ac = AlleleCountsArray([[1, 3],
[2, 2],
[3, 1],
[1, 3],
[2, 2]])
expect = np.array([0.168] * 3)
actual = moving_tajima_d(ac, size=3, step=1)
assert_array_almost_equal(expect, actual, decimal=3)

# too few sites
actual = moving_tajima_d(ac, size=2, step=1)
assert 4 == len(actual)
assert np.all(np.isnan(actual))

# too few segregating sites
ac = AlleleCountsArray([[4, 0],
[2, 2],
[3, 1],
[4, 0],
[2, 2]])
actual = moving_tajima_d(ac, size=3, step=1)
assert 3 == len(actual)
assert np.all(np.isnan(actual))
# allow people to override if they really want to
expect = np.array([0.592] * 3)
actual = moving_tajima_d(ac, size=3, step=1, min_sites=2)
assert_array_almost_equal(expect, actual, decimal=3)

def test_windowed_tajima_d(self):
from allel import windowed_tajima_d

pos = np.array([1, 11, 21, 31, 41])

# example with calculable value
ac = AlleleCountsArray([[1, 3],
[2, 2],
[3, 1],
[1, 3],
[2, 2]])
expect = np.array([0.168] * 3)
actual, _, _ = windowed_tajima_d(pos, ac, size=25, step=10)
assert_array_almost_equal(expect, actual, decimal=3)

# too few sites
actual, _, _ = windowed_tajima_d(pos, ac, size=15, step=10)
assert 4 == len(actual)
assert np.all(np.isnan(actual))

# too few segregating sites
ac = AlleleCountsArray([[4, 0],
[2, 2],
[3, 1],
[4, 0],
[2, 2]])
actual, _, _ = windowed_tajima_d(pos, ac, size=25, step=10)
assert 3 == len(actual)
assert np.all(np.isnan(actual))
# allow people to override if they really want to
expect = np.array([0.592] * 3)
actual, _, _ = windowed_tajima_d(pos, ac, size=25, step=10, min_sites=2)
assert_array_almost_equal(expect, actual, decimal=3)


class TestHardyWeinberg(unittest.TestCase):

Expand Down
5 changes: 5 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ v1.2.0 (work in progress)
:func:`allel.equally_accessible_windows`. By :user:`Alistair Miles <alimanfoo>`,
:issue:`234`, :issue:`166`.

* Fixed functions calculating Tajima's D such that a value of `np.nan`
is returned if there are fewer than 3 segregating sites. By
:user:`Andrew Kern <andrewkern>` and :user:`Alistair Miles
<alimanfoo>`, :issue:`175`, :issue:`237`.

* Fixed incorrect fill value in GFF parsing functions. By
:user:`Alistair Miles <alimanfoo>`, :issue:`165`, :issue:`223`.

Expand Down

0 comments on commit 9ca937c

Please sign in to comment.