Skip to content

Commit

Permalink
implement locate_unlinked
Browse files Browse the repository at this point in the history
  • Loading branch information
alimanfoo committed Mar 10, 2015
1 parent 92e1fbb commit fe6c6a6
Show file tree
Hide file tree
Showing 9 changed files with 5,136 additions and 2,622 deletions.
2 changes: 1 addition & 1 deletion allel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# flake8: noqa


__version__ = '0.10.0.dev4'
__version__ = '0.10.0.dev5'


import allel.model as model
Expand Down
2,432 changes: 1,243 additions & 1,189 deletions allel/opt/model.c

Large diffs are not rendered by default.

4,069 changes: 2,654 additions & 1,415 deletions allel/opt/stats.c

Large diffs are not rendered by default.

71 changes: 71 additions & 0 deletions allel/opt/stats.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,74 @@ def gn_pairwise_corrcoef_int8(cnp.int8_t[:, :] gn):
k += 1

return np.asarray(out)


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
@cython.initializedcheck(False)
def gn_locate_unlinked_int8(cnp.int8_t[:, :] gn, int size, int step,
cnp.float32_t threshold):
cdef cnp.uint8_t[:] loc
cdef int window_start, window_stop, i, j
cdef cnp.float32_t r_squared
cdef cnp.int8_t[:, :] gn_sq
cdef int overlap = size - step

# cache square calculation to improve performance
gn_sq = np.power(gn, 2)

# setup output
loc = np.ones(gn.shape[0], dtype='u1')

# setup intermediates
last = False

for window_start in range(0, gn.shape[0], step):

# determine end of current window
window_stop = window_start + size
if window_stop > gn.shape[0]:
window_stop = gn.shape[0]
last = True

if window_start == 0:
# first window
for i in range(window_start, window_stop):
# only go further if still unlinked
if loc[i]:
for j in range(i+1, window_stop):
# only go further if still unlinked
if loc[j]:
gn0 = gn[i]
gn1 = gn[j]
gn0_sq = gn_sq[i]
gn1_sq = gn_sq[j]
r_squared = gn_corrcoef_int8(gn0, gn1, gn0_sq, gn1_sq) ** 2
if r_squared > threshold:
loc[j] = 0

else:
# subsequent windows
for i in range(window_start, window_stop):
# only go further if still unlinked
if loc[i]:
# don't recalculate anything from overlap with previous
# window
ii = max(i+1, window_start+overlap)
if ii < window_stop:
for j in range(ii, window_stop):
# only go further if still unlinked
if loc[j]:
gn0 = gn[i]
gn1 = gn[j]
gn0_sq = gn_sq[i]
gn1_sq = gn_sq[j]
r_squared = gn_corrcoef_int8(gn0, gn1, gn0_sq, gn1_sq) ** 2
if r_squared > threshold:
loc[j] = 0

if last:
break

return np.asarray(loc).view(dtype='b1')
2 changes: 1 addition & 1 deletion allel/stats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
from allel.stats.hw import heterozygosity_observed, heterozygosity_expected, \
inbreeding_coefficient

from allel.stats.ld import rogers_huff_r
from allel.stats.ld import rogers_huff_r, locate_unlinked
79 changes: 77 additions & 2 deletions allel/stats/ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,49 @@


def rogers_huff_r(gn):
"""TODO
"""Estimate the linkage disequilibrium parameter *r* for each pair of
variants using the method of Rogers and Huff (2008).
"""
Parameters
----------
gn : array_like, int8, shape (n_variants, n_samples)
Diploid genotypes at biallelic variants, coded as the number of
alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
Returns
-------
r : ndarray, float, shape (n_variants * (n_variants - 1) // 2,)
Matrix in condensed form.
Examples
--------
>>> import allel
>>> g = allel.model.GenotypeArray([[[0, 0], [1, 1], [0, 0]],
... [[0, 0], [1, 1], [0, 0]],
... [[1, 1], [0, 0], [1, 1]],
... [[0, 0], [0, 1], [-1, -1]]], dtype='i1')
>>> gn = g.to_n_alt(fill=-1)
>>> gn
array([[ 0, 2, 0],
[ 0, 2, 0],
[ 2, 0, 2],
[ 0, 1, -1]], dtype=int8)
>>> r = allel.stats.rogers_huff_r(gn)
>>> r
array([ 1. , -1.00000012, 1. , -1.00000012, 1. , -1. ], dtype=float32)
>>> r ** 2
array([ 1. , 1.00000024, 1. , 1.00000024, 1. , 1. ], dtype=float32)
>>> from scipy.spatial.distance import squareform
>>> squareform(r ** 2)
array([[ 0. , 1. , 1.00000024, 1. ],
[ 1. , 0. , 1.00000024, 1. ],
[ 1.00000024, 1.00000024, 0. , 1. ],
[ 1. , 1. , 1. , 0. ]])
""" # flake8: noqa

# check inputs
gn = asarray_ndim(gn, 2, dtype='i1')
Expand All @@ -22,3 +62,38 @@ def rogers_huff_r(gn):
r = r[0]

return r


def locate_unlinked(gn, size=100, step=20, threshold=.1):
"""Locate variants in approximate linkage equilibrium, where r**2 is
below the given `threshold`.
Parameters
----------
gn : array_like, int8, shape (n_variants, n_samples)
Diploid genotypes at biallelic variants, coded as the number of
alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
size : int
Window size (number of variants).
step : int
Number of variants to advance to the next window.
threshold : float
Maximum value of r**2 to include variants.
Returns
-------
loc : ndarray, bool, shape (n_variants)
Boolean array where True items locate variants in approximate
linkage equilibrium.
"""

# check inputs
gn = asarray_ndim(gn, 2, dtype='i1')

from allel.opt.stats import gn_locate_unlinked_int8
loc = gn_locate_unlinked_int8(gn, size, step, threshold)

return loc
23 changes: 22 additions & 1 deletion allel/test/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ def metric(ac1, ac2):


class TestLinkageDisequilibrium(unittest.TestCase):

def test_rogers_huff_r(self):

gn = [[0, 1, 2],
Expand Down Expand Up @@ -458,3 +458,24 @@ def test_rogers_huff_r(self):
expect = 1.
actual = allel.stats.rogers_huff_r(gn)
eq(expect, actual)

gn = [[0, 1, 2],
[0, 1, -1]]
expect = 1.
actual = allel.stats.rogers_huff_r(gn)
eq(expect, actual)

gn = [[0, 2],
[2, 0],
[0, 1]]
expect = [-1, 1, -1]
actual = allel.stats.rogers_huff_r(gn)
assert_array_close(expect, actual)

gn = [[0, 2, 0],
[0, 2, 0],
[2, 0, 2],
[0, 2, -1]]
expect = [1, -1, 1, -1, 1, -1]
actual = allel.stats.rogers_huff_r(gn)
assert_array_close(expect, actual)
7 changes: 7 additions & 0 deletions docs/stats.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ Hardy-Weinberg equilibrium
.. autofunction:: heterozygosity_expected
.. autofunction:: inbreeding_coefficient

Linkage disequilibrium
----------------------

.. automodule:: allel.stats.ld
.. autofunction:: rogers_huff_r
.. autofunction:: locate_unlinked

Window utilities
----------------

Expand Down

0 comments on commit fe6c6a6

Please sign in to comment.