Skip to content

Commit

Permalink
Adressing is_accessible problem in roh_mhmm, issue #296 (#327)
Browse files Browse the repository at this point in the history
* Added unit test of 'roh_mhmm()' in the simple case of supplying a genotype list, position list, and contig size. Does not test other functionality of 'roh_mhmm()', nor does it test other ROH functions.

* By using 'allow_none=True' in call to 'is_accessible()' in 'roh_mhmm()', '_mhmm_predict_roh_state()' will simply skip masking inaccessible regions. This is the intended functionality when supplying 'contig_size' but not 'is_accessible'. Added unit test of 'roh_mhmm()' in the simple case of supplying a genotype list, position list, and contig size. Does not test other functionality of 'roh_mhmm()', nor does it test other ROH functions.

* PEP8 formatting correction.

* Edits to @olavur branch

- properly handle is_accessible in poisson HMM func using position_windows
- fix minor spelling errors in check func args
- rename `eqw` variable for clarity
- prefer np.zeros to generate gv for tests

* satisfy PEP8

* Add text describing ROH hmm update

include credits + issue no.

* Update docs/release.rst

Co-authored-by: Ólavur Mortensen <olavurmortensen@gmail.com>
Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
  • Loading branch information
3 people committed Jul 8, 2020
1 parent c697481 commit d25437d
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 10 deletions.
24 changes: 14 additions & 10 deletions allel/stats/roh.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from allel.model.ndarray import GenotypeVector
from allel.util import asarray_ndim, check_dim0_aligned
from allel.stats.misc import tabulate_state_blocks
from allel.stats.window import equally_accessible_windows, windowed_statistic
from allel.stats.window import equally_accessible_windows, windowed_statistic, position_windows


def roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transition=1e-6,
Expand Down Expand Up @@ -70,7 +70,7 @@ def roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transition=1e-
gv = GenotypeVector(gv)
pos = asarray_ndim(pos, 1)
check_dim0_aligned(gv, pos)
is_accessible = asarray_ndim(is_accessible, 1, dtype=bool)
is_accessible = asarray_ndim(is_accessible, 1, dtype=bool, allow_none=True)

# heterozygote probabilities
het_px = np.concatenate([(phet_roh,), phet_nonroh])
Expand All @@ -85,7 +85,7 @@ def roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transition=1e-
if is_accessible is None:
if contig_size is None:
raise ValueError(
"If is_accessibile argument is not provided, you must provide contig_size")
"If is_accessible argument is not provided, you must provide `contig_size`")
p_accessible = 1.0
else:
p_accessible = is_accessible.mean()
Expand Down Expand Up @@ -207,19 +207,23 @@ def roh_poissonhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transiti

from pomegranate import HiddenMarkovModel, PoissonDistribution

is_accessible = asarray_ndim(is_accessible, 1, dtype=bool, allow_none=True)

# equally accessbile windows
if is_accessible is None:
if contig_size is None:
raise ValueError(
"If is_accessibile argument is not provided, you must provide contig_size")
is_accessible = np.ones((contig_size,), dtype="bool")
"If is_accessible argument is not provided, you must provide `contig_size`")

# given no accessibility provided use the standard window calculation
roh_windows = position_windows(
pos=pos, size=window_size, step=window_size, start=1, stop=contig_size)
else:
contig_size = is_accessible.size

eqw = equally_accessible_windows(is_accessible, window_size)
roh_windows = equally_accessible_windows(is_accessible, window_size)

ishet = GenotypeVector(gv).is_het()
counts, wins, records = windowed_statistic(pos, ishet, np.sum, windows=eqw)
counts, wins, records = windowed_statistic(pos, ishet, np.sum, windows=roh_windows)

# heterozygote probabilities
het_px = np.concatenate([(phet_roh,), phet_nonroh])
Expand All @@ -242,8 +246,8 @@ def roh_poissonhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transiti
df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

# adapt the dataframe for ROH
df_roh["start"] = df_roh.start_ridx.apply(lambda y: eqw[y, 0])
df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: eqw[y, 1])
df_roh["start"] = df_roh.start_ridx.apply(lambda y: roh_windows[y, 0])
df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: roh_windows[y, 1])
df_roh["length"] = df_roh.stop - df_roh.start

# filter by ROH size
Expand Down
26 changes: 26 additions & 0 deletions allel/test/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -910,3 +910,29 @@ def test_patterson_d(self):
expect_den = [0., 1., 1., 0.25, np.nan]
assert_array_almost_equal(expect_num, num)
assert_array_almost_equal(expect_den, den)


class TestRunsOfHomozygosity(unittest.TestCase):

def test_roh_mhmm_100pct(self):

# values correspond to start/stop/length/is_marginal
roh_expected = np.array([[1, 100, 100, True]], dtype=object)
fraction_expected = 1.0
gv = np.zeros((4, 2), dtype=np.int16)
pos = [1, 10, 50, 100]
roh, fraction = allel.roh_mhmm(gv, pos, contig_size=100)
aeq(roh.values, roh_expected)
assert fraction == fraction_expected

def test_roh_mhmm_0pct(self):

fraction_expected = 0.0

gv = np.zeros((4, 2), dtype=np.int16)
gv[2, 0] = 1

pos = [1, 10, 50, 100]
roh, fraction = allel.roh_mhmm(gv, pos, contig_size=100)
assert roh.shape[0] == 0
assert fraction == fraction_expected
4 changes: 4 additions & 0 deletions docs/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ WIP v1.3.0
* Minor documentation fixes. By :user:`Murillo Rodrigues <mufernando>`,
:issue:`273`, :issue:`294`.

* Improve handling of `None` when defining accessibility in `roh_mhmm`
and `roh_poissonhmm`. By :user:`Ólavur Mortensen <olavurmortensen>`
and :user:`Nick Harding <hardingnj>`, :issue:`296`.

.. _release_1.2.1:

v1.2.1
Expand Down

0 comments on commit d25437d

Please sign in to comment.