Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

## Unreleased

* Fix for error in `generate_trio_stats_expr` that led to an incorrect untransmitted count. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
* Fix for error in `compute_quantile_bin` that caused incorrect binning when a single score overlapped multiple bins. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
* Removed the assumption in `compute_quantile_bin` that the input Table already has an `snv` annotation; SNV status is now computed from the alleles. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)
* Fixed `create_binned_ht`, which failed with a "Cannot combine expressions from different source objects" error. [(#238)](https://github.com/broadinstitute/gnomad_methods/pull/238)

## Version 0.4.0 - July 9th, 2020

**Note** gnomAD resources have been moved to a [requester pays bucket](https://cloud.google.com/storage/docs/requester-pays).
Expand Down
7 changes: 6 additions & 1 deletion gnomad/sample_qc/relatedness.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,7 +855,12 @@ def _ac_an_parent_child_count(
trio_stats = hl.struct(
**{
f"{name2}_{name}": hl.agg.filter(
trio_mt.proband_entry.GT.is_non_ref() & expr,
(
trio_mt.proband_entry.GT.is_non_ref()
| trio_mt.father_entry.GT.is_non_ref()
| trio_mt.mother_entry.GT.is_non_ref()
)
& expr,
hl.agg.sum(
trans_count_map.get(
(
Expand Down
23 changes: 13 additions & 10 deletions gnomad/variant_qc/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,16 @@ def quantiles_to_bin_boundaries(quantiles: List[int]) -> Dict:
bin_expr = {
f"{bin_id}_{snv}": (bin_expr & snv_expr)
for bin_id, bin_expr in bin_expr.items()
for snv, snv_expr in [("snv", ht.snv), ("indel", ~ht.snv)]
for snv, snv_expr in [
("snv", hl.is_snp(ht.alleles[0], ht.alleles[1])),
("indel", ~hl.is_snp(ht.alleles[0], ht.alleles[1])),
]
}

bin_ht = ht.annotate(
**{f"_filter_{bin_id}": bin_expr for bin_id, bin_expr in bin_expr.items()},
_score=score_expr,
snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
)

logger.info(
Expand Down Expand Up @@ -183,10 +187,11 @@ def quantiles_to_bin_boundaries(quantiles: List[int]) -> Dict:
# If a value falls in a bin that needs expansion, assign it randomly to one of the expanded bins
# Otherwise, simply modify the bin to its global index (with expanded bins that is)
bin_ht = bin_ht.select(
"snv",
**{
bin_id: hl.cond(
bin_id: hl.if_else(
bin_ht.bin_stats[bin_id].merged_bins.contains(bin_ht[bin_id]),
bin_ht[bin_id]
bin_ht.bin_stats[bin_id].global_bin_indices[bin_ht[bin_id]]
+ hl.int(
hl.rand_unif(
0, bin_ht.bin_stats[bin_id].merged_bins[bin_ht[bin_id]] + 1
Expand All @@ -195,7 +200,7 @@ def quantiles_to_bin_boundaries(quantiles: List[int]) -> Dict:
bin_ht.bin_stats[bin_id].global_bin_indices[bin_ht[bin_id]],
)
for bin_id in bin_expr
}
},
)

if desc:
Expand Down Expand Up @@ -223,10 +228,8 @@ def quantiles_to_bin_boundaries(quantiles: List[int]) -> Dict:

bin_ht = bin_ht.transmute(
**{
bin_id: hl.cond(
ht[bin_ht.key].snv,
bin_ht[f"{bin_id}_snv"],
bin_ht[f"{bin_id}_indel"],
bin_id: hl.if_else(
bin_ht.snv, bin_ht[f"{bin_id}_snv"], bin_ht[f"{bin_id}_indel"],
)
for bin_id in bin_expr_no_snv
}
Expand Down Expand Up @@ -435,7 +438,7 @@ def add_rank(

rank_ht = rank_ht.key_by("_score").persist()
scan_expr = {
"rank": hl.cond(
"rank": hl.if_else(
rank_ht.is_snv,
hl.scan.count_where(rank_ht.is_snv),
hl.scan.count_where(~rank_ht.is_snv),
Expand All @@ -445,7 +448,7 @@ def add_rank(
{
name: hl.or_missing(
rank_ht[f"_{name}"],
hl.cond(
hl.if_else(
rank_ht.is_snv,
hl.scan.count_where(rank_ht.is_snv & rank_ht[f"_{name}"]),
hl.scan.count_where(~rank_ht.is_snv & rank_ht[f"_{name}"]),
Expand Down
20 changes: 7 additions & 13 deletions gnomad/variant_qc/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
import hail as hl
import pyspark.sql

import hail as hl

import gnomad.resources.grch37 as grch37_resources
import gnomad.resources.grch38 as grch38_resources
from gnomad.sample_qc.relatedness import (
Expand Down Expand Up @@ -66,7 +64,7 @@ def create_binned_ht(
:return: table with bin number for each variant
"""

def update_bin_expr(
def _update_bin_expr(
bin_expr: Dict[str, hl.expr.BooleanExpression],
new_expr: hl.expr.BooleanExpression,
new_id: str,
Expand All @@ -89,31 +87,27 @@ def update_bin_expr(
)
return bin_expr

ht = ht.annotate(
singleton=ht.ac_raw == 1, snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
)

ht = ht.filter(ht.ac_raw > 0).persist()

# Desired bins and sub-bins
bin_expr = {"bin": True}

if singleton:
bin_expr = update_bin_expr(bin_expr, ht.singleton, "singleton")
bin_expr = _update_bin_expr(bin_expr, ht.ac_raw == 1, "singleton")

if biallelic:
bin_expr = update_bin_expr(bin_expr, ~ht.was_split, "biallelic")
bin_expr = _update_bin_expr(bin_expr, ~ht.was_split, "biallelic")

if adj:
bin_expr = update_bin_expr(bin_expr, (ht.ac > 0), "adj")
bin_expr = _update_bin_expr(bin_expr, (ht.ac > 0), "adj")

if add_substrat:
for add_id, add_expr in add_substrat.items():
bin_expr = update_bin_expr(bin_expr, add_expr, add_id)
bin_expr = _update_bin_expr(bin_expr, add_expr, add_id)

bin_ht = compute_quantile_bin(
ht, score_expr=ht.score, bin_expr=bin_expr, n_bins=n_bins
)

ht = ht.select_globals()
ht = ht.join(bin_ht, how="left")

return ht
Expand Down