Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

* Removed `compute_quantile_bin` and added `compute_ranked_bin` as an alternative that provides more even binning. This is now used by `create_binned_ht` instead. [(#288)](https://github.com/broadinstitute/gnomad_methods/pull/288)
* Added additional counts to summary statistics (added autosome/sex chromosome counts, allele counts, counts for missense and synomymous variants) [(#289)](https://github.com/broadinstitute/gnomad_methods/pull/289)
* Added function, `default_generate_gene_lof_matrix`, to generate gene matrix [(#290)](https://github.com/broadinstitute/gnomad_methods/pull/290)

## Version 0.4.0 - July 9th, 2020

Expand Down
208 changes: 207 additions & 1 deletion gnomad/assessment/summary_stats.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import logging
from typing import Dict
from typing import Dict, Optional, Set

import hail as hl

from gnomad.utils.filtering import filter_low_conf_regions
from gnomad.utils.vep import (
add_most_severe_consequence_to_consequence,
filter_vep_to_canonical_transcripts,
get_most_severe_consequence_for_summary,
process_consequences,
)


Expand Down Expand Up @@ -246,3 +248,207 @@ def get_summary_counts(
ht.locus, ht.alleles, ht.lof, ht.no_lof_flags, ht.most_severe_csq,
)
)


def get_an_criteria(
mt: hl.MatrixTable,
samples_by_sex: Optional[Dict[str, int]] = None,
meta_root: str = "meta",
sex_field: str = "sex_imputation.sex_karyotype",
xy_str: str = "XY",
xx_str: str = "XX",
freq_field: str = "freq",
freq_index: int = 0,
an_proportion_cutoff: float = 0.8,
) -> hl.expr.BooleanExpression:
"""
Generates criteria to filter samples based on allele number (AN).

Uses allele number as proxy for call rate.

:param mt: Input MatrixTable.
:param samples_by_sex: Optional Dictionary containing number of samples (value) for each sample sex (key).
:param meta_root: Name of field in MatrixTable containing sample metadata information. Default is 'meta'.
:param sex_field: Name of field in MatrixTable containing sample sex assignment. Defualt is 'sex_imputation.sex_karyotype'.
:param xy_str: String marking whether a sample has XY sex. Default is 'XY'.
:param xx_str: String marking whether a sample has XX sex. Default is 'XX'.
:param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
:param freq_index: Which index of frequency struct to use. Default is 0.
:param an_proportion_cutoff: Desired allele number proportion cutoff. Default is 0.8.
"""
if samples_by_sex is None:
samples_by_sex = mt.aggregate_cols(hl.agg.counter(mt[meta_root][sex_field]))
return (
hl.case()
.when(
mt.locus.in_autosome_or_par(),
mt[freq_field][freq_index].AN
>= an_proportion_cutoff * 2 * sum(samples_by_sex.values()),
)
.when(
mt.locus.in_x_nonpar(),
mt[freq_field][freq_index].AN
>= an_proportion_cutoff
* (samples_by_sex[xy_str] + samples_by_sex[xx_str] * 2),
)
.when(
mt.locus.in_y_nonpar(),
mt[freq_field][freq_index].AN
>= an_proportion_cutoff * samples_by_sex[xy_str],
)
.or_missing()
)


def get_tx_expression_expr(
key_expr: hl.expr.StructExpression,
tx_ht: hl.Table,
csq_expr: hl.expr.StructExpression,
gene_field: str = "ensg",
csq_field: str = "csq",
tx_struct: str = "tx_annotation",
) -> hl.expr.Float64Expression:
"""
Pulls appropriate transcript expression annotation struct given a specific locus and alleles (provided in `key_expr`).

Assumes that `key_expr` contains a locus and alleles.
Assumes that multi-allelic variants have been split in both `tx_ht` and `key_expr`.

:param row_key_expr: StructExpression containing locus and alleles to search in `tx_ht`.
:param tx_ht: Input Table containing transcript expression information.
:param csq_expr: Input StructExpression that contains VEP consequence information.
:param gene_field: Field in `csq_expr` that contains gene ID.
:param csq_field: Field in `csq_expr` that contains `most_severe_consequence` annotation.
:param tx_struct: StructExpression that contains transcript expression information.
:return: StructExpression that contains transcript expression information for given gene ID in `csq_expr`.
"""
return hl.find(
lambda csq: (csq[gene_field] == csq_expr.gene_id)
& (csq[csq_field] == csq_expr.most_severe_consequence),
tx_ht[key_expr][tx_struct],
)


def default_generate_gene_lof_matrix(
mt: hl.MatrixTable,
tx_ht: Optional[hl.Table],
high_expression_cutoff: float = 0.9,
low_expression_cutoff: float = 0.1,
filter_field: str = "filters",
freq_field: str = "freq",
freq_index: int = 0,
additional_csq_set: Set[str] = {"missense_variant", "synonymous_variant"},
all_transcripts: bool = False,
filter_an: bool = False,
filter_to_rare: bool = False,
pre_loftee: bool = False,
lof_csq_set: Set[str] = {
"splice_acceptor_variant",
"splice_donor_variant",
"stop_gained",
"frameshift_variant",
},
remove_ultra_common: bool = False,
) -> hl.MatrixTable:
"""

:param mt: Input MatrixTable.
:param tx_ht: Optional Table containing expression levels per transcript.
:param high_expression_cutoff: Minimum mean proportion expressed cutoff for a transcript to be considered highly expressed. Default is 0.9.
:param low_expression_cutoff: Upper mean proportion expressed cutoff for a transcript to lowly expressed. Default is 0.1.
:param filter_field: Name of field in MT that contains variant filters. Default is 'filters'.
:param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
:param freq_index: Which index of frequency struct to use. Default is 0.
:param additional_csq_set: Set of additional consequences to keep. Default is {'missense_variant', 'synonymous_variant'}.
:param all_transcripts: Whether to use all transcripts instead of just the transcript with most severe consequence. Default is False.
:param filter_an: Whether to filter using allele number as proxy for call rate. Default is False.
:param filter_to_rare: Whether to filter to rare (AF < 5%) variants. Default is False.
:param pre_loftee: Whether LoF consequences have been annotated with LOFTEE. Default is False.
:param lof_csq_set: Set of LoF consequence strings. Default is {"splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant"}.
:param remove_ultra_common: Whether to remove ultra common (AF > 95%) variants. Default is False.
"""
logger.info("Filtering to PASS variants...")
filt_criteria = hl.len(mt[filter_field]) == 0
if filter_an:
logger.info(
"Using AN (as a call rate proxy) to filter to variants that meet a minimum call rate..."
)
filt_criteria &= get_an_criteria(mt)
if remove_ultra_common:
logger.info("Removing ultra common (AF > 95%) variants...")
filt_criteria &= mt[freq_field][freq_index].AF < 0.95
if filter_to_rare:
logger.info("Filtering to rare (AF < 5%) variants...")
filt_criteria &= mt[freq_field][freq_index].AF < 0.05
mt = mt.filter_rows(filt_criteria)

if all_transcripts:
logger.info("Exploding transcript_consequences field...")
explode_field = "transcript_consequences"
else:
logger.info(
"Adding most severe (worst) consequence and expoding worst_csq_by_gene field..."
)
mt = process_consequences(mt)
explode_field = "worst_csq_by_gene"

if pre_loftee:
logger.info(f"Filtering to LoF consequences: {lof_csq_set}")
lof_cats = hl.literal(lof_csq_set)
criteria = lambda x: lof_cats.contains(
add_most_severe_consequence_to_consequence(x).most_severe_consequence
)
else:
logger.info("Filtering to LoF variants that pass LOFTEE with no LoF flags...")
criteria = lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)

if additional_csq_set:
logger.info(f"Including these consequences: {additional_csq_set}")
additional_cats = hl.literal(additional_csq_set)
criteria &= lambda x: additional_cats.contains(
add_most_severe_consequence_to_consequence(x).most_severe_consequence
)

csqs = mt.vep[explode_field].filter(criteria)
mt = mt.select_rows(mt[freq_field], csqs=csqs)
mt = mt.explode_rows(mt.csqs)
annotation_expr = {
"gene_id": mt.csqs.gene_id,
"gene": mt.csqs.gene_symbol,
"indel": hl.is_indel(mt.alleles[0], mt.alleles[1]),
"most_severe_consequence": mt.csqs.most_severe_consequence,
}

if tx_ht:
logger.info("Adding transcript expression annotation...")
tx_annotation = get_tx_expression_expr(
mt.row_key, tx_ht, mt.csqs,
).mean_proportion
annotation_expr["expressed"] = (
hl.case()
.when(tx_annotation >= high_expression_cutoff, "high")
.when(tx_annotation > low_expression_cutoff, "medium")
.when(hl.is_defined(tx_annotation), "low")
.default("missing")
)
else:
annotation_expr["transcript_id"] = mt.csqs.transcript_id
annotation_expr["canonical"] = hl.is_defined(mt.csqs.canonical)
mt = mt.annotate_rows(**annotation_expr)

return (
mt.group_rows_by(*list(annotation_expr.keys()))
.aggregate_rows(
n_sites=hl.agg.count(),
n_sites_array=hl.agg.array_sum(mt.freq.map(lambda x: hl.int(x.AC > 0))),
classic_caf=hl.agg.sum(mt[freq_field][freq_index].AF),
max_af=hl.agg.max(mt[freq_field][freq_index].AF),
classic_caf_array=hl.agg.array_sum(mt[freq_field].map(lambda x: x.AF)),
)
.aggregate_entries(
num_homs=hl.agg.count_where(mt.GT.is_hom_var()),
num_hets=hl.agg.count_where(mt.GT.is_het()),
defined_sites=hl.agg.count_where(hl.is_defined(mt.GT)),
)
.result()
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks great! There's a computation of p that I presume is coming in another PR?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup! I think you calculated p here https://github.com/macarthur-lab/gnomad_lof/blob/master/constraint/gene_lof_matrix.py#L137. I wasn't sure if I should include that function in this PR