broadinstitute · ch-kr · Feb 2, 2021 · Feb 1, 2021 · Feb 1, 2021 · Feb 1, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -30,6 +30,7 @@
 
 * Removed `compute_quantile_bin` and added `compute_ranked_bin` as an alternative that provides more even binning. This is now used by `create_binned_ht` instead. [(#288)](https://github.com/broadinstitute/gnomad_methods/pull/288)
 * Added additional counts to summary statistics (added autosome/sex chromosome counts, allele counts, counts for missense and synomymous variants) [(#289)](https://github.com/broadinstitute/gnomad_methods/pull/289)
+* Added function, `default_generate_gene_lof_matrix`, to generate gene matrix [(#290)](https://github.com/broadinstitute/gnomad_methods/pull/290)
 
 ## Version 0.4.0 - July 9th, 2020
 

diff --git a/gnomad/assessment/summary_stats.py b/gnomad/assessment/summary_stats.py
@@ -1,12 +1,14 @@
 import logging
-from typing import Dict
+from typing import Dict, Optional, Set
 
 import hail as hl
 
 from gnomad.utils.filtering import filter_low_conf_regions
 from gnomad.utils.vep import (
+    add_most_severe_consequence_to_consequence,
     filter_vep_to_canonical_transcripts,
     get_most_severe_consequence_for_summary,
+    process_consequences,
 )
 
 
@@ -246,3 +248,207 @@ def get_summary_counts(
             ht.locus, ht.alleles, ht.lof, ht.no_lof_flags, ht.most_severe_csq,
         )
     )
+
+
+def get_an_criteria(
+    mt: hl.MatrixTable,
+    samples_by_sex: Optional[Dict[str, int]] = None,
+    meta_root: str = "meta",
+    sex_field: str = "sex_imputation.sex_karyotype",
+    xy_str: str = "XY",
+    xx_str: str = "XX",
+    freq_field: str = "freq",
+    freq_index: int = 0,
+    an_proportion_cutoff: float = 0.8,
+) -> hl.expr.BooleanExpression:
+    """
+    Generates criteria to filter samples based on allele number (AN).
+
+    Uses allele number as proxy for call rate.
+
+    :param mt: Input MatrixTable.
+    :param samples_by_sex: Optional Dictionary containing number of samples (value) for each sample sex (key).
+    :param meta_root: Name of field in MatrixTable containing sample metadata information. Default is 'meta'.
+    :param sex_field: Name of field in MatrixTable containing sample sex assignment. Defualt is 'sex_imputation.sex_karyotype'.
+    :param xy_str: String marking whether a sample has XY sex. Default is 'XY'.
+    :param xx_str: String marking whether a sample has XX sex. Default is 'XX'.
+    :param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
+    :param freq_index: Which index of frequency struct to use. Default is 0.
+    :param an_proportion_cutoff: Desired allele number proportion cutoff. Default is 0.8.
+    """
+    if samples_by_sex is None:
+        samples_by_sex = mt.aggregate_cols(hl.agg.counter(mt[meta_root][sex_field]))
+    return (
+        hl.case()
+        .when(
+            mt.locus.in_autosome_or_par(),
+            mt[freq_field][freq_index].AN
+            >= an_proportion_cutoff * 2 * sum(samples_by_sex.values()),
+        )
+        .when(
+            mt.locus.in_x_nonpar(),
+            mt[freq_field][freq_index].AN
+            >= an_proportion_cutoff
+            * (samples_by_sex[xy_str] + samples_by_sex[xx_str] * 2),
+        )
+        .when(
+            mt.locus.in_y_nonpar(),
+            mt[freq_field][freq_index].AN
+            >= an_proportion_cutoff * samples_by_sex[xy_str],
+        )
+        .or_missing()
+    )
+
+
+def get_tx_expression_expr(
+    key_expr: hl.expr.StructExpression,
+    tx_ht: hl.Table,
+    csq_expr: hl.expr.StructExpression,
+    gene_field: str = "ensg",
+    csq_field: str = "csq",
+    tx_struct: str = "tx_annotation",
+) -> hl.expr.Float64Expression:
+    """
+    Pulls appropriate transcript expression annotation struct given a specific locus and alleles (provided in `key_expr`).
+
+    Assumes that `key_expr` contains a locus and alleles.
+    Assumes that multi-allelic variants have been split in both `tx_ht` and `key_expr`.
+
+    :param row_key_expr: StructExpression containing locus and alleles to search in `tx_ht`.
+    :param tx_ht: Input Table containing transcript expression information.
+    :param csq_expr: Input StructExpression that contains VEP consequence information.
+    :param gene_field: Field in `csq_expr` that contains gene ID.
+    :param csq_field: Field in `csq_expr` that contains `most_severe_consequence` annotation.
+    :param tx_struct: StructExpression that contains transcript expression information.
+    :return: StructExpression that contains transcript expression information for given gene ID in `csq_expr`.
+    """
+    return hl.find(
+        lambda csq: (csq[gene_field] == csq_expr.gene_id)
+        & (csq[csq_field] == csq_expr.most_severe_consequence),
+        tx_ht[key_expr][tx_struct],
+    )
+
+
+def default_generate_gene_lof_matrix(
+    mt: hl.MatrixTable,
+    tx_ht: Optional[hl.Table],
+    high_expression_cutoff: float = 0.9,
+    low_expression_cutoff: float = 0.1,
+    filter_field: str = "filters",
+    freq_field: str = "freq",
+    freq_index: int = 0,
+    additional_csq_set: Set[str] = {"missense_variant", "synonymous_variant"},
+    all_transcripts: bool = False,
+    filter_an: bool = False,
+    filter_to_rare: bool = False,
+    pre_loftee: bool = False,
+    lof_csq_set: Set[str] = {
+        "splice_acceptor_variant",
+        "splice_donor_variant",
+        "stop_gained",
+        "frameshift_variant",
+    },
+    remove_ultra_common: bool = False,
+) -> hl.MatrixTable:
+    """
+
+    :param mt: Input MatrixTable.
+    :param tx_ht: Optional Table containing expression levels per transcript.
+    :param high_expression_cutoff: Minimum mean proportion expressed cutoff for a transcript to be considered highly expressed. Default is 0.9.
+    :param low_expression_cutoff: Upper mean proportion expressed cutoff for a transcript to lowly expressed. Default is 0.1.
+    :param filter_field: Name of field in MT that contains variant filters. Default is 'filters'.
+    :param freq_field: Name of field in MT that contains frequency information. Default is 'freq'.
+    :param freq_index: Which index of frequency struct to use. Default is 0.
+    :param additional_csq_set: Set of additional consequences to keep. Default is {'missense_variant', 'synonymous_variant'}.
+    :param all_transcripts: Whether to use all transcripts instead of just the transcript with most severe consequence. Default is False.
+    :param filter_an: Whether to filter using allele number as proxy for call rate. Default is False.
+    :param filter_to_rare: Whether to filter to rare (AF < 5%) variants. Default is False.
+    :param pre_loftee: Whether LoF consequences have been annotated with LOFTEE. Default is False.
+    :param lof_csq_set: Set of LoF consequence strings. Default is {"splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant"}.
+    :param remove_ultra_common: Whether to remove ultra common (AF > 95%) variants. Default is False.
+    """
+    logger.info("Filtering to PASS variants...")
+    filt_criteria = hl.len(mt[filter_field]) == 0
+    if filter_an:
+        logger.info(
+            "Using AN (as a call rate proxy) to filter to variants that meet a minimum call rate..."
+        )
+        filt_criteria &= get_an_criteria(mt)
+    if remove_ultra_common:
+        logger.info("Removing ultra common (AF > 95%) variants...")
+        filt_criteria &= mt[freq_field][freq_index].AF < 0.95
+    if filter_to_rare:
+        logger.info("Filtering to rare (AF < 5%) variants...")
+        filt_criteria &= mt[freq_field][freq_index].AF < 0.05
+    mt = mt.filter_rows(filt_criteria)
+
+    if all_transcripts:
+        logger.info("Exploding transcript_consequences field...")
+        explode_field = "transcript_consequences"
+    else:
+        logger.info(
+            "Adding most severe (worst) consequence and expoding worst_csq_by_gene field..."
+        )
+        mt = process_consequences(mt)
+        explode_field = "worst_csq_by_gene"
+
+    if pre_loftee:
+        logger.info(f"Filtering to LoF consequences: {lof_csq_set}")
+        lof_cats = hl.literal(lof_csq_set)
+        criteria = lambda x: lof_cats.contains(
+            add_most_severe_consequence_to_consequence(x).most_severe_consequence
+        )
+    else:
+        logger.info("Filtering to LoF variants that pass LOFTEE with no LoF flags...")
+        criteria = lambda x: (x.lof == "HC") & hl.is_missing(x.lof_flags)
+
+    if additional_csq_set:
+        logger.info(f"Including these consequences: {additional_csq_set}")
+        additional_cats = hl.literal(additional_csq_set)
+        criteria &= lambda x: additional_cats.contains(
+            add_most_severe_consequence_to_consequence(x).most_severe_consequence
+        )
+
+    csqs = mt.vep[explode_field].filter(criteria)
+    mt = mt.select_rows(mt[freq_field], csqs=csqs)
+    mt = mt.explode_rows(mt.csqs)
+    annotation_expr = {
+        "gene_id": mt.csqs.gene_id,
+        "gene": mt.csqs.gene_symbol,
+        "indel": hl.is_indel(mt.alleles[0], mt.alleles[1]),
+        "most_severe_consequence": mt.csqs.most_severe_consequence,
+    }
+
+    if tx_ht:
+        logger.info("Adding transcript expression annotation...")
+        tx_annotation = get_tx_expression_expr(
+            mt.row_key, tx_ht, mt.csqs,
+        ).mean_proportion
+        annotation_expr["expressed"] = (
+            hl.case()
+            .when(tx_annotation >= high_expression_cutoff, "high")
+            .when(tx_annotation > low_expression_cutoff, "medium")
+            .when(hl.is_defined(tx_annotation), "low")
+            .default("missing")
+        )
+    else:
+        annotation_expr["transcript_id"] = mt.csqs.transcript_id
+        annotation_expr["canonical"] = hl.is_defined(mt.csqs.canonical)
+    mt = mt.annotate_rows(**annotation_expr)
+
+    return (
+        mt.group_rows_by(*list(annotation_expr.keys()))
+        .aggregate_rows(
+            n_sites=hl.agg.count(),
+            n_sites_array=hl.agg.array_sum(mt.freq.map(lambda x: hl.int(x.AC > 0))),
+            classic_caf=hl.agg.sum(mt[freq_field][freq_index].AF),
+            max_af=hl.agg.max(mt[freq_field][freq_index].AF),
+            classic_caf_array=hl.agg.array_sum(mt[freq_field].map(lambda x: x.AF)),
+        )
+        .aggregate_entries(
+            num_homs=hl.agg.count_where(mt.GT.is_hom_var()),
+            num_hets=hl.agg.count_where(mt.GT.is_het()),
+            defined_sites=hl.agg.count_where(hl.is_defined(mt.GT)),
+        )
+        .result()
+    )