From eaa2e3adeb879a7f90753b7bb0b321ad79f159fa Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 13 Jun 2024 21:59:46 -0600
Subject: [PATCH 1/4] Fix bug in process consequences and clean up

---
 gnomad/utils/vep.py | 161 ++++++++++++++++++++++++++++++++------------
 1 file changed, 119 insertions(+), 42 deletions(-)

diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py
index f71c921c4..63cbfafe6 100644
--- a/gnomad/utils/vep.py
+++ b/gnomad/utils/vep.py
@@ -270,24 +270,60 @@ def vep_or_lookup_vep(
     return vep_ht.union(revep_ht)
 
 
-def add_most_severe_consequence_to_consequence(
-    tc: hl.expr.StructExpression,
-) -> hl.expr.StructExpression:
+def get_most_severe_consequence_expr(
+    csq_expr: hl.expr.ArrayExpression,
+    csq_order: Optional[List[str]] = None,
+) -> hl.expr.StringExpression:
     """
-    Add most_severe_consequence annotation to transcript consequences.
+    Get the most severe consequence from a collection of consequences.
+
+    This is for a given transcript, as there are often multiple annotations for a single
+    transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant
 
-    This is for a given transcript, as there are often multiple annotations for a single transcript:
-    e.g. splice_region_variant&intron_variant -> splice_region_variant
+    :param csq_expr: ArrayExpression of consequences.
+    :param csq_order: Optional list indicating the order of VEP consequences, sorted
+        from high to low impact. Default is None, which uses the value of the
+        `CSQ_ORDER` global.
+    :return: Most severe consequence in `csq_expr`.
     """
-    csqs = hl.literal(CSQ_ORDER)
+    if csq_order is None:
+        csq_order = CSQ_ORDER
+    csqs = hl.literal(csq_order)
 
-    return tc.annotate(
-        most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c))
-    )
+    return csqs.find(lambda c: csq_expr.contains(c))
+
+
+def add_most_severe_consequence_to_consequence(
+    tc: Union[hl.expr.StructExpression, hl.expr.ArrayExpression],
+    csq_order: Optional[List[str]] = None,
+    most_severe_csq_field: str = "most_severe_consequence",
+) -> Union[hl.expr.StructExpression, hl.expr.ArrayExpression]:
+    """
+    Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences.
+
+    For a single transcript consequence, `tc` should be a StructExpression with a
+    `consequence_terms` field, e.g. Struct(consequence_terms=['missense_variant']).
+    For an array of transcript consequences, `tc` should be an ArrayExpression of
+    StructExpressions with a `consequence_terms` field.
+
+    :param tc: Transcript consequence or array of transcript consequences to annotate.
+    :param csq_order: Optional list indicating the order of VEP consequences, sorted
+        from high to low impact. Default is None, which uses the value of the
+        `CSQ_ORDER` global.
+    :param most_severe_csq_field: Field name to use for most severe consequence. Default
+        is 'most_severe_consequence'.
+    :return: Transcript consequence or array of transcript consequences annotated with
+        the most severe consequence.
+    """
+    csq = lambda x: get_most_severe_consequence_expr(x.consequence_terms, csq_order)
+    if isinstance(tc, hl.expr.StructExpression):
+        return tc.annotate(**{most_severe_csq_field: csq(tc)})
+    else:
+        return tc.map(lambda x: x.annotate(**{most_severe_csq_field: csq(x)}))
 
 
 def process_consequences(
-    mt: Union[hl.MatrixTable, hl.Table],
+    t: Union[hl.MatrixTable, hl.Table],
     vep_root: str = "vep",
     penalize_flags: bool = True,
     csq_order: Optional[List[str]] = None,
@@ -298,12 +334,25 @@ def process_consequences(
 
     `most_severe_consequence` is the worst consequence for a transcript.
 
+    Each transcript consequence is annotated with a `csq_score` which is a combination
+    of the index of the consequence's `most_severe_consequence` in `csq_order` and a
+    boost for loss-of-function consequences, and polyphen predictions if `has_polyphen`
+    is True.
+
+    The score adjustment is as follows:
+        - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not)
+        - lof == 'HC' & lof_flags (-500)
+        - lof == 'OS' (-20)
+        - lof == 'LC' (-10)
+        - everything else (0)
+
     .. note::
+
         From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct
         in the release HTs. When using this function with gnomAD v4.0 or later,
         set `has_polyphen` to False.
 
-    :param mt: Input Table or MatrixTable.
+    :param t: Input Table or MatrixTable.
     :param vep_root: Root for VEP annotation (probably "vep").
     :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them
         as equal to HC.
@@ -317,56 +366,84 @@ def process_consequences(
     if csq_order is None:
         csq_order = CSQ_ORDER
     csqs = hl.literal(csq_order)
+
+    # Assign a score to each consequence based on the order in csq_order.
     csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order)))))
 
-    def _csq_score(tc: hl.expr.StructExpression) -> int:
-        return csq_dict[tc.most_severe_consequence]
+    def _find_worst_transcript_consequence(
+        tcl: hl.expr.ArrayExpression,
+    ) -> hl.expr.StructExpression:
+        """
+        Find the worst transcript consequence in an array of transcript consequences.
 
-    flag_score = 500
-    no_flag_score = flag_score * (1 + penalize_flags)
+        :param tcl: Array of transcript consequences.
+        :return: Worst transcript consequence.
+        """
+        flag = 500
+        no_flag = flag * (1 + penalize_flags)
 
-    def _csq_score_modifier(tc: hl.expr.StructExpression) -> float:
-        modifier = _csq_score(tc)
-        flag_condition = (tc.lof == "HC") & (tc.lof_flags != "")
-        modifier -= hl.if_else(flag_condition, flag_score, no_flag_score)
-        modifier -= hl.if_else(tc.lof == "OS", 20, 0)
-        modifier -= hl.if_else(tc.lof == "LC", 10, 0)
-        if has_polyphen:
-            modifier -= (
-                hl.case()
-                .when(tc.polyphen_prediction == "probably_damaging", 0.5)
-                .when(tc.polyphen_prediction == "possibly_damaging", 0.25)
-                .when(tc.polyphen_prediction == "benign", 0.1)
+        # Score each consequence based on the order in csq_order.
+        score_expr = tcl.map(
+            lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)]
+        )
+
+        # Determine the score adjustment based on the consequence's LOF and LOF flags.
+        sub_expr = tcl.map(
+            lambda tc: (
+                hl.case(missing_false=True)
+                .when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag)
+                .when((tc.lof == "HC") & (tc.lof_flags != ""), flag)
+                .when(tc.lof == "OS", 20)
+                .when(tc.lof == "LC", 10)
                 .default(0)
             )
-        return modifier
+        )
 
-    def find_worst_transcript_consequence(
-        tcl: hl.expr.ArrayExpression,
-    ) -> hl.expr.StructExpression:
-        tcl = tcl.map(
-            lambda tc: tc.annotate(csq_score=_csq_score(tc) - _csq_score_modifier(tc))
+        # If requested, determine the score adjustment based on the consequence's
+        # PolyPhen prediction.
+        if has_polyphen:
+            polyphen_sub_expr = tcl.map(
+                lambda tc: (
+                    hl.case(missing_false=True)
+                    .when(tc.polyphen_prediction == "probably_damaging", 0.5)
+                    .when(tc.polyphen_prediction == "possibly_damaging", 0.25)
+                    .when(tc.polyphen_prediction == "benign", 0.1)
+                    .default(0)
+                )
+            )
+            sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr)
+
+        # Calculate the final consequence score.
+        tcl = hl.map(
+            lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr
         )
+
+        # Return the worst consequence based on the calculated score.
         return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0])
 
-    transcript_csqs = mt[vep_root].transcript_consequences.map(
-        add_most_severe_consequence_to_consequence
+    # Annotate each transcript consequence with the 'most_severe_consequence'.
+    transcript_csqs = t[vep_root].transcript_consequences.map(
+        lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order)
     )
 
+    # Group transcript consequences by gene and find the worst consequence for each.
     gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol)
-    worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values()
+    worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values()
     sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score)
 
+    # Filter transcript consequences to only include canonical transcripts and find the
+    # worst consequence for each gene.
     canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1)
     gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol)
     worst_csq_gene_canonical = gene_canonical_dict.map_values(
-        find_worst_transcript_consequence
+        _find_worst_transcript_consequence
     ).values()
     sorted_canonical_scores = hl.sorted(
         worst_csq_gene_canonical, key=lambda tc: tc.csq_score
     )
 
-    vep_data = mt[vep_root].annotate(
+    # Annotate the HT/MT with the worst consequence for each gene and variant.
+    vep_data = t[vep_root].annotate(
         transcript_consequences=transcript_csqs,
         worst_consequence_term=csqs.find(
             lambda c: transcript_csqs.map(
@@ -384,9 +461,9 @@ def find_worst_transcript_consequence(
     )
 
     return (
-        mt.annotate_rows(**{vep_root: vep_data})
-        if isinstance(mt, hl.MatrixTable)
-        else mt.annotate(**{vep_root: vep_data})
+        t.annotate_rows(**{vep_root: vep_data})
+        if isinstance(t, hl.MatrixTable)
+        else t.annotate(**{vep_root: vep_data})
     )
 
 

From 223fa65ca27938a3d21056485df1c076f791bc6d Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Thu, 13 Jun 2024 22:02:09 -0600
Subject: [PATCH 2/4] Use updated add_most_severe_consequence_to_consequence(

---
 gnomad/utils/vep.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py
index 63cbfafe6..5370cdcb4 100644
--- a/gnomad/utils/vep.py
+++ b/gnomad/utils/vep.py
@@ -422,8 +422,8 @@ def _find_worst_transcript_consequence(
         return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0])
 
     # Annotate each transcript consequence with the 'most_severe_consequence'.
-    transcript_csqs = t[vep_root].transcript_consequences.map(
-        lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order)
+    transcript_csqs = add_most_severe_consequence_to_consequence(
+        t[vep_root].transcript_consequences, csq_order
     )
 
     # Group transcript consequences by gene and find the worst consequence for each.

From 832e7c57278b2fe8cd74020d756c337b22c278b2 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 14 Jun 2024 07:59:28 -0600
Subject: [PATCH 3/4] Update gnomad/utils/vep.py

Co-authored-by: Mike Wilson <mwilson@broadinstitute.org>
---
 gnomad/utils/vep.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py
index 5370cdcb4..e22f070c0 100644
--- a/gnomad/utils/vep.py
+++ b/gnomad/utils/vep.py
@@ -335,9 +335,9 @@ def process_consequences(
     `most_severe_consequence` is the worst consequence for a transcript.
 
     Each transcript consequence is annotated with a `csq_score` which is a combination
-    of the index of the consequence's `most_severe_consequence` in `csq_order` and a
-    boost for loss-of-function consequences, and polyphen predictions if `has_polyphen`
-    is True.
+    of the index of the consequence's `most_severe_consequence` in `csq_order` and an
+    extra deduction for loss-of-function consequences, and polyphen predictions if 
+    `has_polyphen` is True. Lower scores translate to higher severity. 
 
     The score adjustment is as follows:
         - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not)

From a39146368414012f06961b76043ca26c10a76b19 Mon Sep 17 00:00:00 2001
From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com>
Date: Fri, 14 Jun 2024 08:04:09 -0600
Subject: [PATCH 4/4] Format

---
 gnomad/utils/vep.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py
index e22f070c0..57c7b7707 100644
--- a/gnomad/utils/vep.py
+++ b/gnomad/utils/vep.py
@@ -336,8 +336,8 @@ def process_consequences(
 
     Each transcript consequence is annotated with a `csq_score` which is a combination
     of the index of the consequence's `most_severe_consequence` in `csq_order` and an
-    extra deduction for loss-of-function consequences, and polyphen predictions if 
-    `has_polyphen` is True. Lower scores translate to higher severity. 
+    extra deduction for loss-of-function consequences, and polyphen predictions if
+    `has_polyphen` is True. Lower scores translate to higher severity.
 
     The score adjustment is as follows:
         - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not)