From eaa2e3adeb879a7f90753b7bb0b321ad79f159fa Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 13 Jun 2024 21:59:46 -0600 Subject: [PATCH 1/4] Fix bug in process consequences and clean up --- gnomad/utils/vep.py | 161 ++++++++++++++++++++++++++++++++------------ 1 file changed, 119 insertions(+), 42 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index f71c921c4..63cbfafe6 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -270,24 +270,60 @@ def vep_or_lookup_vep( return vep_ht.union(revep_ht) -def add_most_severe_consequence_to_consequence( - tc: hl.expr.StructExpression, -) -> hl.expr.StructExpression: +def get_most_severe_consequence_expr( + csq_expr: hl.expr.ArrayExpression, + csq_order: Optional[List[str]] = None, +) -> hl.expr.StringExpression: """ - Add most_severe_consequence annotation to transcript consequences. + Get the most severe consequence from a collection of consequences. + + This is for a given transcript, as there are often multiple annotations for a single + transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant - This is for a given transcript, as there are often multiple annotations for a single transcript: - e.g. splice_region_variant&intron_variant -> splice_region_variant + :param csq_expr: ArrayExpression of consequences. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. + :return: Most severe consequence in `csq_expr`. """ - csqs = hl.literal(CSQ_ORDER) + if csq_order is None: + csq_order = CSQ_ORDER + csqs = hl.literal(csq_order) - return tc.annotate( - most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c)) - ) + return csqs.find(lambda c: csq_expr.contains(c)) + + +def add_most_severe_consequence_to_consequence( + tc: Union[hl.expr.StructExpression, hl.expr.ArrayExpression], + csq_order: Optional[List[str]] = None, + most_severe_csq_field: str = "most_severe_consequence", +) -> Union[hl.expr.StructExpression, hl.expr.ArrayExpression]: + """ + Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences. + + For a single transcript consequence, `tc` should be a StructExpression with a + `consequence_terms` field, e.g. Struct(consequence_terms=['missense_variant']). + For an array of transcript consequences, `tc` should be an ArrayExpression of + StructExpressions with a `consequence_terms` field. + + :param tc: Transcript consequence or array of transcript consequences to annotate. + :param csq_order: Optional list indicating the order of VEP consequences, sorted + from high to low impact. Default is None, which uses the value of the + `CSQ_ORDER` global. + :param most_severe_csq_field: Field name to use for most severe consequence. Default + is 'most_severe_consequence'. + :return: Transcript consequence or array of transcript consequences annotated with + the most severe consequence. + """ + csq = lambda x: get_most_severe_consequence_expr(x.consequence_terms, csq_order) + if isinstance(tc, hl.expr.StructExpression): + return tc.annotate(**{most_severe_csq_field: csq(tc)}) + else: + return tc.map(lambda x: x.annotate(**{most_severe_csq_field: csq(x)})) def process_consequences( - mt: Union[hl.MatrixTable, hl.Table], + t: Union[hl.MatrixTable, hl.Table], vep_root: str = "vep", penalize_flags: bool = True, csq_order: Optional[List[str]] = None, @@ -298,12 +334,25 @@ def process_consequences( `most_severe_consequence` is the worst consequence for a transcript. + Each transcript consequence is annotated with a `csq_score` which is a combination + of the index of the consequence's `most_severe_consequence` in `csq_order` and a + boost for loss-of-function consequences, and polyphen predictions if `has_polyphen` + is True. + + The score adjustment is as follows: + - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not) + - lof == 'HC' & lof_flags (-500) + - lof == 'OS' (-20) + - lof == 'LC' (-10) + - everything else (0) + .. note:: + From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct in the release HTs. When using this function with gnomAD v4.0 or later, set `has_polyphen` to False. - :param mt: Input Table or MatrixTable. + :param t: Input Table or MatrixTable. :param vep_root: Root for VEP annotation (probably "vep"). :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them as equal to HC. @@ -317,56 +366,84 @@ def process_consequences( if csq_order is None: csq_order = CSQ_ORDER csqs = hl.literal(csq_order) + + # Assign a score to each consequence based on the order in csq_order. csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) - def _csq_score(tc: hl.expr.StructExpression) -> int: - return csq_dict[tc.most_severe_consequence] + def _find_worst_transcript_consequence( + tcl: hl.expr.ArrayExpression, + ) -> hl.expr.StructExpression: + """ + Find the worst transcript consequence in an array of transcript consequences. - flag_score = 500 - no_flag_score = flag_score * (1 + penalize_flags) + :param tcl: Array of transcript consequences. + :return: Worst transcript consequence. + """ + flag = 500 + no_flag = flag * (1 + penalize_flags) - def _csq_score_modifier(tc: hl.expr.StructExpression) -> float: - modifier = _csq_score(tc) - flag_condition = (tc.lof == "HC") & (tc.lof_flags != "") - modifier -= hl.if_else(flag_condition, flag_score, no_flag_score) - modifier -= hl.if_else(tc.lof == "OS", 20, 0) - modifier -= hl.if_else(tc.lof == "LC", 10, 0) - if has_polyphen: - modifier -= ( - hl.case() - .when(tc.polyphen_prediction == "probably_damaging", 0.5) - .when(tc.polyphen_prediction == "possibly_damaging", 0.25) - .when(tc.polyphen_prediction == "benign", 0.1) + # Score each consequence based on the order in csq_order. + score_expr = tcl.map( + lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)] + ) + + # Determine the score adjustment based on the consequence's LOF and LOF flags. + sub_expr = tcl.map( + lambda tc: ( + hl.case(missing_false=True) + .when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag) + .when((tc.lof == "HC") & (tc.lof_flags != ""), flag) + .when(tc.lof == "OS", 20) + .when(tc.lof == "LC", 10) .default(0) ) - return modifier + ) - def find_worst_transcript_consequence( - tcl: hl.expr.ArrayExpression, - ) -> hl.expr.StructExpression: - tcl = tcl.map( - lambda tc: tc.annotate(csq_score=_csq_score(tc) - _csq_score_modifier(tc)) + # If requested, determine the score adjustment based on the consequence's + # PolyPhen prediction. + if has_polyphen: + polyphen_sub_expr = tcl.map( + lambda tc: ( + hl.case(missing_false=True) + .when(tc.polyphen_prediction == "probably_damaging", 0.5) + .when(tc.polyphen_prediction == "possibly_damaging", 0.25) + .when(tc.polyphen_prediction == "benign", 0.1) + .default(0) + ) + ) + sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr) + + # Calculate the final consequence score. + tcl = hl.map( + lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr ) + + # Return the worst consequence based on the calculated score. return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) - transcript_csqs = mt[vep_root].transcript_consequences.map( - add_most_severe_consequence_to_consequence + # Annotate each transcript consequence with the 'most_severe_consequence'. + transcript_csqs = t[vep_root].transcript_consequences.map( + lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order) ) + # Group transcript consequences by gene and find the worst consequence for each. gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol) - worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values() + worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values() sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score) + # Filter transcript consequences to only include canonical transcripts and find the + # worst consequence for each gene. canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1) gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol) worst_csq_gene_canonical = gene_canonical_dict.map_values( - find_worst_transcript_consequence + _find_worst_transcript_consequence ).values() sorted_canonical_scores = hl.sorted( worst_csq_gene_canonical, key=lambda tc: tc.csq_score ) - vep_data = mt[vep_root].annotate( + # Annotate the HT/MT with the worst consequence for each gene and variant. + vep_data = t[vep_root].annotate( transcript_consequences=transcript_csqs, worst_consequence_term=csqs.find( lambda c: transcript_csqs.map( @@ -384,9 +461,9 @@ def find_worst_transcript_consequence( ) return ( - mt.annotate_rows(**{vep_root: vep_data}) - if isinstance(mt, hl.MatrixTable) - else mt.annotate(**{vep_root: vep_data}) + t.annotate_rows(**{vep_root: vep_data}) + if isinstance(t, hl.MatrixTable) + else t.annotate(**{vep_root: vep_data}) ) From 223fa65ca27938a3d21056485df1c076f791bc6d Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Thu, 13 Jun 2024 22:02:09 -0600 Subject: [PATCH 2/4] Use updated add_most_severe_consequence_to_consequence( --- gnomad/utils/vep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 63cbfafe6..5370cdcb4 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -422,8 +422,8 @@ def _find_worst_transcript_consequence( return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) # Annotate each transcript consequence with the 'most_severe_consequence'. - transcript_csqs = t[vep_root].transcript_consequences.map( - lambda tc: add_most_severe_consequence_to_consequence(tc, csq_order) + transcript_csqs = add_most_severe_consequence_to_consequence( + t[vep_root].transcript_consequences, csq_order ) # Group transcript consequences by gene and find the worst consequence for each. From 832e7c57278b2fe8cd74020d756c337b22c278b2 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 14 Jun 2024 07:59:28 -0600 Subject: [PATCH 3/4] Update gnomad/utils/vep.py Co-authored-by: Mike Wilson --- gnomad/utils/vep.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index 5370cdcb4..e22f070c0 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -335,9 +335,9 @@ def process_consequences( `most_severe_consequence` is the worst consequence for a transcript. Each transcript consequence is annotated with a `csq_score` which is a combination - of the index of the consequence's `most_severe_consequence` in `csq_order` and a - boost for loss-of-function consequences, and polyphen predictions if `has_polyphen` - is True. + of the index of the consequence's `most_severe_consequence` in `csq_order` and an + extra deduction for loss-of-function consequences, and polyphen predictions if + `has_polyphen` is True. Lower scores translate to higher severity. The score adjustment is as follows: - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not) From a39146368414012f06961b76043ca26c10a76b19 Mon Sep 17 00:00:00 2001 From: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> Date: Fri, 14 Jun 2024 08:04:09 -0600 Subject: [PATCH 4/4] Format --- gnomad/utils/vep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/utils/vep.py b/gnomad/utils/vep.py index e22f070c0..57c7b7707 100644 --- a/gnomad/utils/vep.py +++ b/gnomad/utils/vep.py @@ -336,8 +336,8 @@ def process_consequences( Each transcript consequence is annotated with a `csq_score` which is a combination of the index of the consequence's `most_severe_consequence` in `csq_order` and an - extra deduction for loss-of-function consequences, and polyphen predictions if - `has_polyphen` is True. Lower scores translate to higher severity. + extra deduction for loss-of-function consequences, and polyphen predictions if + `has_polyphen` is True. Lower scores translate to higher severity. The score adjustment is as follows: - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not)