-
Notifications
You must be signed in to change notification settings - Fork 31
Fix bug in process_consequences that was introduced when adding support for VEP without polyphen
#710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix bug in process_consequences that was introduced when adding support for VEP without polyphen
#710
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -270,24 +270,60 @@ def vep_or_lookup_vep( | |
| return vep_ht.union(revep_ht) | ||
|
|
||
|
|
||
| def add_most_severe_consequence_to_consequence( | ||
| tc: hl.expr.StructExpression, | ||
| ) -> hl.expr.StructExpression: | ||
| def get_most_severe_consequence_expr( | ||
| csq_expr: hl.expr.ArrayExpression, | ||
| csq_order: Optional[List[str]] = None, | ||
| ) -> hl.expr.StringExpression: | ||
| """ | ||
| Add most_severe_consequence annotation to transcript consequences. | ||
| Get the most severe consequence from a collection of consequences. | ||
|
|
||
| This is for a given transcript, as there are often multiple annotations for a single | ||
| transcript: e.g. splice_region_variant&intron_variant -> splice_region_variant | ||
|
|
||
| This is for a given transcript, as there are often multiple annotations for a single transcript: | ||
| e.g. splice_region_variant&intron_variant -> splice_region_variant | ||
| :param csq_expr: ArrayExpression of consequences. | ||
| :param csq_order: Optional list indicating the order of VEP consequences, sorted | ||
| from high to low impact. Default is None, which uses the value of the | ||
| `CSQ_ORDER` global. | ||
| :return: Most severe consequence in `csq_expr`. | ||
| """ | ||
| csqs = hl.literal(CSQ_ORDER) | ||
| if csq_order is None: | ||
| csq_order = CSQ_ORDER | ||
| csqs = hl.literal(csq_order) | ||
|
|
||
| return tc.annotate( | ||
| most_severe_consequence=csqs.find(lambda c: tc.consequence_terms.contains(c)) | ||
| ) | ||
| return csqs.find(lambda c: csq_expr.contains(c)) | ||
|
|
||
|
|
||
| def add_most_severe_consequence_to_consequence( | ||
| tc: Union[hl.expr.StructExpression, hl.expr.ArrayExpression], | ||
| csq_order: Optional[List[str]] = None, | ||
| most_severe_csq_field: str = "most_severe_consequence", | ||
| ) -> Union[hl.expr.StructExpression, hl.expr.ArrayExpression]: | ||
| """ | ||
| Add a `most_severe_consequence` field to a transcript consequence or array of transcript consequences. | ||
|
|
||
| For a single transcript consequence, `tc` should be a StructExpression with a | ||
| `consequence_terms` field, e.g. Struct(consequence_terms=['missense_variant']). | ||
| For an array of transcript consequences, `tc` should be an ArrayExpression of | ||
| StructExpressions with a `consequence_terms` field. | ||
|
|
||
| :param tc: Transcript consequence or array of transcript consequences to annotate. | ||
| :param csq_order: Optional list indicating the order of VEP consequences, sorted | ||
| from high to low impact. Default is None, which uses the value of the | ||
| `CSQ_ORDER` global. | ||
| :param most_severe_csq_field: Field name to use for most severe consequence. Default | ||
| is 'most_severe_consequence'. | ||
| :return: Transcript consequence or array of transcript consequences annotated with | ||
| the most severe consequence. | ||
| """ | ||
| csq = lambda x: get_most_severe_consequence_expr(x.consequence_terms, csq_order) | ||
| if isinstance(tc, hl.expr.StructExpression): | ||
| return tc.annotate(**{most_severe_csq_field: csq(tc)}) | ||
| else: | ||
| return tc.map(lambda x: x.annotate(**{most_severe_csq_field: csq(x)})) | ||
|
|
||
|
|
||
| def process_consequences( | ||
| mt: Union[hl.MatrixTable, hl.Table], | ||
| t: Union[hl.MatrixTable, hl.Table], | ||
| vep_root: str = "vep", | ||
| penalize_flags: bool = True, | ||
| csq_order: Optional[List[str]] = None, | ||
|
|
@@ -298,12 +334,25 @@ def process_consequences( | |
|
|
||
| `most_severe_consequence` is the worst consequence for a transcript. | ||
|
|
||
| Each transcript consequence is annotated with a `csq_score` which is a combination | ||
| of the index of the consequence's `most_severe_consequence` in `csq_order` and an | ||
| extra deduction for loss-of-function consequences, and polyphen predictions if | ||
| `has_polyphen` is True. Lower scores translate to higher severity. | ||
|
|
||
| The score adjustment is as follows: | ||
| - lof == 'HC' & NO lof_flags (-1000 if penalize_flags, -500 if not) | ||
| - lof == 'HC' & lof_flags (-500) | ||
| - lof == 'OS' (-20) | ||
| - lof == 'LC' (-10) | ||
| - everything else (0) | ||
|
|
||
| .. note:: | ||
|
|
||
| From gnomAD v4.0 on, the PolyPhen annotation was removed from the VEP Struct | ||
| in the release HTs. When using this function with gnomAD v4.0 or later, | ||
| set `has_polyphen` to False. | ||
|
|
||
| :param mt: Input Table or MatrixTable. | ||
| :param t: Input Table or MatrixTable. | ||
| :param vep_root: Root for VEP annotation (probably "vep"). | ||
| :param penalize_flags: Whether to penalize LOFTEE flagged variants, or treat them | ||
| as equal to HC. | ||
|
|
@@ -317,56 +366,84 @@ def process_consequences( | |
| if csq_order is None: | ||
| csq_order = CSQ_ORDER | ||
| csqs = hl.literal(csq_order) | ||
|
|
||
| # Assign a score to each consequence based on the order in csq_order. | ||
| csq_dict = hl.literal(dict(zip(csq_order, range(len(csq_order))))) | ||
|
|
||
| def _csq_score(tc: hl.expr.StructExpression) -> int: | ||
| return csq_dict[tc.most_severe_consequence] | ||
| def _find_worst_transcript_consequence( | ||
| tcl: hl.expr.ArrayExpression, | ||
| ) -> hl.expr.StructExpression: | ||
| """ | ||
| Find the worst transcript consequence in an array of transcript consequences. | ||
|
|
||
| flag_score = 500 | ||
| no_flag_score = flag_score * (1 + penalize_flags) | ||
| :param tcl: Array of transcript consequences. | ||
| :return: Worst transcript consequence. | ||
| """ | ||
| flag = 500 | ||
| no_flag = flag * (1 + penalize_flags) | ||
|
|
||
| def _csq_score_modifier(tc: hl.expr.StructExpression) -> float: | ||
| modifier = _csq_score(tc) | ||
| flag_condition = (tc.lof == "HC") & (tc.lof_flags != "") | ||
| modifier -= hl.if_else(flag_condition, flag_score, no_flag_score) | ||
| modifier -= hl.if_else(tc.lof == "OS", 20, 0) | ||
| modifier -= hl.if_else(tc.lof == "LC", 10, 0) | ||
| if has_polyphen: | ||
| modifier -= ( | ||
| hl.case() | ||
| .when(tc.polyphen_prediction == "probably_damaging", 0.5) | ||
| .when(tc.polyphen_prediction == "possibly_damaging", 0.25) | ||
| .when(tc.polyphen_prediction == "benign", 0.1) | ||
| # Score each consequence based on the order in csq_order. | ||
| score_expr = tcl.map( | ||
| lambda tc: csq_dict[csqs.find(lambda x: x == tc.most_severe_consequence)] | ||
| ) | ||
|
|
||
| # Determine the score adjustment based on the consequence's LOF and LOF flags. | ||
| sub_expr = tcl.map( | ||
| lambda tc: ( | ||
| hl.case(missing_false=True) | ||
| .when((tc.lof == "HC") & hl.or_else(tc.lof_flags == "", True), no_flag) | ||
| .when((tc.lof == "HC") & (tc.lof_flags != ""), flag) | ||
|
Comment on lines
+394
to
+395
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know this is unchanged from before but since no_flag and flag are only used once and you cant pass a value for flag, I think its more difficult to read with the variables, because you need to go back up to the assignment. I'd just do 500, 500 /(1 + penalize_flags). IF you agree with the value adjustment above.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just going to leave as is for now since my next PR will completely remove the use of the scores |
||
| .when(tc.lof == "OS", 20) | ||
| .when(tc.lof == "LC", 10) | ||
| .default(0) | ||
| ) | ||
| return modifier | ||
| ) | ||
|
|
||
| def find_worst_transcript_consequence( | ||
| tcl: hl.expr.ArrayExpression, | ||
| ) -> hl.expr.StructExpression: | ||
| tcl = tcl.map( | ||
| lambda tc: tc.annotate(csq_score=_csq_score(tc) - _csq_score_modifier(tc)) | ||
| # If requested, determine the score adjustment based on the consequence's | ||
| # PolyPhen prediction. | ||
| if has_polyphen: | ||
| polyphen_sub_expr = tcl.map( | ||
| lambda tc: ( | ||
| hl.case(missing_false=True) | ||
| .when(tc.polyphen_prediction == "probably_damaging", 0.5) | ||
| .when(tc.polyphen_prediction == "possibly_damaging", 0.25) | ||
| .when(tc.polyphen_prediction == "benign", 0.1) | ||
| .default(0) | ||
| ) | ||
| ) | ||
| sub_expr = hl.map(lambda s, ps: s + ps, sub_expr, polyphen_sub_expr) | ||
|
|
||
| # Calculate the final consequence score. | ||
| tcl = hl.map( | ||
| lambda tc, s, ss: tc.annotate(csq_score=s - ss), tcl, score_expr, sub_expr | ||
| ) | ||
|
|
||
| # Return the worst consequence based on the calculated score. | ||
| return hl.or_missing(hl.len(tcl) > 0, hl.sorted(tcl, lambda x: x.csq_score)[0]) | ||
|
|
||
| transcript_csqs = mt[vep_root].transcript_consequences.map( | ||
| add_most_severe_consequence_to_consequence | ||
| # Annotate each transcript consequence with the 'most_severe_consequence'. | ||
| transcript_csqs = add_most_severe_consequence_to_consequence( | ||
| t[vep_root].transcript_consequences, csq_order | ||
| ) | ||
|
|
||
| # Group transcript consequences by gene and find the worst consequence for each. | ||
| gene_dict = transcript_csqs.group_by(lambda tc: tc.gene_symbol) | ||
| worst_csq_gene = gene_dict.map_values(find_worst_transcript_consequence).values() | ||
| worst_csq_gene = gene_dict.map_values(_find_worst_transcript_consequence).values() | ||
| sorted_scores = hl.sorted(worst_csq_gene, key=lambda tc: tc.csq_score) | ||
|
|
||
| # Filter transcript consequences to only include canonical transcripts and find the | ||
| # worst consequence for each gene. | ||
| canonical = transcript_csqs.filter(lambda csq: csq.canonical == 1) | ||
| gene_canonical_dict = canonical.group_by(lambda tc: tc.gene_symbol) | ||
| worst_csq_gene_canonical = gene_canonical_dict.map_values( | ||
| find_worst_transcript_consequence | ||
| _find_worst_transcript_consequence | ||
| ).values() | ||
| sorted_canonical_scores = hl.sorted( | ||
| worst_csq_gene_canonical, key=lambda tc: tc.csq_score | ||
| ) | ||
|
|
||
| vep_data = mt[vep_root].annotate( | ||
| # Annotate the HT/MT with the worst consequence for each gene and variant. | ||
| vep_data = t[vep_root].annotate( | ||
| transcript_consequences=transcript_csqs, | ||
| worst_consequence_term=csqs.find( | ||
| lambda c: transcript_csqs.map( | ||
|
|
@@ -384,9 +461,9 @@ def find_worst_transcript_consequence( | |
| ) | ||
|
|
||
| return ( | ||
| mt.annotate_rows(**{vep_root: vep_data}) | ||
| if isinstance(mt, hl.MatrixTable) | ||
| else mt.annotate(**{vep_root: vep_data}) | ||
| t.annotate_rows(**{vep_root: vep_data}) | ||
| if isinstance(t, hl.MatrixTable) | ||
| else t.annotate(**{vep_root: vep_data}) | ||
| ) | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also I know this is from before but since were in the process of revamping this module, I think the language around and then how we handle the logic in relation to those flags is counterintuitive, i.e the penalize_flag logic is actually a no_flag booster. These values seem arbitrary and I cant imagine anyone is actually using these score values themselves since LOF is deducted so far down. It would be more clear if the scores were no_flag=500, flag = 500/(1+penalize_flags).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I'm actually completely removing the scores in my next PR