Skip to content

Commit

Permalink
Merge pull request #114 from bcgsc/feature/GERO-296_tmbur_high_matches
Browse files Browse the repository at this point in the history
Feature/gero 296 tmbur high matches
  • Loading branch information
dustinbleile committed May 16, 2023
2 parents 07678d7 + 430205b commit bc7fc1f
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 17 deletions.
55 changes: 51 additions & 4 deletions ipr/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
from graphkb.match import INPUT_COPY_CATEGORIES
from graphkb.types import Variant
from graphkb.util import FeatureNotFoundError, convert_to_rid_list
from pandas import isnull
from progressbar import progressbar
from typing import Any, Dict, List, Sequence, Set, cast

from .constants import FAILED_REVIEW_STATUS
from .constants import FAILED_REVIEW_STATUS, TMB_HIGH_CATEGORY
from .ipr import convert_statements_to_alterations
from .types import (
GkbStatement,
Expand Down Expand Up @@ -328,7 +329,8 @@ def annotate_positional_variants(

for var_key in VARIANT_KEYS:
variant = row.get(var_key)
if not variant:
matches = []
if not variant or isnull(variant):
continue
try:
try:
Expand All @@ -349,12 +351,14 @@ def annotate_positional_variants(
matches = gkb_match.match_positional_variant(graphkb_conn, variant)

# GERO-299 - check for conflicting nonsense and missense categories

missense = [
m for m in matches if 'missense' in m.get('type', m).get('displayName', '')
]
nonsense = [
m for m in matches if 'nonsense' in m.get('type', m).get('displayName', '')
]

missense_cat = [m for m in missense if m.get('@class', '') == 'CategoryVariant']
nonsense_cat = [m for m in nonsense if m.get('@class', '') == 'CategoryVariant']
if missense_cat and nonsense_cat:
Expand Down Expand Up @@ -399,6 +403,9 @@ def annotate_positional_variants(
matches = [
m for m in matches if m not in missense_cat and m not in nonsense_cat
]
elif nonsense_cat and ':c.' in variant:
logger.error(f"GERO-304 - dropping nonsense variants from hgvsCds {variant}")
matches = [m for m in matches if m not in nonsense_cat]

for ipr_row in get_ipr_statements_from_variants(
graphkb_conn, matches, disease_name
Expand Down Expand Up @@ -444,8 +451,8 @@ def annotate_positional_variants(

def annotate_msi(
graphkb_conn: GraphKBConnection,
msi_category: str,
disease_name: str,
disease_name: str = 'cancer',
msi_category: str = 'microsatellite instability',
) -> List[KbMatch]:
"""Annotate microsatellite instablity from GraphKB in the IPR alterations format.
Expand Down Expand Up @@ -476,3 +483,43 @@ def annotate_msi(
ipr_row['variantType'] = 'msi'
gkb_matches.append(ipr_row)
return gkb_matches


def annotate_tmb(
    graphkb_conn: GraphKBConnection,
    disease_name: str = 'cancer',
    category: str = TMB_HIGH_CATEGORY,
) -> List[KbMatch]:
    """Annotate a Tumour Mutation Burden (tmb) category from GraphKB in the IPR alterations format.

    Runs a 'similarTo' query for GraphKB CategoryVariant records whose
    reference1 is a Signature with a name or displayName equal to ``category``,
    then hands the matched variants to get_ipr_statements_from_variants and
    tags every returned row as a tmb match.

    Args:
        graphkb_conn: the graphkb api connection object
        disease_name: oncotree disease name for graphkb matching.
        category: signature category name, such as 'high mutation burden'

    Returns:
        list of kbMatches records for IPR; each row has 'variant' set to
        ``category`` and 'variantType' set to 'tmb'. Empty when the query
        returns nothing.
    """
    # Filter on the Signature that the category variant refers to.
    signature_filter = {
        'target': 'Signature',
        'filters': {'OR': [{'name': category}, {'displayName': category}]},
    }
    category_query = {
        'target': {
            'target': 'CategoryVariant',
            'filters': {'reference1': signature_filter},
        },
        'queryType': 'similarTo',
        'returnProperties': ['@rid', 'displayName'],
    }
    tmb_variants = graphkb_conn.query(category_query)

    tmb_rows = []
    if not tmb_variants:
        # No matching category variants: skip the statement lookup entirely.
        return tmb_rows

    for kb_row in get_ipr_statements_from_variants(graphkb_conn, tmb_variants, disease_name):
        kb_row['variant'] = category
        kb_row['variantType'] = 'tmb'
        tmb_rows.append(kb_row)
    return tmb_rows
3 changes: 3 additions & 0 deletions ipr/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@

# all possible values for review status are: ['pending', 'not required', 'passed', 'failed', 'initial']
FAILED_REVIEW_STATUS = 'failed'

# Threshold in genomic mutations per mb. create_report sums the genome indel
# and snv tmb values and treats a total at or above this value as high.
TMB_HIGH = 10.0
# Category name used for the GraphKB signature lookup (annotate_tmb) and as the
# default key / kbCategory of the tmb variant when the burden is high.
TMB_HIGH_CATEGORY = 'high mutation burden'
16 changes: 16 additions & 0 deletions ipr/content.spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,22 @@
"null"
]
},
"rnaAltCount": {
"description": "the number of alternate reads in the rna supporting the mutation",
"example": 1,
"type": [
"integer",
"null"
]
},
"rnaDepth": {
"description": "the total number of reads at this position in the rna",
"example": 2,
"type": [
"integer",
"null"
]
},
"svg": {
"description": "svg image file content for this SV",
"type": [
Expand Down
3 changes: 2 additions & 1 deletion ipr/inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@
'highQuality',
'comments',
'library',
# GERO-307 - tumourAltCount and tumourDepth are available but not rnaAltCount and rnaDepth
'rnaAltCount',
'rnaDepth',
'tumourAltCount',
'tumourDepth',
'germline',
Expand Down
41 changes: 36 additions & 5 deletions ipr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@
annotate_expression_variants,
annotate_msi,
annotate_positional_variants,
annotate_tmb,
get_gene_information,
)
from .connection import IprConnection
from .constants import DEFAULT_URL
from .constants import DEFAULT_URL, TMB_HIGH, TMB_HIGH_CATEGORY
from .inputs import (
check_comparators,
check_variant_links,
Expand Down Expand Up @@ -198,10 +199,38 @@ def create_report(
gkb_matches: List[KbMatch] = []

# Signature category variants
tmb_variant: IprVariant = {}
tmb_matches = []
if 'tmburMutationBurden' in content.keys():
logger.warning(
'GERO-296 - not yet implemented - high tumour mutation burden category matching.'
)
tmb_val = 0.0
tmb = {}
try:
tmb = content.get('tmburMutationBurden', {})
tmb_val = tmb['genomeIndelTmb'] + tmb['genomeSnvTmb']
except Exception as err:
logger.error(f"tmburMutationBurden parsing failure: {err}")

if tmb_val >= TMB_HIGH:
logger.warning(
f'GERO-296 - tmburMutationBurden high -checking graphkb matches for {TMB_HIGH_CATEGORY}'
)
if not tmb.get('key'):
tmb['key'] = TMB_HIGH_CATEGORY
if not tmb.get('kbCategory'):
tmb['kbCategory'] = TMB_HIGH_CATEGORY

# GERO-296 - try matching to graphkb
tmb_matches = annotate_tmb(graphkb_conn, kb_disease_match, TMB_HIGH_CATEGORY)
if tmb_matches:
tmb_variant['kbCategory'] = TMB_HIGH_CATEGORY # type: ignore
tmb_variant['variant'] = TMB_HIGH_CATEGORY
tmb_variant['key'] = tmb['key']
tmb_variant['variantType'] = 'tmb'
logger.info(
f"GERO-296 '{TMB_HIGH_CATEGORY}' matches {len(tmb_matches)} statements."
)
gkb_matches.extend(tmb_matches)
logger.debug(f"\tgkb_matches: {len(gkb_matches)}")

msi = content.get('msi', [])
msi_matches = []
Expand All @@ -216,7 +245,7 @@ def create_report(
msi_cat = msi.get('kbCategory')
msi_variant = msi.copy()
logger.info(f'Matching GKB msi {msi_cat}')
msi_matches = annotate_msi(graphkb_conn, msi_cat, kb_disease_match)
msi_matches = annotate_msi(graphkb_conn, kb_disease_match, msi_cat)
if msi_matches:
msi_variant['kbCategory'] = msi_cat # type: ignore
msi_variant['variant'] = msi_cat
Expand Down Expand Up @@ -262,6 +291,8 @@ def create_report(
all_variants = expression_variants + copy_variants + structural_variants + small_mutations # type: ignore
if msi_matches:
all_variants.append(msi_variant) # type: ignore
if tmb_matches:
all_variants.append(tmb_variant) # type: ignore

if match_germline: # verify germline kb statements matched germline observed variants
gkb_matches = germline_kb_matches(gkb_matches, all_variants)
Expand Down
9 changes: 2 additions & 7 deletions tests/test_annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,6 @@ def test_annotate_structural_variants_tp53(graphkb_conn):
disease = 'cancer'
ref_key = 'prot_only'
pref = annotate_positional_variants(graphkb_conn, [TP53_MUT_DICT[ref_key]], disease)
known_issues = set(['TP53:p.M237X']) # SDEV-3122 -
# GERO-299 - nonsense - stop codon - should not match. This is missense not nonsense (#164:933).
nonsense = [a for a in pref if a['kbVariant'] == 'TP53 nonsense']
assert not nonsense
Expand All @@ -117,15 +116,11 @@ def test_annotate_structural_variants_tp53(graphkb_conn):
diff = pref_vars.symmetric_difference(alt_vars)
missing = pref_vars.difference(alt_vars)

known_issues = set([])
if 'hgvsCds' in alt_rep:
known_issues.add('TP53 nonsense') # GERO-299
if 'p.M237' not in alt_rep:
known_issues.add('TP53:p.M237X') # SDEV-3122 - not matching imprecise mutations
known_issues = set()
if key == 'genome_only':
# genome_only matched to more precise type 'TP53 deleterious mutation' but not 'TP53 mutation'
known_issues.add('TP53 mutation')

# strangely genome_only matched to more precise type 'TP53 deleterious mutation' but not 'TP53 mutation'
missing = pref_vars.difference(alt_vars).difference(known_issues)
print(alt_vars)
assert not missing, f"{key} missing{missing}: {diff}"
Expand Down

0 comments on commit bc7fc1f

Please sign in to comment.