Skip to content

Commit

Permalink
Merge pull request #683 from davmlaw/issue_670_updated_master_gene_symbol_dash_underscore
Browse files Browse the repository at this point in the history

#670 - Gene Symbol support dash and underscore
  • Loading branch information
reece committed Sep 14, 2023
2 parents f3429f1 + e752162 commit df7fb70
Show file tree
Hide file tree
Showing 6 changed files with 170 additions and 87 deletions.
5 changes: 2 additions & 3 deletions src/hgvs/_data/hgvs.pymeta
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,7 @@ pm = :x ?(x in '-+') -> x
snum = <pm? num>:x -> int(x)

# Accessions, possibly versioned. Should accept, e.g., NM_01234.5, LRG_01234_1p1
accn = <letter letterOrDigit+ ('_' letterOrDigit+)? ('.' digit+)?>
accn = <letter ((letterOrDigit | ('-'|'_') ~~letterOrDigit)+)? ('.' digit+)?>
opt_gene_expr = (paren_gene | ->None):gene -> gene
paren_gene = '(' gene_symbol:symbol ')' -> symbol
gene_symbol = <letter letterOrDigit+>

gene_symbol = <letter (letterOrDigit | ('-'|'_') ~~letterOrDigit)+>
197 changes: 120 additions & 77 deletions src/hgvs/generated/hgvs_grammar.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# Changes will be overwritten by the generation script.
# Generated by: sbin/generate_parser.py
# Grammar file: src/hgvs/_data/hgvs.pymeta
# Grammar hash: ebf8644509260c4d1f137fc546dd03f0
# Grammar hash: 3a6ac8d6d2dda7f4a178efe0081659e0
# Parsley version: 1.3
# Python version: 3.10.6 (main, May 29 2023, 11:10:38) [GCC 11.3.0]
# Python version: 3.10.12 (main, Jun 11 2023, 05:26:28) [GCC 11.4.0]
# --------------------------------------------------


Expand Down Expand Up @@ -3188,129 +3188,172 @@ def _G_consumedby_873():
_G_apply_874, lastError = self._apply(self.rule_letter, "letter", [])
self.considerError(lastError, None)

def _G_many1_875():
self._trace("", (12490, 12504), self.input.position)
_G_apply_876, lastError = self._apply(
self.rule_letterOrDigit, "letterOrDigit", []
)
self.considerError(lastError, None)
return (_G_apply_876, self.currentError)

_G_many1_877, lastError = self.many(_G_many1_875, _G_many1_875())
self.considerError(lastError, None)

def _G_optional_878():
self._trace("", (12507, 12510), self.input.position)
_G_exactly_879, lastError = self.exactly("_")
self.considerError(lastError, None)

def _G_many1_880():
self._trace("", (12510, 12524), self.input.position)
_G_apply_881, lastError = self._apply(
self.rule_letterOrDigit, "letterOrDigit", []
)
def _G_optional_875():
def _G_many1_876():
def _G_or_877():
self._trace("", (12493, 12506), self.input.position)
_G_apply_878, lastError = self._apply(self.rule_letterOrDigit, "letterOrDigit", [])
self.considerError(lastError, None)
return (_G_apply_878, self.currentError)

def _G_or_879():
def _G_or_880():
self._trace("", (12510, 12513), self.input.position)
_G_exactly_881, lastError = self.exactly("-")
self.considerError(lastError, None)
return (_G_exactly_881, self.currentError)

def _G_or_882():
self._trace("", (12514, 12517), self.input.position)
_G_exactly_883, lastError = self.exactly("_")
self.considerError(lastError, None)
return (_G_exactly_883, self.currentError)

_G_or_884, lastError = self._or([_G_or_880, _G_or_882])
self.considerError(lastError, None)

def _G_lookahead_885():
self._trace("", (12521, 12534), self.input.position)
_G_apply_886, lastError = self._apply(self.rule_letterOrDigit, "letterOrDigit", [])
self.considerError(lastError, None)
return (_G_apply_886, self.currentError)

_G_lookahead_887, lastError = self.lookahead(_G_lookahead_885)
self.considerError(lastError, None)
return (_G_lookahead_887, self.currentError)

_G_or_888, lastError = self._or([_G_or_877, _G_or_879])
self.considerError(lastError, None)
return (_G_apply_881, self.currentError)
return (_G_or_888, self.currentError)

_G_many1_882, lastError = self.many(_G_many1_880, _G_many1_880())
_G_many1_889, lastError = self.many(_G_many1_876, _G_many1_876())
self.considerError(lastError, None)
return (_G_many1_882, self.currentError)
return (_G_many1_889, self.currentError)

def _G_optional_883():
def _G_optional_890():
return (None, self.input.nullError())

_G_or_884, lastError = self._or([_G_optional_878, _G_optional_883])
_G_or_891, lastError = self._or([_G_optional_875, _G_optional_890])
self.considerError(lastError, None)

def _G_optional_885():
self._trace("", (12529, 12532), self.input.position)
_G_exactly_886, lastError = self.exactly(".")
def _G_optional_892():
self._trace("", (12540, 12543), self.input.position)
_G_exactly_893, lastError = self.exactly(".")
self.considerError(lastError, None)

def _G_many1_887():
self._trace("", (12532, 12538), self.input.position)
_G_apply_888, lastError = self._apply(self.rule_digit, "digit", [])
def _G_many1_894():
self._trace("", (12543, 12549), self.input.position)
_G_apply_895, lastError = self._apply(self.rule_digit, "digit", [])
self.considerError(lastError, None)
return (_G_apply_888, self.currentError)
return (_G_apply_895, self.currentError)

_G_many1_889, lastError = self.many(_G_many1_887, _G_many1_887())
_G_many1_896, lastError = self.many(_G_many1_894, _G_many1_894())
self.considerError(lastError, None)
return (_G_many1_889, self.currentError)
return (_G_many1_896, self.currentError)

def _G_optional_890():
def _G_optional_897():
return (None, self.input.nullError())

_G_or_891, lastError = self._or([_G_optional_885, _G_optional_890])
_G_or_898, lastError = self._or([_G_optional_892, _G_optional_897])
self.considerError(lastError, None)
return (_G_or_891, self.currentError)
return (_G_or_898, self.currentError)

_G_consumedby_892, lastError = self.consumedby(_G_consumedby_873)
_G_consumedby_899, lastError = self.consumedby(_G_consumedby_873)
self.considerError(lastError, "accn")
return (_G_consumedby_892, self.currentError)
return (_G_consumedby_899, self.currentError)

def rule_opt_gene_expr(self):
_locals = {"self": self}
self.locals["opt_gene_expr"] = _locals

def _G_or_893():
self._trace("", (12560, 12570), self.input.position)
_G_apply_894, lastError = self._apply(self.rule_paren_gene, "paren_gene", [])
def _G_or_900():
self._trace("", (12571, 12581), self.input.position)
_G_apply_901, lastError = self._apply(self.rule_paren_gene, "paren_gene", [])
self.considerError(lastError, None)
return (_G_apply_894, self.currentError)
return (_G_apply_901, self.currentError)

def _G_or_895():
_G_python_896, lastError = (None), None
def _G_or_902():
_G_python_903, lastError = (None), None
self.considerError(lastError, None)
return (_G_python_896, self.currentError)
return (_G_python_903, self.currentError)

_G_or_897, lastError = self._or([_G_or_893, _G_or_895])
_G_or_904, lastError = self._or([_G_or_900, _G_or_902])
self.considerError(lastError, "opt_gene_expr")
_locals["gene"] = _G_or_897
_G_python_899, lastError = eval(self._G_expr_898, self.globals, _locals), None
_locals["gene"] = _G_or_904
_G_python_906, lastError = eval(self._G_expr_905, self.globals, _locals), None
self.considerError(lastError, "opt_gene_expr")
return (_G_python_899, self.currentError)
return (_G_python_906, self.currentError)

def rule_paren_gene(self):
_locals = {"self": self}
self.locals["paren_gene"] = _locals
self._trace("", (12606, 12610), self.input.position)
_G_exactly_900, lastError = self.exactly("(")
self._trace("", (12617, 12621), self.input.position)
_G_exactly_907, lastError = self.exactly("(")
self.considerError(lastError, "paren_gene")
self._trace("", (12610, 12622), self.input.position)
_G_apply_901, lastError = self._apply(self.rule_gene_symbol, "gene_symbol", [])
self._trace("", (12621, 12633), self.input.position)
_G_apply_908, lastError = self._apply(self.rule_gene_symbol, "gene_symbol", [])
self.considerError(lastError, "paren_gene")
_locals["symbol"] = _G_apply_901
self._trace("", (12629, 12633), self.input.position)
_G_exactly_902, lastError = self.exactly(")")
_locals["symbol"] = _G_apply_908
self._trace("", (12640, 12644), self.input.position)
_G_exactly_909, lastError = self.exactly(")")
self.considerError(lastError, "paren_gene")
_G_python_904, lastError = eval(self._G_expr_903, self.globals, _locals), None
_G_python_911, lastError = eval(self._G_expr_910, self.globals, _locals), None
self.considerError(lastError, "paren_gene")
return (_G_python_904, self.currentError)
return (_G_python_911, self.currentError)

def rule_gene_symbol(self):
_locals = {"self": self}
self.locals["gene_symbol"] = _locals

def _G_consumedby_905():
self._trace("", (12659, 12665), self.input.position)
_G_apply_906, lastError = self._apply(self.rule_letter, "letter", [])
def _G_consumedby_912():
self._trace("", (12670, 12676), self.input.position)
_G_apply_913, lastError = self._apply(self.rule_letter, "letter", [])
self.considerError(lastError, None)

def _G_many1_907():
self._trace("", (12665, 12679), self.input.position)
_G_apply_908, lastError = self._apply(
self.rule_letterOrDigit, "letterOrDigit", []
)
def _G_many1_914():
def _G_or_915():
self._trace("", (12678, 12691), self.input.position)
_G_apply_916, lastError = self._apply(self.rule_letterOrDigit, "letterOrDigit", [])
self.considerError(lastError, None)
return (_G_apply_916, self.currentError)

def _G_or_917():
def _G_or_918():
self._trace("", (12695, 12698), self.input.position)
_G_exactly_919, lastError = self.exactly("-")
self.considerError(lastError, None)
return (_G_exactly_919, self.currentError)

def _G_or_920():
self._trace("", (12699, 12702), self.input.position)
_G_exactly_921, lastError = self.exactly("_")
self.considerError(lastError, None)
return (_G_exactly_921, self.currentError)

_G_or_922, lastError = self._or([_G_or_918, _G_or_920])
self.considerError(lastError, None)

def _G_lookahead_923():
self._trace("", (12706, 12719), self.input.position)
_G_apply_924, lastError = self._apply(self.rule_letterOrDigit, "letterOrDigit", [])
self.considerError(lastError, None)
return (_G_apply_924, self.currentError)

_G_lookahead_925, lastError = self.lookahead(_G_lookahead_923)
self.considerError(lastError, None)
return (_G_lookahead_925, self.currentError)

_G_or_926, lastError = self._or([_G_or_915, _G_or_917])
self.considerError(lastError, None)
return (_G_apply_908, self.currentError)
return (_G_or_926, self.currentError)

_G_many1_909, lastError = self.many(_G_many1_907, _G_many1_907())
_G_many1_927, lastError = self.many(_G_many1_914, _G_many1_914())
self.considerError(lastError, None)
return (_G_many1_909, self.currentError)
return (_G_many1_927, self.currentError)

_G_consumedby_910, lastError = self.consumedby(_G_consumedby_905)
_G_consumedby_928, lastError = self.consumedby(_G_consumedby_912)
self.considerError(lastError, "gene_symbol")
return (_G_consumedby_910, self.currentError)
return (_G_consumedby_928, self.currentError)

_G_expr_20 = compile(
"hgvs.sequencevariant.SequenceVariant(ac=ac, gene=gene, type=type, posedit=posedit)",
Expand Down Expand Up @@ -3396,8 +3439,8 @@ def _G_many1_907():
_G_expr_831 = compile("x in 'X*'", "<string>", "eval")
_G_expr_846 = compile("int(x)", "<string>", "eval")
_G_expr_861 = compile("x in '-+'", "<string>", "eval")
_G_expr_898 = compile("gene", "<string>", "eval")
_G_expr_903 = compile("symbol", "<string>", "eval")
_G_expr_905 = compile("gene", "<string>", "eval")
_G_expr_910 = compile("symbol", "<string>", "eval")

if Grammar.globals is not None:
Grammar.globals = Grammar.globals.copy()
Expand Down
3 changes: 2 additions & 1 deletion src/hgvs/normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
HGVSInvalidVariantError,
HGVSUnsupportedOperationError,
)
from hgvs.parser import Parser
from hgvs.utils.norm import normalize_alleles

_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -449,6 +448,8 @@ def _normalize_alleles(self, var, boundary):


if __name__ == "__main__":
from hgvs.parser import Parser

hgvsparser = Parser()
var = hgvsparser.parse_hgvs_variant("NM_001166478.1:c.61delG")
hdp = connect()
Expand Down
6 changes: 2 additions & 4 deletions src/hgvs/sequencevariant.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# -*- coding: utf-8 -*-
"""represents simple sequence-based variants
"""
""" represents simple sequence-based variants """

from __future__ import absolute_import, division, print_function, unicode_literals

Expand Down Expand Up @@ -91,7 +89,7 @@ def validate(self):


# <LICENSE>
# Copyright 2018 HGVS Contributors (https://github.com/biocommons/hgvs)
# Copyright 2023 HGVS Contributors (https://github.com/biocommons/hgvs)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
1 change: 0 additions & 1 deletion tests/data/grammar_test.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ accn NM_123456|NC_999999 True list
accn NM_X3|NC_333 True list
accn NM_|3M|33_12345678.9|NM_12345.|NM_*901.3 False list
accn NM|NC True list
accn N|M|3 False list
accn XR_123456.7|XM_999999.9|ZZ_12345678890.0|U14680.1 True list

############################################################
Expand Down
45 changes: 44 additions & 1 deletion tests/test_hgvs_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,51 @@
def test_parser_variants_with_gene_names(parser):
    """Check gene symbols written in parentheses after a transcript accession.

    HGNC approved symbols include non-alphanumeric characters:
    dashes (ADAMTSL4-AS1, TRL-CAA5-1) and underscores (GTF2H2C_2,
    APOBEC3A_B, C4B_2).
    """
    accepted = (
        "NM_01234.5(BOGUS):c.22+1A>T",
        "NM_01234.5(BOGUS-EXCELLENT):c.22+1A>T",  # one dash
        "NM_01234.5(BOGUS-MOST-EXCELLENT):c.22+1A>T",  # several dashes
        "NM_01234.5(BOGUS_EXCELLENT):c.22+1A>T",  # underscore
    )
    for variant in accepted:
        assert parser.parse(variant)

    rejected = (
        "NM_01234.5(1BOGUS):c.22+1A>T",  # Starts with non-alpha
        "NM_01234.5(-BOGUS):c.22+1A>T",  # Starts with non-alpha
        "NM_01234.5(BOGUS-):c.22+1A>T",  # Ends with non-alpha
        "NM_01234.5(BOGUS/EXCELLENT):c.22+1A>T",  # contains invalid character
    )
    for variant in rejected:
        with pytest.raises(hgvs.exceptions.HGVSParseError):
            parser.parse(variant)


def test_parser_variants_with_no_transcript_gene_names(parser):
    """Test it also works with no transcript provided.

    The gene symbol is used directly in front of the ``:`` separator, with
    no transcript accession and no parentheses.
    """

    assert parser.parse("BOGUS:c.22+1A>T")

    # HGNC approved symbols include non-alphanumeric characters:
    # dashes - ADAMTSL4-AS1 or TRL-CAA5-1
    assert parser.parse("BOGUS-EXCELLENT:c.22+1A>T")
    assert parser.parse("BOGUS-MOST-EXCELLENT:c.22+1A>T")

    # underscore - GTF2H2C_2, APOBEC3A_B, C4B_2
    assert parser.parse("BOGUS_EXCELLENT:c.22+1A>T")

    # Each raises-block holds exactly one call: a second call placed after
    # a failing one would never run, leaving its input untested.
    with pytest.raises(hgvs.exceptions.HGVSParseError):
        parser.parse("1BOGUS:c.22+1A>T")  # Starts with non-alpha

    with pytest.raises(hgvs.exceptions.HGVSParseError):
        parser.parse("-BOGUS:c.22+1A>T")  # Starts with non-alpha

    with pytest.raises(hgvs.exceptions.HGVSParseError):
        parser.parse("BOGUS-:c.22+1A>T")  # Ends with non-alpha

    with pytest.raises(hgvs.exceptions.HGVSParseError):
        parser.parse("BOGUS/EXCELLENT:c.22+1A>T")  # contains invalid character


class Test_Parser(unittest.TestCase):
Expand Down

0 comments on commit df7fb70

Please sign in to comment.