In [1]:
from os import environ
# Runtime configuration for the `variation` package. These are set before the
# `variation` imports below; NOTE(review): presumably the package reads them at
# import/instantiation time, so keep this ordering.
environ['VARIATION_NORM_EB_PROD'] = 'true'
# Respect a password already exported in the environment instead of clobbering
# it with a hard-coded credential; 'admin' remains only as the local fallback.
environ.setdefault('UTA_PASSWORD', 'admin')

from variation.data_sources import UTA
from variation.to_vrs import ToVRS
from bioutils.accessions import coerce_namespace
from ga4gh.vrs import normalize, models
from ga4gh.vrs.dataproxy import SeqRepoDataProxy, SequenceProxy
from ga4gh.vrs.extras.translator import Translator
from ga4gh.core import ga4gh_identify, pjs_copy
from variation.data_sources import SeqRepoAccess
from bioutils.normalize import normalize as _normalize, NormalizationMode

In [2]:
# SeqRepo access wrapper provided by the `variation` package.
seqrepo_access = SeqRepoAccess()
# Data proxy built on the wrapper's SeqRepo client; later cells use it to
# construct the SequenceProxy handed to the normalizer.
dp = SeqRepoDataProxy(seqrepo_access.seq_repo_client)
# Translator from ga4gh.vrs.extras, constructed with the same data proxy.
tlr = Translator(data_proxy=dp)
# UTA handle; a later cell uses it to look up the transcript's CDS start/end.
uta = UTA()

# Original query: NM_004448.4:c.2263_2277del
## ClinGen Allele Registry MANE Transcript: NM_004448.4:c.2264_2278del

Original query represented as VRS Allele

In [3]:
# Assemble the VRS Allele for the original query (c.2263_2277del) from its parts.
# The deletion is expressed as an empty replacement sequence over the interval.
sstate = models.SequenceState(sequence='')
# start is one less than the c. start position (presumably interbase coordinates).
interval = models.SimpleInterval(start=2262, end=2277)
location = models.Location(sequence_id='refseq:NM_004448.4', interval=interval)
allele = models.Allele(
    location=location,
    state=sstate,
)
allele.as_dict()

{'type': 'Allele',
 'location': {'type': 'SequenceLocation',
  'sequence_id': 'refseq:NM_004448.4',
  'interval': {'type': 'SimpleInterval', 'start': 2262, 'end': 2277}},
 'state': {'type': 'SequenceState', 'sequence': ''}}

Replicate vrs-python's current `normalize` method, in which the CDS start is not added to the interval (`ival`):

In [4]:
# Work on a copy so the original `allele` is left untouched.
new_allele = pjs_copy(allele)

# Gather the normalizer's inputs from the allele's underlying values.
src_location = allele.location
sequence = SequenceProxy(dp, src_location.sequence_id._value)
ival = (
    src_location.interval.start._value,
    src_location.interval.end._value,
)
# First slot is left as None; second slot is the allele's state sequence.
alleles = (None, allele.state.sequence._value)

In [5]:
# Normalize using the interval exactly as stored on the allele,
# i.e. without adding the CDS start.
new_ival, new_alleles = _normalize(
    sequence,
    ival,
    alleles=alleles,
    mode=NormalizationMode.EXPAND,
    anchor_length=0,
)
(new_ival, new_alleles)

((2262, 2277), ('GGTGGAGCCGCTGAC', ''))

Original query allele object normalized

In [6]:
# Write the normalized interval and the second normalized allele back onto the copy.
target_interval = new_allele.location.interval
target_interval.start = new_ival[0]
target_interval.end = new_ival[1]
new_allele.state.sequence = new_alleles[1]
new_allele.as_dict()

{'type': 'Allele',
 'location': {'type': 'SequenceLocation',
  'sequence_id': 'refseq:NM_004448.4',
  'interval': {'type': 'SimpleInterval', 'start': 2262, 'end': 2277}},
 'state': {'type': 'SequenceState', 'sequence': ''}}

Now let's see what happens to the allele when we include the CDS start site:

In [7]:
# Look up the CDS start/end pair for the transcript via UTA.
cds_start_end = uta.get_cds_start_end('NM_004448.4')
# The first element is used as the CDS start offset in the cells below.
# NOTE(review): this would fail if the lookup returns nothing for the
# accession — confirm what get_cds_start_end returns in that case.
cds_start = cds_start_end[0]
cds_start

175

In [8]:
# Same normalization as before, but with the CDS start added to both ends of
# the interval.
shifted_start = ival[0] + cds_start
shifted_end = ival[1] + cds_start
new_ival, new_alleles = _normalize(
    sequence,
    (shifted_start, shifted_end),
    alleles=alleles,
    mode=NormalizationMode.EXPAND,
    anchor_length=0,
)
(new_ival, new_alleles)

((2437, 2453), ('TTGAGGGAAAACACAT', 'T'))

In [9]:
# Subtract the CDS start again before storing the normalized interval on the
# copy, then store the second normalized allele as the state sequence.
target_interval = new_allele.location.interval
target_interval.start = new_ival[0] - cds_start
target_interval.end = new_ival[1] - cds_start
new_allele.state.sequence = new_alleles[1]
new_allele.as_dict()

{'type': 'Allele',
 'location': {'type': 'SequenceLocation',
  'sequence_id': 'refseq:NM_004448.4',
  'interval': {'type': 'SimpleInterval', 'start': 2262, 'end': 2278}},
 'state': {'type': 'SequenceState', 'sequence': 'T'}}