In [1]:
import os
# Set the flag before the `variation` imports below run.
# NOTE(review): presumably the package reads this variable at import or
# construction time, which is why it sits between the imports — confirm.
os.environ['VARIATION_NORM_EB_PROD'] = 'true'

import itertools
from variation.mane_transcript import MANETranscript
from variation.tokenizers.caches import AminoAcidCache
from variation.data_sources import SeqRepoAccess, TranscriptMappings, MANETranscriptMappings, UTA

In [2]:
# Shared data-source handles used by every cell below.
transcript_mappings = TranscriptMappings()
amino_acid_cache = AminoAcidCache()
seqrepo = SeqRepoAccess()
mane_transcript_mappings = MANETranscriptMappings()
# Read the UTA database password from the environment instead of committing
# it to the notebook. The previous literal is kept only as a fallback so
# existing local setups keep working without extra configuration.
uta = UTA(db_pwd=os.environ.get('UTA_PASSWORD', 'admin'))
mane_transcript = MANETranscript(seqrepo, transcript_mappings, mane_transcript_mappings, uta)

# BRAF V600E

## p -> MANE p

### RefSeq

In [3]:
mane_transcript.get_mane_transcript('NP_004324.2', 600, None, 'p', 'V', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NP_001361187.1',
 'ensembl': 'ENSP00000496776.1',
 'pos': (640, 640),
 'strand': '-',
 'status': 'MANE Select'}

#### Other RefSeq accessions that should map to the same MANE protein

In [4]:
mane_transcript.get_mane_transcript('NP_001365401.1', 548, None, 'p', 'V')

{'gene': 'BRAF',
 'refseq': 'NP_001361187.1',
 'ensembl': 'ENSP00000496776.1',
 'pos': (640, 640),
 'strand': '-',
 'status': 'MANE Select'}

In [5]:
mane_transcript.get_mane_transcript('NP_001365402.1', 548, None, 'p', 'V')

{'gene': 'BRAF',
 'refseq': 'NP_001361187.1',
 'ensembl': 'ENSP00000496776.1',
 'pos': (640, 640),
 'strand': '-',
 'status': 'MANE Select'}

In [6]:
mane_transcript.get_mane_transcript('NP_001365400.1', 563, None, 'p', 'V')

{'gene': 'BRAF',
 'refseq': 'NP_001361187.1',
 'ensembl': 'ENSP00000496776.1',
 'pos': (640, 640),
 'strand': '-',
 'status': 'MANE Select'}

### Ensembl

In [7]:
# NOTE(review): no output was recorded for this cell, unlike the RefSeq
# protein cells above and the EGFR Ensembl protein cell further down —
# the call presumably returned None for this accession; confirm whether
# that is expected.
mane_transcript.get_mane_transcript('ENSP00000288602.6', 600, None, 'p', 'V', normalize_endpoint=True)

## c -> MANE c

### RefSeq

#### Most recent accession version

In [8]:
mane_transcript.get_mane_transcript('NM_004333.6', 1799, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

#### Other accessions that should map to the same MANE transcript

In [9]:
mane_transcript.get_mane_transcript('NM_001378471.1', 1688, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

In [10]:
mane_transcript.get_mane_transcript('NM_001378472.1', 1643, None, 'c', 'T')

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

In [11]:
mane_transcript.get_mane_transcript('NM_001378475.1', 1535, None, 'c', 'T')

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

#### Older accession versions

In [12]:
mane_transcript.get_mane_transcript('NM_004333.5', 1799, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

In [13]:
mane_transcript.get_mane_transcript('NM_004333.4', 1799, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

### Ensembl

In [14]:
mane_transcript.get_mane_transcript('ENST00000288602.11', 1799, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

In [15]:
# Same query as the previous cell, but with normalize_endpoint=False;
# the recorded result is identical.
mane_transcript.get_mane_transcript('ENST00000288602.11', 1799, None, 'c', 'T', normalize_endpoint=False)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

## g -> MANE c

In [16]:
mane_transcript.get_mane_transcript('NC_000007.14', 140753336, None, 'g', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

In [17]:
mane_transcript.get_mane_transcript('NC_000007.13', 140453136, None, 'g', normalize_endpoint=True)

{'gene': 'BRAF',
 'refseq': 'NM_001374258.1',
 'ensembl': 'ENST00000644969.2',
 'coding_start_site': 226,
 'pos': (1919, 1919),
 'strand': '-',
 'status': 'MANE Select'}

# EGFR L858R

## p -> MANE p

### RefSeq

In [18]:
mane_transcript.get_mane_transcript('NP_005219.2', 858, None, 'p', 'L', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NP_005219.2',
 'ensembl': 'ENSP00000275493.2',
 'pos': (858, 858),
 'strand': '+',
 'status': 'MANE Select'}

### Ensembl

In [19]:
mane_transcript.get_mane_transcript('ENSP00000275493.2', 858, None, 'p', 'L', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NP_005219.2',
 'ensembl': 'ENSP00000275493.2',
 'pos': (858, 858),
 'strand': '+',
 'status': 'MANE Select'}

## c -> MANE c

### RefSeq

#### Most recent accession version

In [20]:
mane_transcript.get_mane_transcript('NM_005228.5', 2573, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NM_005228.5',
 'ensembl': 'ENST00000275493.7',
 'coding_start_site': 261,
 'pos': (2573, 2573),
 'strand': '+',
 'status': 'MANE Select'}

#### Older accession versions

In [21]:
mane_transcript.get_mane_transcript('NM_005228.4', 2573, None, 'c', 'T', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NM_005228.5',
 'ensembl': 'ENST00000275493.7',
 'coding_start_site': 261,
 'pos': (2573, 2573),
 'strand': '+',
 'status': 'MANE Select'}

### Ensembl

In [22]:
# NOTE(review): the recorded result below is 'Longest Compatible Remaining'
# on NM_005228.4 with no gene or Ensembl accession, whereas the RefSeq
# queries above resolve to MANE Select NM_005228.5 / ENST00000275493.7 —
# the very accession queried here. Looks like a lookup gap for Ensembl
# transcript input; confirm.
mane_transcript.get_mane_transcript('ENST00000275493.7', 2573, None, 'c', 'T', normalize_endpoint=True)

{'refseq': 'NM_005228.4',
 'ensembl': None,
 'pos': (2573, 2573),
 'strand': '+',
 'status': 'Longest Compatible Remaining'}

## g -> MANE c

In [23]:
mane_transcript.get_mane_transcript('NC_000007.13', 55259515, None, 'g', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NM_005228.5',
 'ensembl': 'ENST00000275493.7',
 'coding_start_site': 261,
 'pos': (2573, 2573),
 'strand': '+',
 'status': 'MANE Select'}

# More examples

In [24]:
# https://civicdb.org/events/genes/4/summary/variants/2/summary#variant
# https://reg.genome.network/allele?hgvs=NM_007313.2:c.1001C%3ET
mane_transcript.get_mane_transcript('NP_005148.2', 315, 315, 'p', normalize_endpoint=True)

{'gene': 'ABL1',
 'refseq': 'NP_005148.2',
 'ensembl': 'ENSP00000323315.5',
 'pos': (315, 315),
 'strand': '+',
 'status': 'MANE Select'}

In [25]:
# https://civicdb.org/events/genes/4/summary/variants/2/summary#variant
# https://reg.genome.network/allele?hgvs=NM_007313.2:c.1001C%3ET
mane_transcript.get_mane_transcript('NM_007313.2', 1001, None, 'c', normalize_endpoint=True)

{'gene': 'ABL1',
 'refseq': 'NM_005157.6',
 'ensembl': 'ENST00000318560.6',
 'coding_start_site': 193,
 'pos': (944, 944),
 'strand': '+',
 'status': 'MANE Select'}

In [26]:
# https://civicdb.org/events/genes/19/summary/variants/34/summary#variant
# https://reg.genome.network/allele?hgvs=NM_005228.4:c.2369C%3ET
mane_transcript.get_mane_transcript('NP_005219.2', 790, None, 'p', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NP_005219.2',
 'ensembl': 'ENSP00000275493.2',
 'pos': (790, 790),
 'strand': '+',
 'status': 'MANE Select'}

In [27]:
# https://civicdb.org/events/genes/19/summary/variants/34/summary#variant
# https://reg.genome.network/allele?hgvs=NM_005228.4:c.2369C%3ET
mane_transcript.get_mane_transcript('NM_005228.5', 2369, None, 'c', normalize_endpoint=True)

{'gene': 'EGFR',
 'refseq': 'NM_005228.5',
 'ensembl': 'ENST00000275493.7',
 'coding_start_site': 261,
 'pos': (2369, 2369),
 'strand': '+',
 'status': 'MANE Select'}

In [28]:
# https://civicdb.org/events/genes/30/summary/variants/79/summary#variant
# https://reg.genome.network/allele?hgvs=NM_004985.4:c.35G%3EA
mane_transcript.get_mane_transcript('NP_004976.2', 12, None, 'p', normalize_endpoint=True)

{'gene': 'KRAS',
 'refseq': 'NP_004976.2',
 'ensembl': 'ENSP00000308495.3',
 'pos': (12, 12),
 'strand': '-',
 'status': 'MANE Select'}

In [29]:
# https://civicdb.org/events/genes/30/summary/variants/79/summary#variant
# https://reg.genome.network/allele?hgvs=NM_004985.4:c.35G%3EA
mane_transcript.get_mane_transcript('NM_004985.5', 35, None, 'c', normalize_endpoint=True)

{'gene': 'KRAS',
 'refseq': 'NM_004985.5',
 'ensembl': 'ENST00000311936.8',
 'coding_start_site': 190,
 'pos': (35, 35),
 'strand': '-',
 'status': 'MANE Select'}

In [30]:
# https://reg.genome.network/allele?hgvs=NM_004448.4:c.2262_2276del
mane_transcript.get_mane_transcript('NP_004439.2', 755, 759, 'p', normalize_endpoint=True)

{'gene': 'ERBB2',
 'refseq': 'NP_004439.2',
 'ensembl': 'ENSP00000269571.4',
 'pos': (755, 759),
 'strand': '+',
 'status': 'MANE Select'}

In [31]:
# https://reg.genome.network/allele?hgvs=NM_004448.4:c.2262_2276del
mane_transcript.get_mane_transcript('NM_004448.4', 2262, 2276, 'c', normalize_endpoint=True)

{'gene': 'ERBB2',
 'refseq': 'NM_004448.4',
 'ensembl': 'ENST00000269571.10',
 'coding_start_site': 175,
 'pos': (2262, 2276),
 'strand': '+',
 'status': 'MANE Select'}

# Scratch

In [32]:
mane_transcript.get_mane_transcript('NP_004976.2', 12, None, 'p')

{'gene': 'KRAS',
 'refseq': 'NP_004976.2',
 'ensembl': 'ENSP00000308495.3',
 'pos': (12, 12),
 'strand': '-',
 'status': 'MANE Select'}

In [33]:
# NOTE(review): this passes a transcript accession (ENST...) and position
# 1799 with the 'p' layer, whereas the other ENST queries in this notebook
# use 'c'. No output was recorded for this cell. Probably meant 'c' —
# confirm before relying on it.
mane_transcript.get_mane_transcript('ENST00000288602.7', 1799, None, 'p')

# Longest compatible transcript

In [34]:
# NOTE(review): the recorded result keeps pos (600, 600) on NP_001361187.1,
# while get_mane_transcript near the top of the notebook maps NP_004324.2
# position 600 to (640, 640) on that same accession — confirm whether this
# function is expected to translate positions.
mane_transcript.get_longest_compatible_transcript('BRAF', 600, None, 'p')

{'refseq': 'NP_001361187.1',
 'ensembl': None,
 'pos': (600, 600),
 'strand': '-',
 'status': 'Longest Compatible Remaining'}

In [35]:
mane_transcript.get_longest_compatible_transcript('BRAF', 1799, None, 'c')

{'refseq': 'NM_001374244.1',
 'ensembl': None,
 'pos': (1799, 1799),
 'strand': '-',
 'status': 'Longest Compatible Remaining'}

# Scratch: ERBB2 sequence comparison (NM_004448.3 vs NM_004448.4)

In [43]:
def get_seq(ac, cds, start, end):
    """Return a sub-sequence of accession ``ac``, offset by ``cds``.

    The full sequence for ``ac`` is fetched through
    ``seqrepo.seq_repo_client`` and then sliced as
    ``[cds + start - 1 : cds + end]``.

    :param ac: accession passed to the SeqRepo client's ``fetch``
    :param cds: offset added to both slice bounds
    :param start: start position; the slice begins at ``cds + start - 1``
    :param end: end position; the slice stops before ``cds + end``
    :return: the sliced portion of the fetched sequence
    """
    full_sequence = seqrepo.seq_repo_client.fetch(ac)
    slice_from = cds + start - 1
    slice_to = cds + end
    return full_sequence[slice_from:slice_to]

In [37]:
# Accession reported as the ERBB2 MANE Select transcript in the cells above.
mane = 'NM_004448.4'
# Coding start site as reported by UTA; passed to get_seq below as the
# offset into the transcript sequence.
mane_cds = uta.get_coding_start_site(mane)
mane_cds

175

In [36]:
# Earlier version of the same accession, used below as the original query
# transcript. Its recorded coding start site (261) differs from the one
# recorded for NM_004448.4 (175).
q = 'NM_004448.3'
q_cds = uta.get_coding_start_site(q)
q_cds

261

In [44]:
# Original query: positions 2263-2277 on the query transcript (q),
# offset by that transcript's coding start site.
q_start, q_end = 2263, 2277
q_seq = get_seq(q, q_cds, q_start, q_end)
q_seq

'TTGAGGGAAAACACA'

In [46]:
# Variation Normalization: the same positions (2263-2277), read from the
# MANE transcript using its own coding start site.
v_start, v_end = 2263, 2277
v_seq = get_seq(mane, mane_cds, v_start, v_end)
v_seq

'TTGAGGGAAAACACA'

In [47]:
# ClinGen Allele Registry API: positions shifted by one (2264-2278),
# read from the MANE transcript.
cgar_api_start, cgar_api_end = 2264, 2278
cgar_api_seq = get_seq(mane, mane_cds, cgar_api_start, cgar_api_end)
cgar_api_seq

'TGAGGGAAAACACAT'

In [48]:
# Does the original query sequence match the one read at the Variation
# Normalization positions on the MANE transcript?
q_seq == v_seq

True

In [49]:
# Does the original query sequence match the one read at the ClinGen
# Allele Registry positions on the MANE transcript?
q_seq == cgar_api_seq

False