# MaveDB Mapping

In [1]:
# Load Required Packages
import io
import re
import requests
import hgvs
import base64, hashlib
from ga4gh.vrs import models, vrs_deref, vrs_enref
from ga4gh.core import ga4gh_identify, ga4gh_serialize, ga4gh_digest, ga4gh_deref, sha512t24u
from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
import pandas as pd
from gene.query import QueryHandler
import nest_asyncio
import asyncio
from uta_tools.uta_tools import UTATools
from uta_tools.data_sources.uta_database import UTADatabase
from uta_tools.data_sources.mane_transcript_mappings import MANETranscriptMappings
import pickle
from os import environ
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
from biocommons.seqrepo import SeqRepo
from bs4 import BeautifulSoup
# Local SeqRepo snapshot used for every sequence lookup in this notebook.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
sr = SeqRepo("/usr/local/share/seqrepo/latest")
# Connection string for the UTA Postgres database, exported through the process environment.
# NOTE(review): the URL embeds a username and password; prefer supplying UTA_DB_URL
# from outside the notebook rather than hardcoding credentials here.
environ["UTA_DB_URL"] = 'postgresql://uta_admin:uta@localhost:5432/uta/uta_20210129'
from pyliftover import LiftOver

Removing allOf attribute from AbsoluteCopyNumber to avoid python-jsonschema-objects error.
Removing allOf attribute from SequenceInterval to avoid python-jsonschema-objects error.
Removing allOf attribute from RepeatedSequenceExpression to avoid python-jsonschema-objects error.


## Process Metadata

In [2]:
def get_urns():
    """Return the URN of every MaveDB scoreset whose first reference map is human.

    Downloads the scoreset listing from the MaveDB API and keeps, in API order,
    the ``urn`` of each entry whose
    ``target.reference_maps[0].genome.organism_name`` equals ``'Homo sapiens'``.
    """
    scoresets = requests.get('https://www.mavedb.org/api/scoresets/').json()
    return [
        entry['urn']
        for entry in scoresets
        if entry['target']['reference_maps'][0]['genome']['organism_name'] == 'Homo sapiens'
    ]

# URNs of the human scoresets (one MaveDB API request).
urns = get_urns()

In [3]:
def get_target_sequence_data():
    """Return the reference sequence string of every human MaveDB scoreset.

    Downloads the scoreset listing from the MaveDB API and, for each entry whose
    ``target.reference_maps[0].genome.organism_name`` is ``'Homo sapiens'``,
    collects ``target.reference_sequence.sequence`` in API order.
    """
    scoresets = requests.get('https://www.mavedb.org/api/scoresets/').json()
    target_sequences = []
    for entry in scoresets:
        target = entry['target']
        if target['reference_maps'][0]['genome']['organism_name'] == 'Homo sapiens':
            target_sequences.append(target['reference_sequence']['sequence'])
    return target_sequences

# Reference sequences of the human scoresets (one MaveDB API request).
human_target_sequences = get_target_sequence_data()

In [4]:
def get_target_sequence_type():
    """Return the ``sequence_type`` of each human scoreset's reference sequence.

    The MaveDB scoreset listing is fetched and filtered to entries whose first
    reference map names ``'Homo sapiens'`` as the organism; results keep the
    order in which the API returned them.
    """
    listing = requests.get('https://www.mavedb.org/api/scoresets/').json()

    def is_human(record):
        genome = record['target']['reference_maps'][0]['genome']
        return genome['organism_name'] == 'Homo sapiens'

    return [
        record['target']['reference_sequence']['sequence_type']
        for record in listing
        if is_human(record)
    ]

# 'sequence_type' of each human scoreset's reference sequence (one MaveDB API request).
target_type = get_target_sequence_type()

In [5]:
def get_targets():
    """Return the target name of every human MaveDB scoreset, in API order.

    An entry counts as human when
    ``target.reference_maps[0].genome.organism_name`` is ``'Homo sapiens'``.
    """
    payload = requests.get('https://www.mavedb.org/api/scoresets/').json()
    names = []
    for scoreset in payload:
        organism = scoreset['target']['reference_maps'][0]['genome']['organism_name']
        if organism != 'Homo sapiens':
            continue
        names.append(scoreset['target']['name'])
    return names

# Target names of the human scoresets (one MaveDB API request).
human_targets = get_targets()

In [6]:
def get_assembly():
    """Return the genome assembly identifier of every human MaveDB scoreset.

    For each scoreset whose first reference map has organism ``'Homo sapiens'``,
    the value at ``genome.assembly_identifier.identifier`` of that same
    reference map is collected, in API order.
    """
    records = requests.get('https://www.mavedb.org/api/scoresets/').json()
    # Only the first reference map of each scoreset is inspected.
    genomes = (record['target']['reference_maps'][0]['genome'] for record in records)
    return [
        genome['assembly_identifier']['identifier']
        for genome in genomes
        if genome['organism_name'] == 'Homo sapiens'
    ]

# Genome assembly identifiers of the human scoresets (one MaveDB API request).
human_assembly = get_assembly()

In [7]:
def get_uniprot():
    """Return the UniProt identifier of every human MaveDB scoreset.

    Downloads the scoreset listing from the MaveDB API. For each entry whose
    ``target.reference_maps[0].genome.organism_name`` is ``'Homo sapiens'`` the
    list receives ``target.uniprot.identifier``, or ``None`` when the target has
    no UniProt cross-reference, so the result has one item per human scoreset.
    """
    scoresets = requests.get('https://www.mavedb.org/api/scoresets/').json()
    uniprot = []
    for entry in scoresets:
        target = entry['target']
        if target['reference_maps'][0]['genome']['organism_name'] != 'Homo sapiens':
            continue
        xref = target['uniprot']
        # Keep a None placeholder so positions stay in step with the other metadata lists.
        uniprot.append(None if xref is None else xref['identifier'])
    return uniprot

# UniProt identifiers (or None) of the human scoresets (one MaveDB API request).
uniprot = get_uniprot()

In [8]:
def get_target_type():
    """Return the ``target.type`` value of every human MaveDB scoreset, in API order.

    Human scoresets are those whose first reference map has
    ``genome.organism_name == 'Homo sapiens'``.
    """
    all_scoresets = requests.get('https://www.mavedb.org/api/scoresets/').json()
    human_only = filter(
        lambda s: s['target']['reference_maps'][0]['genome']['organism_name'] == 'Homo sapiens',
        all_scoresets,
    )
    return [scoreset['target']['type'] for scoreset in human_only]
# 'type' field of each human scoreset's target (one MaveDB API request).
targets = get_target_type()

In [9]:
# Combine the per-scoreset metadata lists into one table, one row per human scoreset.
# Every list was filtered the same way, so position k in each refers to the same scoreset.
dat = pd.DataFrame(
    data={
        'urn': urns,
        'target_sequence': human_target_sequences,
        'target_sequence_type': target_type,
        'target': human_targets,
        'assembly_id': human_assembly,
        'uniprot_id': uniprot,
        'target_type': targets,
    }
)
dat

Unnamed: 0,urn,target_sequence,target_sequence_type,target,assembly_id,uniprot_id,target_type
0,urn:mavedb:00000041-a-1,CTGCGGCTGGAGGTCAAGCTGGGCCAGGGCTGCTTTGGCGAGGTGT...,dna,Src catalytic domain,GCF_000001405.26,P12931,Protein coding
1,urn:mavedb:00000048-a-1,GAGGGGATCAGTATATACACTTCAGATAACTACACCGAGGAAATGG...,dna,CXCR4,GCF_000001405.26,P61073,Protein coding
2,urn:mavedb:00000068-b-1,ATGGAGGAGCCGCAGTCAGATCCTAGCGTCGAGCCCCCTCTGAGTC...,dna,TP53 (P72R),GCF_000001405.26,,Protein coding
3,urn:mavedb:00000023-a-1,AGCTCTTCACCGGAGACCCAAATACAACAAATCAAGTCGCCTGCCC...,dna,LDLR promoter,GCF_000001405.26,,Regulatory
4,urn:mavedb:00000045-c-1,ATGGATGTATTCATGAAAGGACTTTCAAAGGCCAAGGAGGGAGTTG...,dna,alpha-synuclein,GCF_000001405.10,P37840,Protein coding
...,...,...,...,...,...,...,...
158,urn:mavedb:00000072-a-1,GQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQ...,protein,IGHG1,GCF_000001405.26,P01857,Protein coding
159,urn:mavedb:00000029-a-2,GAACTGGAAAAGCCCTGTCCGGTGAGGGGGCAGAAGGACTCAGCGC...,dna,SORT1 enhancer,GCF_000001405.26,,Regulatory
160,urn:mavedb:00000061-b-1,TCTAAGACAAGCAACACTATCCGTGTTTTCTTGCCGAACAAGCAAA...,dna,RAF,GCF_000001405.26,P04049,Protein coding
161,urn:mavedb:00000095-b-1,ATGGATTCTCTTGTGGTCCTTGTGCTCTGTCTCTCATGTTTGCTTC...,dna,CYP2C9,GCF_000001405.26,P11712,Protein coding


## Part 1: MaveDB Metadata to BLAT Alignment Data

In [10]:
# Alignment Helper Function
def get_gene_data(i, blat_chr, return_chr):
    """Look up the chromosome or genomic interval of the gene targeted by scoreset ``i``.

    The gene symbol is resolved with the gene normalizer, first from the row's
    ``uniprot_id`` and, if that fails, from the first space-separated token of
    the row's ``target`` name.

    Args:
        i: Row label in the notebook-level ``dat`` frame.
        blat_chr: Chromosome of the BLAT hit being considered. It is not used by
            the lookup and is kept only so existing call sites keep working.
        return_chr: ``True`` to return the chromosome from the HGNC match;
            ``False`` to return the first ``SequenceInterval`` location found in
            the Ensembl records and, failing that, in the NCBI records.

    Returns:
        The chromosome value, a ``{'start': ..., 'end': ...}`` dict, or the
        string ``'NA'`` when the symbol or the requested location is unavailable.
    """
    qh = QueryHandler()
    try:
        uniprot = dat.at[i, 'uniprot_id']
        gsymb = qh.normalize(str(f'uniprot:{uniprot}')).gene_descriptor.label
    except Exception:
        try:
            target = dat.at[i, 'target'].split(' ')[0]
            gsymb = qh.normalize(target).gene_descriptor.label
        except Exception:
            return 'NA'  # if gsymb cannot be extracted

    # Index the search results by source name; a later duplicate replaces an earlier one.
    matches_by_source = {}
    for match in qh.search(gsymb).source_matches:
        matches_by_source[match.source] = match

    if 'HGNC' in matches_by_source and return_chr == True:
        return matches_by_source['HGNC'].records[0].locations[0].chr

    if return_chr == False:
        # Ensembl coordinates take priority over NCBI; a source may hold several records.
        for source in ('Ensembl', 'NCBI'):
            if source not in matches_by_source:
                continue
            for record in matches_by_source[source].records:
                for location in record.locations:
                    if location.interval.type == 'SequenceInterval':
                        return {
                            'start': location.interval.start.value,
                            'end': location.interval.end.value,
                        }
    return 'NA'

In [11]:
# Get Query and Hit Ranges for Each Human Target Sequence
from Bio import SearchIO
mave_blat_dict = {}

for i in range(len(dat.index)):
    blat_file = open('blat_query.fa', 'w')
    blat_file.write('>' + dat.at[i, 'target'] + '\n')
    blat_file.write(dat.at[i, 'target_sequence'] + '\n')
    blat_file.close()

    if dat.at[i, 'target_sequence_type'] == 'protein':
        !./blat hg38.2bit -q=prot -t=dnax blat_query.fa  blat_out.psl
    else:
        !./blat hg38.2bit blat_query.fa  blat_out.psl

    # Extract ranges
    chrom = ''
    strand = ''
    target = ''
    target_type = ''
    coverage = None
    identity = None
    query_ranges = list()
    hit_ranges = list()
    
    try:
        output = SearchIO.read('blat_out.psl', 'blat-psl')
    except:
        try:
            !./blat hg38.2bit -q=dnax -t=dnax blat_query.fa  blat_out.psl 
            output = SearchIO.read('blat_out.psl', 'blat-psl')
        except:
            qh_dat = {'query_ranges': list('NA'), 'hit_ranges': list('NA')}
            qh_dat = pd.DataFrame(data = qh_dat)
            mave_blat_dict[dat.at[i, 'urn']] = {'chrom': 'NA', 'strand': 'NA', 'target': 'NA', 'target_type': 'NA',
                                            'uniprot': 'NA','coverage': 'NA','identity':'NA', 'hits': qh_dat}
            continue

    # Find chromosome to select hit from
    hit_scores = list()
    hit_dict = {}
    use_chr = False
    
    for c in range(len(output)):
        correct_chr = get_gene_data(i,output[c].id.strip('chr'), return_chr = True)
        if correct_chr == output[c].id.strip('chr'):
            use_chr = True
            break
        if correct_chr == 'NA': # Take top scoring hit if target not found using gene normalizer
            hit_scores = list()
            for e in range(len(output[c])):
                hit_scores.append(output[c][e].score)
            hit_dict[c] = hit_scores

    if use_chr == False:
        for key in hit_dict:
            hit_dict[key] = max(hit_dict[key])
        hit = max(hit_dict, key = hit_dict.get)
    else:
        hit = c
                             
    
    # Use location provided by gene normalizer to find hsp
    loc_dict = get_gene_data(i, output[hit].id.strip('chr'), return_chr = False)
    
    hit_starts = list()
    for n in range(len(output[hit])):
        hit_starts.append(output[hit][n].hit_start)
    
    sub_scores = list()
    for n in range(len(output[hit])):
        sub_scores.append(output[hit][n].score)
    
    if loc_dict == 'NA':
        hsp = output[hit][sub_scores.index(max(sub_scores))] # Take top score if no match found 
    else:
        hsp = output[hit][hit_starts.index(min(hit_starts, key=lambda x:abs(x - loc_dict['start'])))]

        
    for j in range(len(hsp)):
        test_file = open('blat_output_test.txt', 'w')
        test_file.write(str(hsp[j]))
        test_file.close()

        query_string = ''
        hit_string = ''
        strand = hsp[0].query_strand
        coverage = 100 * (hsp.query_end - hsp.query_start) / output.seq_len
        coverage = f"{hsp.query_end - hsp.query_start} / {output.seq_len}, {coverage}" 
        identity = hsp.ident_pct

        test_file = open('blat_output_test.txt', 'r')
        for k,line in enumerate(test_file):
            if k == 1:
                chrom = line.strip('\n')
            if k == 2:
                query_string = line.strip('\n')
            if k == 3:
                hit_string = line.strip('\n')
        test_file.close()

        chrom = chrom.split(' ')[9].strip('chr')
        query_string = query_string.split(' ')
        hit_string = hit_string.split(' ')
        query_ranges.append(query_string[2])
        hit_ranges.append(hit_string[4])
        
    # Add to dict
    qh_dat = {'query_ranges': query_ranges, 'hit_ranges': hit_ranges}
    qh_dat = pd.DataFrame(data = qh_dat)
    mave_blat_dict[dat.at[i, 'urn']] = {'chrom': chrom,'strand': strand,'target': dat.at[i,'target'], 'target_type': dat.at[i, 'target_type'],
                                        'uniprot': dat.at[i,'uniprot_id'],'coverage': coverage,'identity': identity, 'hits': qh_dat} 

Loaded 3209286105 letters in 455 sequences
Searched 750 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 1053 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 1182 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 318 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 423 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 

***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 93 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 300 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 117 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 477 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 81 

***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 54 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 249 bases in 1 sequences
Loaded 3209286105 letters in 455 sequences
Blatx 455 sequences in database, 1 files in query
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 423 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 1242 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Datab

***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 274 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 600 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 495 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 306 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000

***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene 

***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Blatx 455 sequences in database, 1 files in query
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 1053 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 423 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 sequences
Searched 1053 bases in 1 sequences
***Using Gene Database Endpoint: http://localhost:8000***
***Using Gene Database Endpoint: http://localhost:8000***
Loaded 3209286105 letters in 455 seq

In [12]:
# Display the per-URN BLAT results (chrom, strand, coverage, identity, hit ranges).
mave_blat_dict

{'urn:mavedb:00000041-a-1': {'chrom': '20',
  'strand': 1,
  'target': 'Src catalytic domain',
  'target_type': 'Protein coding',
  'uniprot': 'P12931',
  'coverage': '750 / 750, 100.0',
  'identity': 99.86666666666666,
  'hits':   query_ranges           hit_ranges
  0       [0:52]  [37397802:37397854]
  1     [52:232]  [37400114:37400294]
  2    [232:309]  [37401601:37401678]
  3    [309:463]  [37402434:37402588]
  4    [463:595]  [37402748:37402880]
  5    [595:750]  [37403170:37403325]},
 'urn:mavedb:00000048-a-1': {'chrom': '2',
  'strand': -1,
  'target': 'CXCR4',
  'target_type': 'Protein coding',
  'uniprot': 'P61073',
  'coverage': '1041 / 1053, 98.86039886039886',
  'identity': 100.0,
  'hits':   query_ranges             hit_ranges
  0    [12:1053]  [136114871:136115912]},
 'urn:mavedb:00000068-b-1': {'chrom': '17',
  'strand': -1,
  'target': 'TP53 (P72R)',
  'target_type': 'Protein coding',
  'uniprot': None,
  'coverage': '1180 / 1182, 99.83079526226734',
  'identity': 99.9

## Part 2: BLAT Output to Transcript Selection

In [13]:
## Helper functions

def get_start(string):
    """Return the integer before the first ':' of a ``'[start:end]'`` range string."""
    lower_text = string.split(':')[0]
    return int(lower_text.strip('['))

def get_end(string):
    """Return the integer after the first ':' of a ``'[start:end]'`` range string."""
    pieces = string.split(':')
    upper_text = pieces[1]
    return int(upper_text.strip(']'))

def get_locs_list(hitsdat):
    """Convert the ``hit_ranges`` column of a hits frame into ``[start, end]`` pairs.

    Rows are read by the labels ``0 .. len(hitsdat.index) - 1``; each
    ``'[start:end]'`` string becomes a two-item list of ints.
    """
    return [
        [get_start(hitsdat.at[row, 'hit_ranges']), get_end(hitsdat.at[row, 'hit_ranges'])]
        for row in range(len(hitsdat.index))
    ]

def get_query_hits(dat):
    """Split a hits frame into parallel lists of query and hit ``[start, end]`` pairs.

    Args:
        dat: Frame with ``query_ranges`` and ``hit_ranges`` columns holding
            ``'[start:end]'`` strings.

    Returns:
        ``(query_list, hits_list)`` with one ``[start, end]`` pair per row of
        ``dat``; both lists are empty for an empty frame.
    """
    query_list = []
    hits_list = []
    for query_range, hit_range in zip(dat['query_ranges'], dat['hit_ranges']):
        query_list.append([get_start(query_range), get_end(query_range)])
        hits_list.append([get_start(hit_range), get_end(hit_range)])
    # Return only after every row is collected; returning from inside the loop
    # would stop after the first row.
    return query_list, hits_list

def get_ga4gh(dp, ref):
    """Return the ``'ga4gh:...'`` identifier listed among the aliases of ``ref``.

    ``dp.get_metadata(ref)['aliases']`` is scanned for the first alias containing
    ``'ga4gh'``; the part after its first ':' is returned with a ``'ga4gh:'``
    prefix. Raises ``IndexError`` when no alias matches.
    """
    matching = [alias for alias in dp.get_metadata(ref)['aliases'] if 'ga4gh' in alias]
    digest = matching[0].split(':')[1]
    return 'ga4gh:' + digest

def get_chr(dp, chrom):
    """Return the RefSeq accession aliased to GRCh38 chromosome ``chrom``.

    Looks up ``'GRCh38:' + chrom`` through ``dp.get_metadata`` and returns the
    part after the first ':' of the first alias containing ``'refseq'``.
    Raises ``IndexError`` when no alias matches.
    """
    metadata = dp.get_metadata('GRCh38:' + chrom)
    refseq_aliases = [alias for alias in metadata['aliases'] if 'refseq' in alias]
    first_alias = refseq_aliases[0]
    return first_alias.split(':')[1]

def modify_hgvs(var, ref, off, hp):
    """Prefix ``var`` with ``ref`` and shift its start/end positions by ``off``.

    Strings of length 3, the markers ``'_wt'`` / ``'_sy'`` and anything
    containing ``'['`` are returned untouched. Otherwise ``ref + ':' + var`` is
    parsed with ``hp.parse_hgvs_variant``, ``off`` is added to the ``base`` of
    both the start and the end position, and the variant is returned as a string.
    """
    passthrough = len(var) == 3 or var in ('_wt', '_sy') or '[' in var
    if passthrough:
        return var

    parsed = hp.parse_hgvs_variant(ref + ':' + var)
    # NOTE(review): if the parser hands back one shared object for start and end,
    # the offset would be applied twice - confirm against the parser's behaviour.
    parsed.posedit.pos.start.base += off
    parsed.posedit.pos.end.base += off
    return str(parsed)

def blat_check(i):
    """Check that the BLAT hit of scoreset ``i`` is on the target gene's chromosome.

    Only scoresets without a UniProt id are checked. Each space-separated token
    of the target name is passed to the notebook-level gene normalizer ``qh``;
    the first token that normalizes decides the result.

    Returns:
        ``True`` if the BLAT chromosome equals the normalizer's chromosome,
        ``False`` if it differs, and ``None`` when the scoreset has a UniProt id
        or no token could be normalized.
    """
    item = mave_blat_dict[dat.at[i, 'urn']]
    if item['uniprot'] is None:
        for token in dat.at[i, 'target'].split(' '):
            try:
                out = qh.normalize(token).gene_descriptor
                gene_chr = out.extensions[2].value['chr']
            except Exception:
                # Token is not a recognisable gene (or lacks the expected extension); try the next one.
                continue
            return item['chrom'] == gene_chr

def get_haplotype_allele(var, ref, offset, l, tr, dp, ts, mapped):
    """Translate one HGVS string into a VRS allele, a haplotype, or a Text fallback.

    Args:
        var: HGVS variant, with or without the leading ``'<l>.'``; a bracketed
            ``'[a;b]'`` form is treated as several variants.
        ref: Accession the variant is expressed against.
        offset: Amount added to the start and end of every translated location.
        l: HGVS coordinate-type letter (callers pass ``'p'`` or ``'g'``).
        tr: Translator providing ``translate_from(expression, 'hgvs')``.
        dp: Not used by this function; kept for call compatibility.
        ts: Target sequence; for ``mapped == 'pre'`` its digest becomes the
            location's ``sequence_id``.
        mapped: ``'pre'`` to re-point the allele at the target sequence; any
            other value leaves ``sequence_id`` as translated.

    Returns:
        The single allele, a ``models.Haplotype`` of several alleles, or a plain
        ``{'definition': ..., 'type': 'Text'}`` dict for the first variant that
        could not be translated.
    """
    # Remove exactly one leading '<l>.'; str.lstrip would instead strip any run
    # of the characters '<l>' and '.'.
    prefix = f'{l}.'
    if var.startswith(prefix):
        var = var[len(prefix):]

    if '[' in var:
        # Drop the surrounding brackets, then de-duplicate while keeping the
        # first-seen order so the result does not depend on string hashing.
        varlist = list(dict.fromkeys(var[1:-1].split(';')))
    else:
        varlist = [var]

    alleles = []
    for single in varlist:
        expression = ref + ':' + l + '.' + single
        try:
            allele = tr.translate_from(expression, 'hgvs')
            if mapped == 'pre':
                allele['location']['sequence_id'] = 'ga4gh:SQ.' + sha512t24u(ts.encode('ascii'))
            allele['location']['interval']['start']['value'] = allele['location']['interval']['start']['value'] + offset
            allele['location']['interval']['end']['value'] = allele['location']['interval']['end']['value'] + offset
            # The identifier is computed after the location edits above.
            allele['_id'] = ga4gh_identify(allele)
            alleles.append(allele)
        except Exception:
            # Untranslatable variant: give up on the whole expression and return it as text.
            return {'definition': expression, 'type': 'Text'}

    if len(alleles) == 1:  # Not haplotype
        return alleles[0]
    return models.Haplotype(members = alleles)
    
def get_clingen_id(hgvs):
    """Return the ClinGen Allele Registry id for an HGVS expression, or ``'NA'``.

    The expression is sent as the ``hgvs`` query parameter (URL-encoded by
    ``requests``). The id is the fifth ``'/'``-separated piece of the ``@id``
    field of the JSON reply; ``'NA'`` is returned when the reply is not JSON,
    has no ``@id``, or the field does not have that shape. Network errors raised
    by ``requests`` are not caught.

    Note: the parameter name shadows the imported ``hgvs`` module inside this
    function only; it is kept so keyword callers continue to work.
    """
    response = requests.get('https://reg.genome.network/allele', params={'hgvs': hgvs})
    try:
        return response.json()['@id'].split('/')[4]
    except (KeyError, IndexError, AttributeError, TypeError, ValueError):
        return 'NA'

In [14]:
## UTA Transcript Selection
#
# For every protein-coding scoreset that has a BLAT hit: collect the transcripts
# whose exons contain a boundary of each hit range, intersect those sets, prefer
# the MANE transcript (otherwise the longest one), and store
# [protein accession, index of the target's first 9 residues in that protein,
# urn, whether the whole target sequence was found] in mappings_dict.
nest_asyncio.apply()
mane = MANETranscriptMappings()
utadb = UTADatabase(db_pwd = 'uta')
qh = QueryHandler()
dp = SeqRepoDataProxy(sr = sr)

mappings_dict = {}

for j in range(len(dat.index)):
    if dat.at[j, 'target_type'] == 'Protein coding':
        item = mave_blat_dict[dat.at[j,'urn']]
        #if blat_check(j) == False:
         #   mappings_dict[dat.at[j, 'urn']] = 'BLAT hit not found on correct chromosome'
          #  continue
        if item['chrom'] == 'NA':
            continue
        locs = get_locs_list(item['hits'])
        chrom = get_chr(dp, item['chrom'])

        try:
            uniprot = dat.at[j, 'uniprot_id']
            gsymb = qh.normalize(str(f'uniprot:{uniprot}')).gene_descriptor.label
        except Exception: 
            temp = dat.at[j, 'target'].split(' ')
            gsymb = qh.normalize(temp[0]).gene_descriptor.label

        async def mapq():
            transcript_lists = []
            for i in range(len(locs)):
                # The two position tests are grouped in parentheses: SQL's AND binds
                # tighter than OR, so without them the gene filter would apply only
                # to the start position and the chromosome filter only to the end.
                # NOTE(review): values are interpolated into the SQL text; switch to
                # bound parameters if the query helper supports them.
                testquery = (f"""select *
                            from uta_20210129.tx_exon_aln_v
                            where hgnc = '{gsymb}'
                            and ({locs[i][0]} between alt_start_i and alt_end_i
                            or {locs[i][1]} between alt_start_i and alt_end_i)
                            and alt_ac = '{chrom}'""") 
    
                out = await utadb.execute_query(testquery)
                tl = []
                for k in range(len(out)):
                    # Transcripts with an NR_ accession are left out
                    if out[k]['tx_ac'].startswith('NR_') == False:
                        tl.append(out[k]['tx_ac'])
                if tl != []:
                    transcript_lists.append(tl)
            return(transcript_lists)

        ts = asyncio.run(mapq())
        try:
            # Transcripts present for every hit range; raises when `ts` is empty
            isect = list(set.intersection(*map(set,ts)))
        except Exception:
            try: # Look for transcripts using uniprot id
                url = 'https://www.uniprot.org/uniprot/' + str(dat.at[j, 'uniprot_id']) + '.xml'
                page = requests.get(url)
                page = BeautifulSoup(page.text)
                page = page.find_all('sequence')
                up = page[1].get_text()

                stri = str(dat.at[j,'target_sequence'])
                if up.find(stri) != -1:
                    full_match = True
                else:
                    full_match = False
                start = up.find(stri[0:9])
                mappings_dict[dat.at[j,'urn']] = [dat.at[j, 'uniprot_id'], start, dat.at[j, 'urn'], full_match]
                continue
            except Exception:
                print([dat.at[j, 'urn'], 'no transcripts found'])
                continue

        mane_trans = mane.get_mane_from_transcripts(isect)
        if mane_trans != []:
            if len(mane_trans) == 1:
                np = mane_trans[0]['RefSeq_prot']
            else:
                if mane_trans[0]['MANE_status'] == 'MANE Select':
                    np = mane_trans[0]['RefSeq_prot']
                else:
                    np = mane_trans[1]['RefSeq_prot']
            
            oseq = dat.at[j, 'target_sequence']
            
            # More than four distinct letters: the target is used as-is;
            # otherwise it is translated (table 1) with stop symbols removed.
            if len(set(str(oseq))) > 4:
                stri = str(oseq)
            else:
                oseq = Seq(oseq)
                stri = str(oseq.translate(table=1)).replace('*', '')
            
            if str(sr[np]).find(stri) != -1:
                full_match = True
            else:
                full_match = False
            start = str(sr[np]).find(stri[0:9])
            mappings_dict[dat.at[j,'urn']] = [np, start, dat.at[j, 'urn'], full_match]
            
        else:
            # No MANE transcript: fall back to the longest transcript sequence
            trans_lens = []
            for i in range(len(isect)):
                trans_lens.append(len(str(sr[isect[i]])))
            loc = trans_lens.index(max(trans_lens))
            nm = isect[loc]
    
            testquery = f"SELECT pro_ac FROM uta_20210129.associated_accessions WHERE tx_ac = '{nm}'"
            async def fetch_pro_ac():
                out = await utadb.execute_query(testquery)
                try:
                    return out[0]['pro_ac']
                except Exception:
                    # No usable row: hand back the raw result (compared with [] below)
                    return out
            np = asyncio.run(fetch_pro_ac())
            
            if np != []:
                oseq = dat.at[j, 'target_sequence']
            
                if len(set(str(oseq))) > 4:
                    stri = str(oseq)
                else:
                    oseq = Seq(oseq)
                    stri = str(oseq.translate(table=1)).replace('*', '')
                
                if str(sr[np]).find(stri) != -1:
                    full_match = True
                else:
                    full_match = False
                start = str(sr[np]).find(stri[0:9])
                mappings_dict[dat.at[j,'urn']] = [np, start, dat.at[j, 'urn'], full_match] 
mappings_dict

***Using Gene Database Endpoint: http://localhost:8000***




{'urn:mavedb:00000041-a-1': ['NP_938033.1',
  269,
  'urn:mavedb:00000041-a-1',
  True],
 'urn:mavedb:00000048-a-1': ['NP_003458.1',
  1,
  'urn:mavedb:00000048-a-1',
  True],
 'urn:mavedb:00000068-b-1': ['NP_000537.3',
  0,
  'urn:mavedb:00000068-b-1',
  False],
 'urn:mavedb:00000045-c-1': ['NP_000336.1',
  0,
  'urn:mavedb:00000045-c-1',
  True],
 'urn:mavedb:00000049-a-2': ['NP_005948.3',
  0,
  'urn:mavedb:00000049-a-2',
  True],
 'urn:mavedb:00000062-b-1': ['NP_000760.1',
  8,
  'urn:mavedb:00000062-b-1',
  True],
 'urn:mavedb:00000094-a-12': ['NP_004691.2',
  0,
  'urn:mavedb:00000094-a-12',
  True],
 'urn:mavedb:00000057-c-1': ['NP_005334.1',
  1,
  'urn:mavedb:00000057-c-1',
  True],
 'urn:mavedb:00000094-a-7': ['NP_004691.2',
  0,
  'urn:mavedb:00000094-a-7',
  True],
 'urn:mavedb:00000061-f-1': ['NP_002871.1',
  51,
  'urn:mavedb:00000061-f-1',
  True],
 'urn:mavedb:00000001-c-1': ['NP_008819.1',
  0,
  'urn:mavedb:00000001-c-1',
  True],
 'urn:mavedb:00000049-a-3': ['NP_0059

## Part 3: Transcript to VRS Variant

In [15]:
# VRS Variant Mapping - Coding Scoresets
#
# For each protein-coding scoreset that has an entry in mappings_dict, download its
# score table from MaveDB and translate every hgvs_pro value twice: once against
# the target sequence itself ('pre', offset 0) and once shifted by the offset
# stored in mappings_dict ('post'). Results go into vrs_mappings_dict as a frame
# with 'pre_mapping' and 'mapped' columns.
dp = SeqRepoDataProxy(sr = sr)
tr = Translator(data_proxy = dp)
qh = QueryHandler()
vrs_mappings_dict = {}

for i in range(len(dat.index)):
    if dat.at[i, 'target_type'] == 'Protein coding':
        if dat.at[i, 'urn'] in mappings_dict.keys():
            item = mappings_dict[dat.at[i, 'urn']]
            #if blat_check(i) == False:
             #   vrs_mappings_dict[dat.at[i, 'urn']] = 'BLAT hit not found on correct chromosome'
              #  continue
            string = 'https://www.mavedb.org/scoreset/' + item[2] + '/scores/'
            vardat = requests.get(string).content
            vardat = pd.read_csv(io.StringIO(vardat.decode('utf-8')), skiprows = 3, header = [1])
            np = item[0]
            
            offset = item[1]
            varm = vardat['hgvs_pro']

            var_ids_pre_map = []
            var_ids_post_map = []
        
            # Four or fewer distinct letters: translate the target (table 1) and
            # drop stop symbols; otherwise use the sequence unchanged.
            ts = dat.at[i, 'target_sequence']
            if len(set(str(ts))) <= 4:
                ts = Seq(ts)
                ts = str(ts.translate(table=1)).replace('*', '')

            for j in range(len(varm)):
                variant = varm[j]
                if not isinstance(variant, str):
                    # Missing hgvs_pro values are read as NaN; len() on them would raise.
                    continue
                if len(variant) == 3 or variant == '_wt' or variant == '_sy':
                    continue
                try:
                    # Each result is converted before the next translation runs,
                    # keeping the original call order.
                    pre = get_haplotype_allele(variant, np, 0, 'p', tr, dp, ts, 'pre')
                    if np.startswith('N') == True:
                        pre = pre.as_dict()
                    post = get_haplotype_allele(variant, np, offset, 'p', tr, dp, ts, 'post')
                    if np.startswith('N') == True:
                        post = post.as_dict()
                except Exception:
                    continue
                # Append only when both sides succeeded, so the two columns always
                # have the same length and stay row-aligned.
                var_ids_pre_map.append(pre)
                var_ids_post_map.append(post)
        
            tempdat = pd.DataFrame({'pre_mapping': var_ids_pre_map, 'mapped': var_ids_post_map})
            vrs_mappings_dict[dat.at[i, 'urn']] = tempdat
vrs_mappings_dict

***Using Gene Database Endpoint: http://localhost:8000***


{'urn:mavedb:00000041-a-1':                                             pre_mapping  \
 0     {'_id': 'ga4gh:VA.mEeUxPTsYR3nv2HatnR0fVPAN4wh...   
 1     {'_id': 'ga4gh:VA.FXAraUEJO3xdWq83lg9L_XgSwkNK...   
 2     {'_id': 'ga4gh:VA.FU1r-YnZOUngFiqwOjnttxkS_UWC...   
 3     {'_id': 'ga4gh:VA.4IqIvzuRDc1_dbZERqoMcO2owJI_...   
 4     {'_id': 'ga4gh:VA.mLrgCANFaxaL_w5H-HOqGOXxdHAN...   
 ...                                                 ...   
 3501  {'_id': 'ga4gh:VA.J3C6cHm747Gel6jEHVuBFjq20jVd...   
 3502  {'_id': 'ga4gh:VA.4cAPn_Ym9zeape35iStqIguN--7D...   
 3503  {'_id': 'ga4gh:VA.-SRcbASnO6P-_eUyTkCbUU0ObSIu...   
 3504  {'_id': 'ga4gh:VA.b9uqNCk5AoY2FOepvkpcPmTvUY0M...   
 3505  {'_id': 'ga4gh:VA.Rfe-JmdXfDPA0EZzXPs-rFmiRRu-...   
 
                                                  mapped  
 0     {'_id': 'ga4gh:VA.NXnq2B9h0OLVbKfqGRoDLkf4Mqc0...  
 1     {'_id': 'ga4gh:VA.DK5FkiArURtrxy1ub-2CuUt_95Uv...  
 2     {'_id': 'ga4gh:VA.EP5yJ0IeCjo8r3GCTaxEXDiGO_HC...  
 3     {'_id': 

In [16]:
# VRS variant mapping for non-protein-coding score sets.
#
# For every row of `dat` whose target_type is not 'Protein coding', download the
# score table from MaveDB and, for each usable hgvs_nt entry, call
# get_haplotype_allele twice: once with offset 0 ('pre') and once with the
# offset taken from the alignment ranges ('post'). The results are stored per
# URN in vrs_noncoding_mappings_dict as a DataFrame with two row-aligned
# columns, 'pre_mapping' and 'mapped'.
dp = SeqRepoDataProxy(sr = sr)
tr = Translator(data_proxy = dp)
qh = QueryHandler()
vrs_noncoding_mappings_dict = {}

for i in range(len(dat.index)):
    if dat.at[i, 'target_type'] == 'Protein coding':
        continue

    item = mave_blat_dict[dat.at[i, 'urn']]
    # NOTE: the blat_check(i) sanity check ("BLAT hit not found on correct
    # chromosome") is currently disabled for this cell.
    ranges = get_locs_list(item['hits'])[0]
    string = 'https://www.mavedb.org/scoreset/' + dat.at[i, 'urn'] + '/scores/'
    origdat = requests.get(string).content
    scores_csv = origdat.decode('utf-8')  # decode once, reuse for both parse attempts

    # First attempt: skip 13 leading lines. If the first column is then not
    # 'accession', re-parse skipping only 3 lines (presumably the metadata
    # header length differs between score sets -- TODO confirm).
    varsdat = pd.read_csv(io.StringIO(scores_csv), skiprows=13, header = [1])
    if varsdat.columns[0] != 'accession':
        varsdat = pd.read_csv(io.StringIO(scores_csv), skiprows=3, header = [1])
    ntlist = varsdat['hgvs_nt'].to_list()

    var_ids_pre_map = []
    var_ids_post_map = []
    ref = get_chr(dp, item['chrom'])
    # First element of the first range (presumably the start coordinate of the
    # alignment on the reference -- verify against get_locs_list).
    offset = ranges[0]
    ts = dat.at[i, 'target_sequence']

    for hgvs_nt in ntlist:
        # Missing values (e.g. NaN) are not strings and cannot be parsed.
        if not isinstance(hgvs_nt, str):
            continue
        # Skip entries containing '=' and the special '_wt' / '_sy' rows.
        if '=' in hgvs_nt or hgvs_nt == '_wt' or hgvs_nt == '_sy':
            continue
        try:
            # [2:] drops the first two characters of the HGVS string
            # (presumably the 'g.'/'n.' prefix -- TODO confirm).
            pre_allele = get_haplotype_allele(hgvs_nt[2:], ref, 0, 'g', tr, dp, ts, 'pre').as_dict()
            post_allele = get_haplotype_allele(hgvs_nt[2:], ref, offset, 'g', tr, dp, ts, 'post').as_dict()
        except Exception:
            # Best effort: variants that cannot be translated are skipped.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
            continue
        # Append only after BOTH translations succeeded so that the two lists
        # stay the same length and row k of each column is the same variant.
        var_ids_pre_map.append(pre_allele)
        var_ids_post_map.append(post_allele)

    tempdat = pd.DataFrame({'pre_mapping': var_ids_pre_map, 'mapped': var_ids_post_map})
    vrs_noncoding_mappings_dict[dat.at[i, 'urn']] = tempdat

vrs_noncoding_mappings_dict

***Using Gene Database Endpoint: http://localhost:8000***


{'urn:mavedb:00000023-a-1':                                            pre_mapping  \
 0    {'_id': 'ga4gh:VA.FLX5H8T-jpV9O46NJJbGrNyvIulK...   
 1    {'_id': 'ga4gh:VA.1HCzBKZmvLqX8_F46V1_pUjefI79...   
 2    {'_id': 'ga4gh:VA.QpQB1HAQ-j7aSoTBwPKsbvFVSKaZ...   
 3    {'_id': 'ga4gh:VA.LjMNKtRZpKvRQRtxMku21IaKCqyN...   
 4    {'_id': 'ga4gh:VA.C-jbg6GMr40hakB-a-8RqTz5bsSO...   
 ..                                                 ...   
 949  {'_id': 'ga4gh:VA.EwW_0o1NtIfFCuC7ni7xA88TA_U9...   
 950  {'_id': 'ga4gh:VA.BblMqZRqprpYmHVckaLwf8iTVzOq...   
 951  {'_id': 'ga4gh:VA.Zp7DSc4_OAT9Yq1l4rkWw5LerMFf...   
 952  {'_id': 'ga4gh:VA.80UINochkaTEconXXTj00b8p5B8E...   
 953  {'_id': 'ga4gh:VA.LNhB18OphEZNQL1-vgBD0mFfxRQY...   
 
                                                 mapped  
 0    {'_id': 'ga4gh:VA.A69lDO5F74TB-NJcnAdBcUOchBGE...  
 1    {'_id': 'ga4gh:VA.fStOAPck2JEFtHmyo7whAThEu1R8...  
 2    {'_id': 'ga4gh:VA.CW60XEws-HFT8CRYLT2e55cRiqLy...  
 3    {'_id': 'ga4gh:VA.HWC4QQN

In [17]:
# Variant Mapping Example - Coding and Noncoding
# Print the first pre-mapping / mapped entry of one score set from the coding
# mappings and of one score set from the noncoding mappings.
_example_sources = (
    (vrs_mappings_dict, 'urn:mavedb:00000041-a-1'),
    (vrs_noncoding_mappings_dict, 'urn:mavedb:00000018-a-1'),
)
for _mapping_table, _example_urn in _example_sources:
    ex = _mapping_table[_example_urn]
    for _column in ('pre_mapping', 'mapped'):
        print(ex.at[0, _column])

{'_id': 'ga4gh:VA.mEeUxPTsYR3nv2HatnR0fVPAN4whv_Vw', 'type': 'Allele', 'location': {'type': 'SequenceLocation', 'sequence_id': 'ga4gh:SQ.PyX9IDu95_tYLg1Jz9JpW5xpQkwn6bpB', 'interval': {'type': 'SequenceInterval', 'start': {'type': 'Number', 'value': 14}, 'end': {'type': 'Number', 'value': 15}}}, 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'H'}}
{'_id': 'ga4gh:VA.NXnq2B9h0OLVbKfqGRoDLkf4Mqc0tKOP', 'type': 'Allele', 'location': {'type': 'SequenceLocation', 'sequence_id': 'ga4gh:SQ.uJDQo_HaTNFL2-0-6K5dVzVcweigexye', 'interval': {'type': 'SequenceInterval', 'start': {'type': 'Number', 'value': 283}, 'end': {'type': 'Number', 'value': 284}}}, 'state': {'type': 'LiteralSequenceExpression', 'sequence': 'H'}}
{'_id': 'ga4gh:VA.H0UXotsEF90OQKPLEsiuRbUkHCWgaW0q', 'type': 'Allele', 'location': {'type': 'SequenceLocation', 'sequence_id': 'ga4gh:SQ.jUOcLPDjSqWFEo9kSOG8ITe1dr9QK3h6', 'interval': {'type': 'SequenceInterval', 'start': {'type': 'Number', 'value': 2}, 'end': {'type': 'Num