In [1]:
import pandas as pd
import pickle
import operator
import requests
import Bio
from Bio.SeqUtils import seq1
from Bio.Seq import Seq
import math
from math import isnan
import io
import re
import hgvs.parser
import hgvs.location
import hgvs.posedit
import hgvs.edit
import hgvs.sequencevariant
import hgvs.dataproviders.uta
from biocommons.seqrepo import SeqRepo
sr = SeqRepo("/usr/local/share/seqrepo/latest")
from ga4gh.vrs.extras.translator import Translator
from ga4gh.vrs.dataproxy import SeqRepoDataProxy
from ga4gh.vrs.utils.hgvs_tools import HgvsTools
import json

Removing allOf attribute from RepeatedSequenceExpression to avoid python-jsonschema-objects error.


In [2]:
# Read results/mave_dat.csv into a DataFrame.
# NOTE(review): mave_dat is not referenced by any other cell visible here —
# confirm it is still needed before keeping this load.
mave_dat = pd.read_csv('results/mave_dat.csv')

In [3]:
# Load the pickled dict of per-target alignment records; the displayed value
# shows it is keyed by MaveDB URN with a 'hits' table per entry.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load pickles produced by a trusted step of this pipeline.
with open('results/mave_blat.pickle', 'rb') as fn:
    mave_blat = pickle.load(fn)

In [4]:
# Display the loaded alignment records for a visual sanity check.
mave_blat

{'urn:mavedb:00000041-a-1': {'chrom': '20',
  'strand': 1,
  'target': 'Src catalytic domain',
  'target_type': 'Protein coding',
  'uniprot': 'P12931',
  'coverage': '750 / 750, 100.0',
  'identity': 99.86666666666666,
  'hits':   query_ranges           hit_ranges
  0       [0:52]  [37397802:37397854]
  1     [52:232]  [37400114:37400294]
  2    [232:309]  [37401601:37401678]
  3    [309:463]  [37402434:37402588]
  4    [463:595]  [37402748:37402880]
  5    [595:750]  [37403170:37403325]},
 'urn:mavedb:00000048-a-1': {'chrom': '2',
  'strand': -1,
  'target': 'CXCR4',
  'target_type': 'Protein coding',
  'uniprot': 'P61073',
  'coverage': '1041 / 1053, 98.86039886039886',
  'identity': 100.0,
  'hits':   query_ranges             hit_ranges
  0    [12:1053]  [136114871:136115912]},
 'urn:mavedb:00000068-b-1': {'chrom': '17',
  'strand': -1,
  'target': 'TP53 (P72R)',
  'target_type': 'Protein coding',
  'uniprot': nan,
  'coverage': '1180 / 1182, 99.83079526226734',
  'identity': 99.91

In [5]:
# Load the pickled object stored in results/scores_coding.pickle.
# NOTE(review): presumably scores for protein-coding targets — contents are
# not shown in this notebook; confirm. pickle.load must only be used on
# trusted files because it can execute arbitrary code.
with open('results/scores_coding.pickle', 'rb') as fn:
    scores_coding = pickle.load(fn)
    
# Load the pickled object stored in results/scores_noncoding.pickle.
# NOTE(review): presumably scores for non-coding targets — contents are not
# shown in this notebook; confirm. pickle.load must only be used on trusted
# files because it can execute arbitrary code.
with open('results/scores_noncoding.pickle', 'rb') as fn:
    scores_noncoding = pickle.load(fn)

In [6]:
def get_hits_list(hitsdat):
    """Parse the query-side ranges of a hits table into integer pairs.

    Parameters
    ----------
    hitsdat : pandas.DataFrame
        Table with a 'query_ranges' column whose entries are strings
        formatted like '[0:52]'.

    Returns
    -------
    list of [int, int]
        One [start, end] pair per row, in row order.
    """
    # Iterate over the column values instead of calling .at[i, ...] with
    # i in range(len(index)): .at is label-based, so positional integers only
    # work when the frame carries a default 0..n-1 index and raise KeyError
    # otherwise (e.g. after filtering or sorting the table).
    return [[get_start(rng), get_end(rng)] for rng in hitsdat['query_ranges']]

def get_locs_list(hitsdat):
    """Parse the hit-side ranges of a hits table into integer pairs.

    Parameters
    ----------
    hitsdat : pandas.DataFrame
        Table with a 'hit_ranges' column whose entries are strings
        formatted like '[37397802:37397854]'.

    Returns
    -------
    list of [int, int]
        One [start, end] pair per row, in row order.
    """
    # Iterate over the column values instead of calling .at[i, ...] with
    # i in range(len(index)): .at is label-based, so positional integers only
    # work when the frame carries a default 0..n-1 index and raise KeyError
    # otherwise (e.g. after filtering or sorting the table).
    return [[get_start(rng), get_end(rng)] for rng in hitsdat['hit_ranges']]

def get_start(string):
    """Return the integer before the first ':' of a range string like '[0:52]'.

    Any '[' characters surrounding that leading piece are removed before the
    integer conversion.
    """
    head, _, _ = string.partition(':')
    return int(head.strip('['))

def get_end(string):
    """Return the integer after the first ':' of a range string like '[0:52]'.

    Any ']' characters surrounding that second piece are removed before the
    integer conversion. A string without ':' raises IndexError.
    """
    pieces = string.split(':')
    tail = pieces[1]
    return int(tail.strip(']'))

def exonic_overlap(start, end, hits_list):
    """Report whether any hit begins inside the closed interval [start, end].

    Parameters
    ----------
    start, end : int
        Bounds of the interval to test (both inclusive).
    hits_list : list of [int, int]
        Hit ranges; only the first element (the hit start) of each is used.

    Returns
    -------
    bool
        False whenever start is 0; otherwise True if at least one hit start
        lies within [start, end].
    """
    if start == 0:
        return False
    return any(start <= hit[0] <= end for hit in hits_list)

In [6]:
# For every score set, collect the indices of mapped scores whose pre-mapped
# reference sequence differs from the post-mapped reference sequence (also
# allowing for the reverse complement), and count every allele examined.
# `json` comes from the notebook's import cell.
diff_vars_dict = {}
var_count = 0

for key in mave_blat:
    # No mapping for these sequences
    if key in ('urn:mavedb:00000072-a-1', 'urn:mavedb:00000105-a-1'):
        continue

    # key[11:] drops the leading 'urn:mavedb:' (11 characters) to get the file
    # stem. The context manager closes the handle once the JSON is parsed.
    with open(f'mappings/{key[11:]}.json') as f:
        dat = json.load(f)
    dat = dat['mapped_scores']

    # Register the list up front so score sets with no mapped scores still get
    # an (empty) entry instead of being silently absent from the dict.
    diff_vars = []
    diff_vars_dict[key] = diff_vars

    for j in range(len(dat)):
        pre_mapped = dat[j]['pre_mapped']
        post_mapped = dat[j]['post_mapped']

        # A record either is a single allele or carries a 'members' list of
        # alleles; pair pre/post members by position.
        if 'members' not in pre_mapped:
            allele_pairs = [(pre_mapped, post_mapped)]
        else:
            allele_pairs = [
                (pre_mapped['members'][k], post_mapped['members'][k])
                for k in range(len(pre_mapped['members']))
            ]

        for pre_allele, post_allele in allele_pairs:
            var_count += 1
            seq_pre = pre_allele['vrs_ref_allele_seq']
            seq_post = post_allele['vrs_ref_allele_seq']
            seq_pre_rv = str(Seq(seq_pre).reverse_complement())

            # The index j is appended once per mismatching allele, so a
            # record with several mismatching members appears several times.
            if seq_pre != seq_post and seq_post != seq_pre_rv:
                diff_vars.append(j)
var_count

2499036

In [7]:
# Display, per score set URN, the indices of mapped scores with mismatching
# reference sequences.
diff_vars_dict

{'urn:mavedb:00000041-a-1': [],
 'urn:mavedb:00000048-a-1': [],
 'urn:mavedb:00000068-b-1': [7738,
  7739,
  7740,
  7741,
  7742,
  7743,
  7744,
  7745,
  7746,
  7747,
  7748,
  7749,
  7750,
  7751,
  7752,
  7753,
  7754,
  7755,
  7756,
  7757,
  7758],
 'urn:mavedb:00000045-c-1': [],
 'urn:mavedb:00000018-a-1': [],
 'urn:mavedb:00000099-a-1': [],
 'urn:mavedb:00000001-c-1': [177,
  499,
  549,
  726,
  765,
  787,
  905,
  1019,
  1045,
  1047,
  1149,
  1236,
  1291,
  1483,
  1709,
  1714,
  1750,
  1866,
  2020,
  2060,
  2116,
  2118,
  2250,
  2412,
  2436,
  2459,
  2486,
  2539,
  2585,
  2709,
  2738,
  2834,
  3003,
  3009,
  3021,
  3233,
  3381,
  3393,
  3459,
  3541,
  3603,
  3792,
  3798,
  3957,
  4091,
  4120,
  4149,
  4334,
  4396,
  4465,
  4487,
  4512,
  4676,
  4677,
  4851,
  4990,
  5035,
  5083,
  5096,
  5115,
  5265,
  5354,
  5380,
  5665,
  5777],
 'urn:mavedb:00000049-a-3': [],
 'urn:mavedb:00000050-a-1': [],
 'urn:mavedb:00000061-i-1': [],
 'urn:m

In [39]:
# Classify every allele whose pre- and post-mapped reference sequences differ
# as either an exon-boundary error (exon_err) or a non-homology error
# (non_homolog_err). `json` comes from the notebook's import cell.


def _segment_index(position, segments):
    """Return the index of the first [start, end) segment containing position, or -1."""
    for idx, (seg_start, seg_end) in enumerate(segments):
        if seg_start <= position < seg_end:
            return idx
    return -1


def _crosses_segments(allele, segments):
    """Return True when the allele's interval start and end land in different segments.

    A position found in no segment gets index -1, so an allele lying entirely
    outside all segments is reported as not crossing.

    NOTE(review): interval end values look exclusive (in the mapping records
    end - start equals the reference allele length), so an allele that ends
    exactly on a segment boundary is located by the position just past its
    last base and may be reported as crossing. Confirm whether end - 1 should
    be used here.
    """
    interval = allele['variation']['location']['interval']
    start_idx = _segment_index(interval['start']['value'], segments)
    end_idx = _segment_index(interval['end']['value'], segments)
    return start_idx != end_idx


def _iter_allele_pairs(score):
    """Yield (pre_mapped, post_mapped) allele pairs for one mapped score record.

    Records without a 'members' list yield a single pair; otherwise members
    are paired by position.
    """
    pre_mapped = score['pre_mapped']
    if 'members' not in pre_mapped:
        yield pre_mapped, score['post_mapped']
    else:
        for k in range(len(pre_mapped['members'])):
            yield pre_mapped['members'][k], score['post_mapped']['members'][k]


exon_err = []
non_homolog_err = []
var_count = 0

for key in mave_blat:
    # No mapping for these sequences
    if key in ('urn:mavedb:00000072-a-1', 'urn:mavedb:00000105-a-1'):
        continue

    print(key)
    # key[11:] drops the leading 'urn:mavedb:' (11 characters) to get the file
    # stem. The context manager closes the handle once the JSON is parsed.
    with open(f'mappings/{key[11:]}.json') as f:
        dat = json.load(f)
    seq_acc = dat['mapped_reference_sequence']['sequence_accessions'][0]
    dat = dat['mapped_scores']
    hits = get_hits_list(mave_blat[key]['hits'])  # ranges on the target sequence
    locs = get_locs_list(mave_blat[key]['hits'])  # ranges on the reference sequence

    for score in dat:
        for pre_allele, post_allele in _iter_allele_pairs(score):
            var_count += 1
            seq_pre = pre_allele['vrs_ref_allele_seq']
            seq_post = post_allele['vrs_ref_allele_seq']
            seq_pre_rv = str(Seq(seq_pre).reverse_complement())

            # Sequences agree directly or via reverse complement: no error.
            if seq_pre == seq_post or seq_post == seq_pre_rv:
                continue

            if "NP" in seq_acc:
                # Accessions containing "NP" skip the segment check: every
                # mismatch is recorded in non_homolog_err.
                # NOTE(review): the original comment called these "homologous
                # errors" while appending to non_homolog_err; confirm wording.
                non_homolog_err.append(score['mavedb_id'])
                continue

            # Evaluate both sides before branching so a malformed record fails
            # the same way regardless of the pre-mapped result.
            crosses_pre = _crosses_segments(pre_allele, hits)
            crosses_post = _crosses_segments(post_allele, locs)
            if crosses_pre or crosses_post:
                exon_err.append(score['mavedb_id'])
            else:
                non_homolog_err.append(score['mavedb_id'])


urn:mavedb:00000041-a-1
urn:mavedb:00000048-a-1
urn:mavedb:00000068-b-1
urn:mavedb:00000045-c-1
urn:mavedb:00000018-a-1
urn:mavedb:00000099-a-1
urn:mavedb:00000001-c-1
urn:mavedb:00000049-a-3
urn:mavedb:00000050-a-1
urn:mavedb:00000061-i-1
urn:mavedb:00000083-b-1
urn:mavedb:00000094-a-5
urn:mavedb:00000043-a-2
urn:mavedb:00000055-0-1
urn:mavedb:00000104-a-2
urn:mavedb:00000005-a-6
urn:mavedb:00000083-h-1
urn:mavedb:00000098-a-1
urn:mavedb:00000108-a-2
urn:mavedb:00000001-a-4
urn:mavedb:00000094-a-6
urn:mavedb:00000078-a-1
urn:mavedb:00000103-c-1
urn:mavedb:00000043-a-1
urn:mavedb:00000061-d-1
urn:mavedb:00000046-a-1
urn:mavedb:00000081-a-1
urn:mavedb:00000097-r-1
urn:mavedb:00000067-a-1
urn:mavedb:00000001-c-2
urn:mavedb:00000049-a-7
urn:mavedb:00000060-a-2
urn:mavedb:00000058-a-1
urn:mavedb:00000045-k-1
urn:mavedb:00000094-a-14
urn:mavedb:00000069-a-2
urn:mavedb:00000106-b-1
urn:mavedb:00000046-a-2
urn:mavedb:00000069-a-1
urn:mavedb:00000034-b-1
urn:mavedb:00000049-a-8
urn:mavedb:0000

In [25]:
# Inspect a single mapped score record (index 342) from one mapping file.
# The context manager closes the file handle that a bare open() left dangling.
with open('mappings/00000005-a-6.json') as f:
    dat = json.load(f)
dat['mapped_scores'][342]

{'pre_mapped': {'id': 'ga4gh:VA.Cq7Wt4bMXRMANWoXql2ZIfyUGQyt7V52',
  'type': 'VariationDescriptor',
  'variation': {'id': 'ga4gh:VA.Cq7Wt4bMXRMANWoXql2ZIfyUGQyt7V52',
   'type': 'Allele',
   'location': {'id': None,
    'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.sVMC1jmTXRvuzBCDJ8aoBmZ_Uu35YFj7',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 1551},
     'end': {'type': 'Number', 'value': 1554}}},
   'state': {'type': 'LiteralSequenceExpression', 'sequence': 'GGA'}},
  'vrs_ref_allele_seq': 'TAC'},
 'post_mapped': {'id': 'ga4gh:VA.je7Peb5yaXGXu_0wizFY-XJkqOva5TC9',
  'type': 'VariationDescriptor',
  'variation': {'id': 'ga4gh:VA.je7Peb5yaXGXu_0wizFY-XJkqOva5TC9',
   'type': 'Allele',
   'location': {'id': None,
    'type': 'SequenceLocation',
    'sequence_id': 'ga4gh:SQ.5ZUqxCmDDgN4xTRbaSjN8LwgZironmB8',
    'interval': {'type': 'SequenceInterval',
     'start': {'type': 'Number', 'value': 43056800},
     'end': {'type': 'Number', 

In [8]:
# Total number of entries recorded in diff_vars_dict across all score sets.
var_count = 0
for key, mismatched in diff_vars_dict.items():
    var_count += len(mismatched)
print(var_count)

# 2499036 is the allele total printed by the counting cells of this notebook;
# the expression below is the percentage of alleles with a mismatch.
total_allele_pairs = 2499036
100 * var_count / total_allele_pairs

24878


0.9955038662908418

In [49]:
# Determine total number of paired VRS alleles in data set.
# var_count counts mapped score records; allele_count counts individual
# alleles (one per record, or one per member when a record has 'members').
allele_count = 0
var_count = 0
allele_ac_list_pre = []
allele_ac_list_post = []
for key in mave_blat:
    # No mapping exists for these two score sets.
    if key in ('urn:mavedb:00000072-a-1', 'urn:mavedb:00000105-a-1'):
        continue

    # key[11:] drops the leading 'urn:mavedb:' (11 characters) to get the file
    # stem. The context manager closes the handle once the JSON is parsed.
    with open(f'mappings/{key[11:]}.json') as f:
        dat = json.load(f)
    dat = dat['mapped_scores']

    for j in range(len(dat)):
        var_count += 1
        pre_mapped = dat[j]['pre_mapped']
        post_mapped = dat[j]['post_mapped']
        if 'members' not in post_mapped:
            allele_count += 1
            allele_ac_list_pre.append(pre_mapped['id'])
            allele_ac_list_post.append(post_mapped['id'])
        else:
            # Pair pre/post members by position.
            for k in range(len(post_mapped['members'])):
                allele_count += 1
                allele_ac_list_pre.append(pre_mapped['members'][k]['id'])
                allele_ac_list_post.append(post_mapped['members'][k]['id'])

print(var_count)
print(allele_count)
print(len(set(allele_ac_list_pre)))   # distinct pre-mapped allele ids
print(len(set(allele_ac_list_post)))  # distinct post-mapped allele ids

1586270
2499036
363002
349769


In [9]:
# Map each post-mapped allele id to the list of distinct pre-mapped allele ids
# that were mapped onto it (in first-seen order).
allele_list_dict = {}
for key in mave_blat:
    # No mapping exists for these two score sets.
    if key in ('urn:mavedb:00000072-a-1', 'urn:mavedb:00000105-a-1'):
        continue

    # key[11:] drops the leading 'urn:mavedb:' (11 characters) to get the file
    # stem. The context manager closes the handle once the JSON is parsed.
    with open(f'mappings/{key[11:]}.json') as f:
        dat = json.load(f)
    dat = dat['mapped_scores']

    for j in range(len(dat)):
        pre_mapped = dat[j]['pre_mapped']
        post_mapped = dat[j]['post_mapped']

        # A record is either a single allele or a 'members' list of alleles;
        # pre/post members are paired by position.
        if 'members' not in post_mapped:
            allele_pairs = [(pre_mapped, post_mapped)]
        else:
            allele_pairs = [
                (pre_mapped['members'][k], post_member)
                for k, post_member in enumerate(post_mapped['members'])
            ]

        for pre_allele, post_allele in allele_pairs:
            # setdefault returns the stored list, so appending mutates the
            # dict entry in place; no re-assignment is needed.
            pre_ids = allele_list_dict.setdefault(post_allele['id'], [])
            if pre_allele['id'] not in pre_ids:
                pre_ids.append(pre_allele['id'])

In [18]:
# Number of post-mapped allele ids that collect more than one distinct
# pre-mapped allele id.
count = sum(1 for pre_ids in allele_list_dict.values() if len(pre_ids) > 1)
print(len(allele_list_dict))
print(count)

349769
9391


In [22]:
# Flatten every list of pre-mapped allele ids into one list, then report how
# many distinct ids it contains.
pre_mapped_digests = []
for pre_ids in allele_list_dict.values():
    pre_mapped_digests.extend(pre_ids)
len(set(pre_mapped_digests))

363002