In [93]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt
import pickle
import fileinput
import sys
from my_JSD import KLdiv, JSdiv, cons
from array import *
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Reading the Zinc-finger domain data
curr_dir = !pwd
my_path = curr_dir[0]+"/hmm_domains/"
filename = "zf-C2H2.csv"
#filename = "Homeobox.csv"
domain_data = pd.read_csv(my_path+filename, sep='\t', index_col=0)
#Sort the zinc finger data
sorted_domain_data = domain_data.sort_values(by=["chrom_num", "ensembl_id", "TargetStart"])
sorted_domain_data = sorted_domain_data.reset_index(drop=True)

In [3]:
#Get the canonic protein id for Zinc domain
#The loaded object is indexed by ensembl gene id further down (canonic_protein[ens_gene]).
#NOTE(review): pickle.load can execute arbitrary code - only open .pik files that come from a trusted source.
with open(my_path+'zinc_canonic_prot.pik', 'rb') as handle:
    canonic_protein = pickle.load(handle)

In [4]:
#Get the frameshifts index and length of the exons
#The loaded object is indexed by targetid+".exons.txt"; correct_exons_frameshift reads elements [0], [1] and [2] of each entry.
#NOTE(review): pickle.load can execute arbitrary code - only open .pik files that come from a trusted source.
with open(curr_dir[0]+"/from_shilpa/exons_seqs/exons_index_length.pik", 'rb') as handle:
    exons_frameshifts = pickle.load(handle)

In [5]:
#Names of the chromosomes that are analysed: "1".."22" followed by "X" and "Y"
chromosome_names = [str(autosome) for autosome in range(1, 23)] + ["X", "Y"]

In [6]:
#Count the domain rows whose chromosome is not one of the names in chromosome_names
chrom_names_list = sorted_domain_data["chrom_num"].tolist()
starnge_chrom_sum = sum(1 for name in chrom_names_list if name not in chromosome_names)
print("Leaving outside "+str(starnge_chrom_sum)+" out of "+str(len(chrom_names_list)))

Leaving outside 74 out of 10492


In [7]:
def correct_exons_frameshift(exon_df, targetid):
    """
    Stretch one exon of exon_df, in place, according to the frameshift record of targetid.

    The record is looked up in the global exons_frameshifts under the key
    targetid+".exons.txt". Element [0] is compared against a running, 1-based
    count of exon base pairs; element [1] is the number of base pairs to add.

    exon_df: table with "start_pos" and "end_pos" columns (values convertible with int()).
    targetid: identifier used to build the exons_frameshifts key.

    Only the first exon that matches is changed:
      - if the index is not beyond the first base pair of the exon, "start_pos" is decreased;
      - otherwise, if the index falls inside the exon, "end_pos" is increased.
    Nothing is returned. A KeyError is raised when targetid has no frameshift record.
    """
    frameshift = exons_frameshifts[targetid+".exons.txt"]
    idx = frameshift[0]
    length = frameshift[1]

    #Find the exon we need to add bps to
    first_bp_count = 1
    for index, exon in exon_df.iterrows():
        #Columns are read by name (not by position), like the rest of the exon-table code
        ex_start = int(exon["start_pos"])
        ex_end = int(exon["end_pos"])
        exon_len = ex_end - ex_start + 1
        if idx <= first_bp_count:
            #.at replaces the deprecated DataFrame.set_value
            exon_df.at[index, "start_pos"] = ex_start - length
            break
        elif idx <= first_bp_count + exon_len - 1:
            exon_df.at[index, "end_pos"] = ex_end + length
            break
        first_bp_count += exon_len

In [8]:
#A function that gets the chromosome raw data from the hmmer results and returns a table of the exons
def create_exon_pos_table(chrom_raw, targetid):
    """
    Parse the raw location string of a hit into a table with one row per exon.

    chrom_raw: location text. The parsing below handles an optional "complement(...)"
               wrapper, an optional "join(a..b,c..d)" list, and, for a single exon,
               text of the form "<x>:<y>:a..b" (everything after the second ":" is used).
    targetid:  identifier passed to correct_exons_frameshift when the list holds a
               frameshift marker (an element starting with "-").

    Returns a DataFrame with the columns:
      start_pos, end_pos  - exon boundaries as they appear in chrom_raw (text, unless
                            correct_exons_frameshift rewrote them)
      length              - end_pos - start_pos + 1
      first_bp_count      - 1-based running count of the base pairs of the preceding rows
    """
    exons_raw = chrom_raw

    #Removing the complement brackets if they exist
    complement_at = exons_raw.find("complement(")
    if complement_at >= 0:
        exons_raw = exons_raw[complement_at+len("complement("):-1]

    #Removing the join brackets if they exist
    join_at = exons_raw.find("join(")
    if join_at >= 0:
        exons_raw = exons_raw[join_at+len("join("):-1]

    #In case there's only one exon, take everything after the second ":".
    #Both searches run on exons_raw itself: offsets taken from chrom_raw are not valid
    #once the "complement(" prefix has been cut off.
    else:
        second_colon = exons_raw.find(":", exons_raw.find(":")+1)
        exons_raw = exons_raw[second_colon+1:]

    exon_pos = []
    frameshift_flag = False
    for ex in exons_raw.split(","):
        #flag cases where Shilpa added "-" to a position number to signify frameshift in the sequences
        if ex.startswith("-"):
            frameshift_flag = True
            continue

        #Adding the real exons to the exon_pos list
        exon_pos.append(ex.split(".."))

    #Creating a table for the start and end of exons
    exon_df = pd.DataFrame(exon_pos)
    exon_df.columns = ["start_pos", "end_pos"]

    #Correct frameshift if a frameshift exists
    if frameshift_flag:
        correct_exons_frameshift(exon_df, targetid)

    #Columns are read by name; integer keys on a name-indexed row are deprecated in pandas
    exon_len = [int(end) - int(start) + 1
                for start, end in zip(exon_df["start_pos"], exon_df["end_pos"])]
    exon_df["length"] = exon_len

    #Running position of the first bp of every exon
    first_bp_list = []
    first_bp_count = 1
    for one_len in exon_len:
        first_bp_list.append(first_bp_count)
        first_bp_count += one_len
    exon_df["first_bp_count"] = first_bp_list
    return exon_df

In [9]:
#A function that gets a chromosome position and a table of exons, and returns the protein position or -1 if it's not within any exon
def find_protein_pos(chrom_pos, exon_df, chrom_raw):
    """
    Map a chromosome position to a 1-based protein position.

    chrom_pos: chromosome position to look up.
    exon_df:   exon table with "start_pos", "end_pos" and "first_bp_count" columns
               (as produced by create_exon_pos_table).
    chrom_raw: raw location text; when it contains "complement" the offset inside the
               exon is measured from end_pos downwards, otherwise from start_pos upwards.

    Returns ceil(transcript_position / 3), or -1 when chrom_pos is inside none of the exons.
    """
    on_complement = chrom_raw.find("complement") >= 0

    for index, exon in exon_df.iterrows():
        #Columns are read by name, consistent with find_chrom_bps
        start_pos = int(exon["start_pos"])
        end_pos = int(exon["end_pos"])
        if not (start_pos <= chrom_pos <= end_pos):
            continue

        #Reverse complement strand: the protein is translated from the end position towards the start position of the exon
        if on_complement:
            len_from_exon_start = end_pos - chrom_pos
        #Forward strand
        else:
            len_from_exon_start = chrom_pos - start_pos

        #Position on the mRNA transcript
        transcript_pos = len_from_exon_start + int(exon["first_bp_count"])

        #Position on the protein sequence
        return int(math.ceil(float(transcript_pos)/3))

    #If the position wasn't in the regions of any exon
    return -1

In [10]:
#A function that gets a protein position and a table of exons, and returns the chromosome positions of the corresponding codon
def find_chrom_bps(protein_pos, exon_table, chrom_raw_data):
    """
    Return the chromosome positions (1st, 2nd, 3rd) of the codon of protein_pos.

    exon_table is read through its "first_bp_count", "length", "start_pos" and "end_pos"
    columns. When the codon runs over the edge of its exon, the remaining bps are taken
    from the row labelled index+1. When chrom_raw_data contains "complement" the codon is
    walked from end_pos downwards, otherwise from start_pos upwards.
    Nothing is returned (None) when no exon holds the first bp of the codon.
    """
    #mRNA transcript index of the 1st bp in the triplet
    codon_first_bp = 3*protein_pos - 2
    on_complement = chrom_raw_data.find("complement") >= 0

    for row_label, row in exon_table.iterrows():
        exon_first_bp = int(row["first_bp_count"])
        exon_bps = int(row["length"])
        exon_last_bp = exon_first_bp + exon_bps - 1

        #Skip exons that don't hold the first bp of the codon
        if not (exon_first_bp <= codon_first_bp <= exon_last_bp):
            continue

        low_edge = int(row["start_pos"])
        high_edge = int(row["end_pos"])
        offset_in_exon = codon_first_bp - exon_first_bp

        if on_complement:
            #Reverse complement strand: walking from the end position towards the start position
            first = high_edge - offset_in_exon

            second = first - 1
            if second < low_edge:
                #The exon ends here: continue from the end of the next exon
                row_label += 1
                second = int(exon_table["end_pos"][row_label])
                low_edge = int(exon_table["start_pos"][row_label])
                high_edge = int(exon_table["end_pos"][row_label])

            third = second - 1
            if third < low_edge:
                #The exon ends here: continue from the end of the next exon
                row_label += 1
                third = int(exon_table["end_pos"][row_label])
        else:
            #Forward strand: walking from the start position towards the end position
            first = low_edge + offset_in_exon

            second = first + 1
            if second > high_edge:
                #The exon ends here: continue from the start of the next exon
                row_label += 1
                second = int(exon_table["start_pos"][row_label])
                low_edge = int(exon_table["start_pos"][row_label])
                high_edge = int(exon_table["end_pos"][row_label])

            third = second + 1
            if third > high_edge:
                #The exon ends here: continue from the start of the next exon
                row_label += 1
                third = int(exon_table["start_pos"][row_label])

        return (first, second, third)
        

In [11]:
#Boolean function - tells whether a given text can be converted to a number
def is_number(s):
    """Return True when float(s) succeeds and False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

In [12]:
#A function that returns the hmm state of that protein position
# returns -1 for positions outside of domains regions, -2 for matching insertion
#TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
def protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table):
    """
    Find the HMM state and the amino acid of a protein position.

    domain_gene_table: one row per domain instance, with the columns
        TargetStart / TargetEnd - first and last protein position of the instance
        Target_Seq              - aligned sequence of the instance, "-" marks a deletion
        HMM_Pos                 - comma separated text with one entry per Target_Seq column
    The first row whose TargetStart..TargetEnd range holds protein_pos is used.

    Returns (hmm_state, aa):
        hmm_state is the integer found in HMM_Pos for the residue, or -2 when that entry
        is not an integer (the position matches an insertion);
        aa is the upper-cased residue.
    Returns (-1, '-') when protein_pos is in none of the ranges.
    Raises IndexError when Target_Seq holds fewer residues than the range requires.
    """
    for index, row in domain_gene_table.iterrows():
        target_start = row["TargetStart"]
        target_end = row["TargetEnd"]

        #Check if the position is inside this domain instance of the gene
        if not (target_start <= protein_pos <= target_end):
            continue

        hmm_pos = (row["HMM_Pos"]).split(",")
        target_seq = row["Target_Seq"]

        #Walk the alignment and count residues only: every "-" (deletion) column is skipped,
        #wherever it is. Scanning a fixed number of columns misses deletions that lie between
        #the uncorrected and the corrected index, and lands on an earlier residue.
        residues_to_skip = protein_pos - target_start
        column = None
        for j, symbol in enumerate(target_seq):
            if symbol == "-":
                continue
            if residues_to_skip == 0:
                column = j
                break
            residues_to_skip -= 1
        if column is None:
            raise IndexError("Target_Seq is shorter than the TargetStart-TargetEnd range")

        #Find the HMM match state; an entry that is not an integer marks an insertion
        try:
            hmm_state = int(hmm_pos[column])
        except ValueError:
            hmm_state = -2

        return (hmm_state, target_seq[column].upper())

    #The protein position isn't in any domain region
    return (-1, '-')

In [13]:
#A function that creates the new codon for the alteration
def create_alt_codon(exac_ref_bp, curr_alt_bp, ref_codon, alt_codon_pos, chrom_raw_data):
    """
    Build the altered codon by substituting the alteration into ref_codon.

    exac_ref_bp:    reference bps of the alteration as given by ExAC
    curr_alt_bp:    alternative bps of the alteration as given by ExAC
    ref_codon:      reference codon retrieved from hg19
    alt_codon_pos:  0-based index inside ref_codon where the substitution starts
    chrom_raw_data: raw location text; when it contains "complement" both alleles are
                    reverse-complemented before they are used

    Returns ref_codon with len(exac_ref_bp) characters, starting at alt_codon_pos,
    replaced by the (possibly reverse-complemented) alternative bps.
    A message is printed when neither the reference allele nor ref_codon contains the other.
    """
    #For error logging
    functionNameAsString = sys._getframe().f_code.co_name

    complement_of = {'A': 'T', 'T': 'A', 'G': 'C'}

    def reverse_complement(bps):
        #Any letter other than A/T/G is mapped to 'G'.
        #The string is reversed once, as a whole: reversing after every appended letter
        #gives the same result for 1-2 letters but scrambles longer alleles.
        return "".join(complement_of.get(c.upper(), 'G') for c in reversed(bps))

    #Complement strand - converting the alleles to the reverse complement
    if chrom_raw_data.find("complement") >= 0:
        new_bp = reverse_complement(curr_alt_bp)
        exac_ref_bp_adj = reverse_complement(exac_ref_bp)

    #Regular strand
    else:
        new_bp = curr_alt_bp
        exac_ref_bp_adj = exac_ref_bp

    #Validation: making sure the ref bp from ExAC is inside the ref codon sequence retrieved from hg19 or the other way around (at least one contain the other)
    if (ref_codon.find(exac_ref_bp_adj) == -1 and exac_ref_bp_adj.find(ref_codon) == -1):
        print(functionNameAsString+" Error: ExAC ref sequence "+exac_ref_bp_adj+" isn't found in hg19 retrieved codon sequence "+ref_codon)

    new_alt_codon = ref_codon[:alt_codon_pos]+new_bp+ref_codon[alt_codon_pos+len(exac_ref_bp_adj):]

    return new_alt_codon

In [14]:
#Standard genetic code: DNA codon -> one letter amino acid ('*' marks a stop codon).
#_codon_amino_acids lists the translation of the 64 codons in the order obtained by looping
#over the 1st, 2nd and 3rd base, each one running through T, C, A, G (TTT, TTC, TTA, TTG, TCT, ...).
_codon_bases = "TCAG"
_codon_amino_acids = (
    "FFLLSSSSYY**CC*W"   #T..
    "LLLLPPPPHHQQRRRR"   #C..
    "IIIMTTTTNNKKSSRR"   #A..
    "VVVVAAAADDEEGGGG"   #G..
)
codon_table = dict(zip(
    (b1+b2+b3 for b1 in _codon_bases for b2 in _codon_bases for b3 in _codon_bases),
    _codon_amino_acids
))

In [15]:
#Retrieve the codon base-parirs from the ref sequence
def retrieve_codon_seq(chrom_pos_list, chrom_raw_data, chrom):
    chromsome_name = "chr"+chrom
    chrom_pos_1st = chrom_pos_list[0]
    chrom_pos_2nd = chrom_pos_list[1]
    chrom_pos_3rd = chrom_pos_list[2]
    
    if (chrom_pos_1st < chrom_pos_3rd):
        chrom_pos_list_adj = chrom_pos_list
    else:
        chrom_pos_list_adj = reversed(chrom_pos_list) #For reverse strand - order is reveresed
        
    seq = ""
    for chrom_pos in chrom_pos_list:
        seq_start = chrom_pos - 1
        query = !./twoBitToFa hg19.2bit stdout -seq=$chromsome_name -start=$seq_start -end=$chrom_pos
        seq = seq+query[1]
    
    #Complement strand - transversing the bp to base-complement
    if (chrom_raw_data.find("complement") >= 0):
        complement_seq = []
        for c in seq:
            if (c.upper() == 'A'):
                complement_seq.append('T')
            elif (c.upper() == 'T'):
                complement_seq.append('A')
            elif (c.upper() == 'G'):
                complement_seq.append('C')
            else:
                complement_seq.append('G')
        seq = complement_seq[0]+complement_seq[1]+complement_seq[2]
        
    return seq

In [16]:
#Return codes of is_indel
NO_INDEL, FRAME_SHIFT_INDEL, IN_FRAME_INDEL = range(3)

In [17]:
#A function that classifies an alteration, according to ref and alt, as frameshift indel, in-frame indel or no indel
def is_indel(ref, alt, chrom_alter):
    """
    Classify an alteration by the difference between the lengths of ref and alt.

    ref, alt:    reference and alternative bps of the alteration
    chrom_alter: the ExAC entry of the alteration. It is not read at the moment; the
                 cross-check against its "conseq" field is disabled.

    Returns NO_INDEL when the lengths are equal, FRAME_SHIFT_INDEL when the difference
    is not a multiple of 3, and IN_FRAME_INDEL otherwise.
    """
    #abs() gives the indel length for deletions and for insertions alike
    indel_len = abs(len(ref) - len(alt))

    #Not an indel
    if indel_len == 0:
        return NO_INDEL

    #Frameshift indel
    if indel_len % 3 > 0:
        return FRAME_SHIFT_INDEL

    #In frame indel
    return IN_FRAME_INDEL

In [18]:
# A function that returns the indices in which two strings differ
def diff(a, b):
    """Return the list of indices i of a for which a[i] != b[i] (b is indexed up to len(a)-1)."""
    mismatches = []
    for i, a_char in enumerate(a):
        if a_char != b[i]:
            mismatches.append(i)
    return mismatches

In [19]:
#A function that adds more lines for all the affected chromosomal positions of each indel
def table_editing(chrom_gene_table):
    """
    Annotate the alterations of chrom_gene_table and build a table of the in-frame indels.

    chrom_gene_table is changed in place:
      - a "comments" column is written, holding "ignore for this position: d-<n>" for
        deletions, "ignore for this position: i-<n>" for insertions, "removed redundant bps"
        for trimmed mismatches, and "" otherwise;
      - mismatches written with redundant bps (ref and alt of equal length > 1 that differ
        only in their first bp) are trimmed: "ref" and "alt" keep the first bp, and
        "amino_acids" / "prot_pos" are cut down accordingly (by strand).

    Returns a new table with the columns of chrom_gene_table that holds, for in-frame indels only:
      - one row per deleted bp of a deletion (alt "-", comments "d-<n>");
      - one row per insertion (ref "-", alt = the inserted bps, comments "i-<n>").
    """
    i_cnt = 0
    d_cnt = 0
    indels_table = pd.DataFrame(columns = chrom_gene_table.columns)
    indels_table_i = 0
    comments_col = []

    for index, line in chrom_gene_table.iterrows():
        ref = line["ref"]
        alt = line["alt"]
        pos = line["pos"]
        strand = line["strand"]

        #Handling deletion
        if (len(ref) > len(alt)):
            d_cnt += 1
            comments_col.append("ignore for this position: d-"+str(d_cnt))
            #Adding to indels table only the inframe ones
            if (is_indel(ref, alt, line) == IN_FRAME_INDEL):
                kept_len = len(alt)
                deletion_len = len(ref) - kept_len
                for j in range(deletion_len):
                    new_line = line.copy(deep=True)
                    new_line["pos"] = pos + kept_len + j
                    #ref[k] sits on position pos+k, so the bp of position pos+kept_len+j is
                    #ref[kept_len+j] (ref[j+1] is only right when alt is a single bp)
                    new_line["ref"] = ref[kept_len + j]
                    new_line["alt"] = "-"
                    new_line["comments"] = "d-"+str(d_cnt)
                    indels_table.loc[indels_table_i] = new_line
                    indels_table_i += 1

        #Handling insertion
        elif (len(alt) > len(ref)):
            i_cnt += 1
            comments_col.append("ignore for this position: i-"+str(i_cnt))

            #Adding to indels table only the inframe ones
            if (is_indel(ref, alt, line) == IN_FRAME_INDEL):
                init_pos = pos + len(ref)
                new_line = line.copy(deep=True)
                if (strand == 1):
                    new_line["pos"] = init_pos
                else:
                    new_line["pos"] = init_pos - 1
                new_line["ref"] = "-"
                new_line["alt"] = alt[len(ref):]
                new_line["comments"] = "i-"+str(i_cnt)
                indels_table.loc[indels_table_i] = new_line
                indels_table_i += 1

        #Handling mismatch written with redundant bps
        elif (len(ref) > 1):
            diff_idx = diff(ref, alt)
            #A case when only the first bp is the alteration
            if (diff_idx == [0]):
                #Fix ref and alt fields (.at replaces the deprecated DataFrame.set_value)
                chrom_gene_table.at[index, "ref"] = ref[0]
                chrom_gene_table.at[index, "alt"] = alt[0]

                #Fix amino_acids field
                aa = line["amino_acids"]
                #If aa field is not empty
                if (aa != ""):
                    slash_at = aa.find("/")
                    if (strand == 1):
                        #Keeping the first aa of each side
                        if (slash_at != -1):
                            new_aa = aa[0] + aa[slash_at:slash_at+2]
                        else:
                            new_aa = aa[0]
                    else:
                        #Keeping the last aa of each side
                        if (slash_at != -1):
                            new_aa = aa[slash_at-1:slash_at+1]+aa[-1]
                        else:
                            new_aa = aa[-1]
                    chrom_gene_table.at[index, "amino_acids"] = new_aa

                #Fix prot_pos field: a "first-last" range is cut down to one of its ends
                prot_pos = line["prot_pos"]
                if (prot_pos != ""):
                    dash_at = prot_pos.find("-")
                    if (dash_at != -1):
                        if (strand == 1):
                            chrom_gene_table.at[index, "prot_pos"] = prot_pos[:dash_at]
                        else:
                            chrom_gene_table.at[index, "prot_pos"] = prot_pos[dash_at+1:]

                comments_col.append("removed redundant bps")
            else:
                comments_col.append("")

        #No Indel
        else:
            comments_col.append("")

    chrom_gene_table["comments"] = comments_col

    return indels_table

In [20]:
def exac_validation_checks(chrom_alter, protein_pos, aa, alt_codon_pos, chrom_pos, bp_ref):
    """
    Compare the ExAC annotation of one alteration with the HMMER / hg19 derived data.

    chrom_alter:   the ExAC entry; its "prot_pos", "amino_acids" and "codons" fields are read
    protein_pos:   protein position computed from the HMMER data
    aa:            reference amino acid computed from the HMMER data
    alt_codon_pos: not used at the moment (the matching check is disabled)
    chrom_pos:     chromosome position, used in the printed messages only
    bp_ref:        reference codon retrieved from hg19

    Returns (exac_prot_data, exac_alt_aa, exac_alt_codon, error_flag):
      exac_prot_data - False when ExAC lists no protein position for the entry
      exac_alt_aa    - the text after "/" in "amino_acids" (the whole field when there is no "/")
      exac_alt_codon - the text after "/" in "codons" (the whole field when there is no "/")
      error_flag     - True when the protein position or the reference amino acid disagree
    When ExAC has no protein position the result is (False, "", "", False): a message is
    printed, nothing else can be validated, and the caller goes on with the alteration.
    A codon mismatch is printed but doesn't set error_flag.
    """
    #For error logging
    functionNameAsString = sys._getframe().f_code.co_name

    error_flag = False

    #Validation: the ExAC chromosome position is within a protein
    if (chrom_alter["prot_pos"] == ""):
        print(functionNameAsString+" Error: ExAC chromosome position "+str(chrom_pos)+" doesn't correspond to a protein")
        #We assume it's an error in ExAC and logging the alteration anyway.
        #A full tuple must be returned here as well: the caller unpacks four values.
        return (False, "", "", error_flag)

    #Validation: the ExAC protein position matches the HMMER protein position
    exac_prot_pos = chrom_alter["prot_pos"]
    dash_at = exac_prot_pos.find("-")
    #in case there's more than one position listed
    if (dash_at != -1):
        first_exac_prot_pos = int(exac_prot_pos[:dash_at])
        last_exac_prot_pos = int(exac_prot_pos[dash_at+1:])
    else:
        first_exac_prot_pos = int(exac_prot_pos)
        last_exac_prot_pos = first_exac_prot_pos
    #Checking if the protein position isn't within the range described by ExAC
    if not(first_exac_prot_pos <= protein_pos <= last_exac_prot_pos):
        print(functionNameAsString+" "+ str(chrom_pos)+" Error: ExAC protein position "+str(first_exac_prot_pos)+" doesn't match HMMER protein position "+str(protein_pos))
        error_flag = True

    #Validation: the ExAC aa matches the HMMER aa
    exac_aa = chrom_alter["amino_acids"]
    aa_slash_at = exac_aa.find("/")
    if (aa_slash_at != -1):
        exac_ref_aa = exac_aa[:aa_slash_at]
    else:
        exac_ref_aa = exac_aa
    exac_alt_aa = exac_aa[aa_slash_at+1:]
    if (exac_ref_aa != aa):
        print(functionNameAsString+" "+ str(chrom_pos)+" Error: ExAC amino acid identity "+exac_ref_aa+" doesn't match HMMER amino-acid "+aa)
        error_flag = True

    #Extracting aa codon data if exist; without a "/" the whole field serves as both
    #ref and alt (same convention as for the amino acids) instead of losing its last letter
    exac_codons = chrom_alter["codons"]
    codon_slash_at = exac_codons.find("/")
    if (codon_slash_at != -1):
        exac_ref_codon = exac_codons[:codon_slash_at]
    else:
        exac_ref_codon = exac_codons
    exac_alt_codon = exac_codons[codon_slash_at+1:]
    #Validation: the ExAC codon matches the returned codon sequence from hg19, or at least one contains the other
    if ((exac_ref_codon.upper().find(bp_ref.upper()) == -1) and (bp_ref.upper().find(exac_ref_codon.upper()) == -1)):
        print(functionNameAsString+" "+ str(chrom_pos)+" Error: ExAC bp codon "+exac_ref_codon.upper()+" doesn't match hg19 codon sequence retrieved "+bp_ref.upper())

    #The comparison of the altered position inside the codon with alt_codon_pos is ignored for now

    return (True, exac_alt_aa, exac_alt_codon, error_flag)
    

In [21]:
#Alphabet used by the JSD functions: the 20 amino acids, and * for termination codon
amino_acids = list("ARNDCQEGHILKMFPSTWYV*")

In [22]:
def JSD_BLOSUM62(alterations_af_dict, aa_ref, maf, rand=False):
    """
    Compute the Jensen-Shannon divergence of the aa frequencies, as described in the dictionary
    with background frequency. Default is Blosum62 background frequencies, but can use
    random if specified

    alterations_af_dict: amino acid -> list of allele frequencies; the list is summed
    aa_ref:              the reference amino acid; it gets the weight (1 - maf) unless it
                         is itself a key of alterations_af_dict
    maf:                 overall frequency of the alterations
    rand:                when True the background is uniform over amino_acids

    Every amino acid gets a pseudo count of 0.00001 and the vector is normalised before
    it is handed, with the background, to JSdiv. The result of JSdiv is returned.
    """
    #Background frequencies of amino acids (random or BLOSUM62 matrix):
    if rand:
        q = [1./len(amino_acids)]*len(amino_acids)
    else:
        q = [0.074, 0.052, 0.045, 0.054, 0.025, 0.034, 0.054, 0.074,
             0.026, 0.068, 0.099, 0.058, 0.025, 0.047, 0.039, 0.057,
             0.051, 0.013, 0.032, 0.073, 0]

    #Create the frequency vector according to the alterations dictionary
    feqs_vector = []
    for aa in amino_acids:
        if aa in alterations_af_dict:
            feqs_vector.append(0.00001 + sum(alterations_af_dict[aa]))
        elif aa == aa_ref:
            feqs_vector.append(0.00001 + (1 - maf))
        else:
            feqs_vector.append(0.00001)

    total = sum(feqs_vector)
    p = [f/total for f in feqs_vector]

    #Floating point sums are compared with a tolerance: the text form of a float sum
    #differs between interpreters and isn't guaranteed to read '1.0'
    assert abs(sum(q) - 1.0) < 1e-9 and abs(sum(p) - 1.0) < 1e-9, "Prob. vectors do not sum to 1"

    return JSdiv(p, q)

In [25]:
def JSD_major_allele(alterations_af_dict, aa_ref, maf, rand=False):
    """
    Compute the Jensen-Shannon divergence between the aa frequencies described in the
    dictionary and a background in which (almost) all the weight is on the major allele.

    alterations_af_dict: amino acid -> list of allele frequencies; the list is summed
    aa_ref:              the major (reference) amino acid; must be a member of amino_acids.
                         It gets the weight (1 - maf) unless it is itself a key of
                         alterations_af_dict
    maf:                 overall frequency of the alterations
    rand:                not used; kept so the signature matches JSD_BLOSUM62

    Every amino acid gets a pseudo count of 0.00001 in both vectors and both are
    normalised before they are handed to JSdiv. The result of JSdiv is returned.
    """
    #Create the background frequency vector according to the major allele
    q = [0.00001]*len(amino_acids)
    q[amino_acids.index(aa_ref)] += 1
    q_total = sum(q)
    q = [f/q_total for f in q]

    #Create the frequency vector according to the alterations dictionary
    feqs_vector = []
    for aa in amino_acids:
        if aa in alterations_af_dict:
            feqs_vector.append(0.00001 + sum(alterations_af_dict[aa]))
        elif aa == aa_ref:
            feqs_vector.append(0.00001 + (1 - maf))
        else:
            feqs_vector.append(0.00001)

    p_total = sum(feqs_vector)
    p = [f/p_total for f in feqs_vector]

    #Floating point sums are compared with a tolerance: the text form of a float sum
    #differs between interpreters and isn't guaranteed to read '1.0'
    assert abs(sum(q) - 1.0) < 1e-9 and abs(sum(p) - 1.0) < 1e-9, "Prob. vectors do not sum to 1"

    return JSdiv(p, q)

In [144]:
#A function that return a dict with the MAF info for the protein position and corresponding chromosomal location
def calc_exac_maf_data(chrom_pos_list, chrom_gene_table, indels_table, protein_pos, aa, chrom_raw_data, chrom):
    """
    Collect the ExAC alterations of one codon and summarise them.

    chrom_pos_list:   chromosome positions of the codon; the index inside the list is used
                      as the position of the alteration inside the codon
    chrom_gene_table: ExAC entries; filtered here by its "pos" column
    indels_table:     not read inside this function
    protein_pos, aa:  protein position and reference amino acid from the HMMER data
    chrom_raw_data:   raw location text (handed to retrieve_codon_seq and create_alt_codon)
    chrom:            chromosome name

    Entries are skipped (and counted) when their "filter" isn't "PASS", when they are
    frameshift indels, or when exac_validation_checks reports an error. In-frame indels
    and synonymous alterations are skipped as well.

    Returns (res_dict, frameshift_cnt, errors_cnt, filter_cnt, aa_counts, aa_adj_counts):
      res_dict holds "chrom", "chrom_pos", "prot_pos", "aa_ref", "bp_ref", "af", "af_adj",
      "JSD" and "JSD_adj"; aa_counts / aa_adj_counts map an amino acid to a summed AC / AC_Adj,
      with the reference amino acid receiving the averaged AN minus the other counts.
    """
    
    res_dict = {}
    res_dict["chrom"] = chrom
    res_dict["chrom_pos"] = chrom_pos_list
    res_dict["prot_pos"] = protein_pos
    res_dict["aa_ref"] = aa
    res_dict["bp_ref"] = retrieve_codon_seq(chrom_pos_list, chrom_raw_data, chrom)
    frameshift_cnt = 0
    errors_cnt = 0
    filter_cnt = 0
    #Ids of the in-frame indels met below; collected but not read again in this function
    inframe_ids = []
    
    #For error logging
    functionNameAsString = sys._getframe().f_code.co_name
    
    #Validation: checking that the returned codon sequence from hg19 match the HMMER amino-acid
    translated_aa = codon_table[(res_dict["bp_ref"]).upper()]
    if (translated_aa != aa):
        print functionNameAsString+" Error: hg19 codon sequence retrieved "+(res_dict["bp_ref"]).upper()+"="+translated_aa+" doesn't match HMMER amino-acid "+aa
    
    #Save the AF of each aa
    alterations_af_dict = defaultdict(list)
    alterations_af_adj_dict = defaultdict(list)
    
    #Save the number of people for each aa
    aa_counts = defaultdict(int)
    aa_adj_counts = defaultdict(int)
    an_list = []
    an_adj_list = []
    
    for i in range(len(chrom_pos_list)):
        chrom_pos = chrom_pos_list[i]
        alt_codon_pos = i
            
        #Retreiving relevant ExAC entry
        chrom_alter_table = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]
        chrom_alter_table = chrom_alter_table.reset_index(drop=True)
                
        if (chrom_alter_table.shape[0] == 0):
            #No ExAC entry for this chromosome position - not adding alteration data
            continue
        
        else:
            #In case there are several alterations for that position, iterating
            for index, line in chrom_alter_table.iterrows():
                chrom_alter = line
                
                #Filtering out according to filter field
                exac_filter = chrom_alter["filter"]
                if (exac_filter != "PASS"):
                    filter_cnt += 1
                    continue
                
                #Extracting ref and alt
                exac_ref_bp = chrom_alter["ref"]
                exac_alt_bp = chrom_alter["alt"]
                
                #Check if frameshift indel - skip (we assume the whole protein may not function and don't add those to the MAF count)
                if (is_indel(exac_ref_bp, exac_alt_bp, chrom_alter) == FRAME_SHIFT_INDEL):
                    frameshift_cnt += 1
                    continue
                    
                #Check if inframe indel - take data from the indels table later
                elif (is_indel(exac_ref_bp, exac_alt_bp, chrom_alter) == IN_FRAME_INDEL):
                    #The id is the part of the "comments" field from its first "-" onwards
                    indel_id = chrom_alter["comments"][chrom_alter["comments"].find("-"):]
                    inframe_ids.append(indel_id)
                    continue     
                
                #Perform validation checks (comparing ExAC and HMMER data)
                #NOTE(review): the result is unpacked into four names, so exac_validation_checks must return a 4-tuple on every path - confirm
                (exac_prot_data, exac_alt_aa, exac_alt_codon, errors) = exac_validation_checks(chrom_alter, protein_pos, aa, alt_codon_pos, chrom_pos, res_dict["bp_ref"])
                if (errors):
                    errors_cnt += 1
                    #Skipping if there are validation errors
                    continue

                #Extracting ExAC allele frequency data
                af = chrom_alter["AF"]
                an = int(chrom_alter["AN"])
                ac = int(chrom_alter["AC"])
                an_adj = int(chrom_alter["AN_Adj"])
                ac_adj = int(chrom_alter["AC_Adj"])
                
                #Summing AN from all the codon positions
                #(the AN is recorded before the alt codon is built, so it is kept also for entries that are skipped further down)
                an_list.append(an)
                an_adj_list.append(an_adj)
                
                #Calculating the alteration relevant data
                alt_codon = create_alt_codon(exac_ref_bp, exac_alt_bp, res_dict["bp_ref"], alt_codon_pos, chrom_raw_data)
                if (len(alt_codon) != 3):
                    alt_aa = "indel"
                    continue #TODO: handle inframe indels
                else:
                    alt_aa = codon_table[alt_codon.upper()]

                #Validation: ExAC alt codon and aa for the first alteration match the calculated alt
                #if (exac_prot_data and exac_alt_codon.upper() != alt_codon):
                    #print functionNameAsString+" "+ str(chrom_pos)+" Error: the ExAC alt codon "+exac_alt_codon.upper()+" doesn't match my alt codon calculation "+alt_codon
                if (exac_prot_data and exac_alt_aa != alt_aa):
                    print functionNameAsString+" "+ str(chrom_pos)+" Error: the ExAC alt aa "+exac_alt_aa+" doesn't match my alt aa calculation "+alt_aa

                if (alt_aa == res_dict["aa_ref"]):
                    #Not logging alteration for synonymous mutations
                    continue

                #Non-synonymous(!!!) - logging the alteration
                else:
                    alterations_af_dict[alt_aa].append(float(af))
                    #The adjusted AF is computed from the adjusted counts and rounded to 4 significant digits
                    af_adj = float(ac_adj)/float(an_adj)
                    af_adj_format = float('{:.3e}'.format(float(af_adj)))
                    alterations_af_adj_dict[alt_aa].append(af_adj_format)
                    
                    #Counting the number of people with this aa
                    aa_counts[alt_aa] += ac
                    aa_adj_counts[alt_aa] += ac_adj
    
    #Averaging the AN in case there is more than one alteration
    #NOTE(review): an_list stays empty when no entry reaches the AN extraction above; averaging an empty list presumably gives nan, which int() can't convert - confirm the callers never hit this case
    an_avg = int(np.round(np.average(an_list)))
    an_adj_avg = int(np.round(np.average(an_adj_list)))
    
    #Adding the major allele to the aa counts
    aa_counts[res_dict["aa_ref"]] = (an_avg - sum(aa_counts.values()))
    aa_adj_counts[res_dict["aa_ref"]] = (an_adj_avg - sum(aa_adj_counts.values()))

    #Calculating the overall MAF from the alteration dicts
    res_dict["af"] = 0
    res_dict["af_adj"] = 0
    
    for aa in alterations_af_dict.keys():
        aa_sum = sum(alterations_af_dict[aa])
        aa_adj_sum = sum(alterations_af_adj_dict[aa])
        
        #Checking if any alteration is above 0.5, and changing the ref accordingly
        if (aa != "indel" and aa_sum > 0.5):
            
            #Adding the refrence allele to the alterations dicts
            old_ref = res_dict["aa_ref"]
            #sum(list_of_lists, []) flattens the per-aa lists before they are summed
            sum_of_all_alt = sum(sum(alterations_af_dict.values(), []))
            sum_of_all_alt_adj = sum(sum(alterations_af_adj_dict.values(), []))
            alterations_af_dict[old_ref] = [1 - sum_of_all_alt]
            alterations_af_adj_dict[old_ref] = [1 - sum_of_all_alt_adj]
            
            #Updating the aa to be the ref
            res_dict["aa_ref"] = aa
            res_dict["af"] =(1 - aa_sum)
            res_dict["af_adj"] = (1 - aa_adj_sum)
            
            #Deleting from the alterations dicts
            del alterations_af_dict[aa]
            del alterations_af_adj_dict[aa]
            #The loop stops here, so the AF formatting below isn't applied to these values
            break
        else:
            res_dict["af"] += aa_sum
            res_dict["af_adj"] += aa_adj_sum
        
        #Fix the AF format
        res_dict["af"] = float('{:.3e}'.format(float(res_dict["af"])))
        res_dict["af_adj"] = float('{:.3e}'.format(float(res_dict["af_adj"])))
        
    #Calculating the overall Jensen-Shannon Divergrence
    if (len(alterations_af_dict.keys()) == 0):
        res_dict["JSD"] = 0
        res_dict["JSD_adj"] = 0
    else:
        res_dict["JSD"] = JSD_major_allele(alterations_af_dict, res_dict["aa_ref"], res_dict["af"])
        res_dict["JSD_adj"] = JSD_major_allele(alterations_af_adj_dict, res_dict["aa_ref"], res_dict["af_adj"])
        
    return (res_dict, frameshift_cnt, errors_cnt, filter_cnt, aa_counts, aa_adj_counts)

In [158]:
#Scratch check of the int(np.round(...)) idiom that calc_exac_maf_data applies to the averaged AN
int(np.round(3.53))

4

In [44]:
#Folder and filename prefix of the parsed ExAC tables (read below as chrom_path+chrom_filename+chrom+".csv")
chrom_path = curr_dir[0]+"/parsed/"
chrom_filename = "parsed_chrom"
#Result containers
states_dict = defaultdict(list)
states_aa_dict = {}
states_aa_adj_dict = {}

print "Starting...."

#For error logging
#NOTE(review): this line runs at cell level and not inside a function, so co_name is presumably "<module>" rather than a function name - confirm this is the wanted log prefix
functionNameAsString = sys._getframe().f_code.co_name

#A list of all the ens genes
domain_ens_genes_all = []

#A list to count frameshifts per gene
domain_ens_genes_frameshifts = []

#A list to count validation errors per gene
domain_ens_genes_errors = []

#A list to count ExAC filtered-out per gene
domain_ens_genes_filter = []

for chrom in chromosome_names:
    
    #Filtering the domain data relevant to this chromosome
    domain_chrom_data = sorted_domain_data[sorted_domain_data["chrom_num"] == chrom]
    
    #Loading the ExAC parsed data of this chromosome
    fields = ['chrom', 'pos', 'ref', 'alt', "filter", 'AC', 'AC_Adj', 'AF', 'AN', 'AN_Adj', 'gene', 'feature', 
              'feature_type', 'conseq', 'prot_pos', 'amino_acids', 'codons', 'strand', 'ENSP', 'exon', 
              'intron', 'domains']
    chrom_csv = pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0, usecols=fields)
    chrom_csv = chrom_csv.sort_values(by=["pos"])
    chrom_csv = chrom_csv.reset_index(drop=True)
    chrom_csv.fillna('', inplace=True)
    chrom_csv["comments"] = ""
    
    #Getting a list of all the relevant ensembl gene ids for this chromosome
    domain_ens_genes = (domain_chrom_data["ensembl_id"]).unique()
    domain_ens_genes_all.extend(domain_ens_genes)
    
    #For each ensembl gene in the domain data - finding all the ExAC alterations
    for ens_gene in domain_ens_genes:
        
        #Filtering the domain data for this gene according to the canonical protein id
        canonic_prot = canonic_protein[ens_gene]
        domain_gene_table = domain_chrom_data[domain_chrom_data["prot_id"] == canonic_prot]
        #Making sure that if two HMM-matches overlaps, the higher bit score will come first in the table
        domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)
        domain_gene_name = domain_gene_table["Hugo_symbol"].unique()[0]
        if (len(domain_gene_table["Hugo_symbol"].unique()) > 1):
            print functionNameAsString+" Error: "+ens_gene+": more than one Hugo symbol" #sanity check
        
        #Getting the chosen protein transcript id for the ExAC filtering
        #canonic_transcript_id = domain_gene_table["transcript_id"].tolist()[0]
        
        #Creating a table of the exons for this gene, according to the canonical protein
        chrom_raw_data = domain_gene_table["chromosome_id"].unique()[0] #there should be only one element here
        if (len(domain_gene_table["chromosome_id"].unique()) > 1):
            print functionNameAsString+" Error: "+ens_gene+": more than one chromosome raw data" #sanity check
        targetid = domain_gene_table["#TargetID"].unique()[0]
        exon_table = create_exon_pos_table(chrom_raw_data, targetid)
        
        #Filtering the chromosome data to the gene exons region
        exons_start_pos = min(exon_table["start_pos"][0],exon_table["start_pos"][len(exon_table)-1]) #in case of complelemt, the minimal position could be at the last row
        exons_end_pos = max(exon_table["end_pos"][0],exon_table["end_pos"][len(exon_table)-1]) #in case of complelemt, the maximal position could be at the first row
        chrom_gene_table = chrom_csv[chrom_csv["pos"] >= int(exons_start_pos)][chrom_csv["pos"] <= int(exons_end_pos)][chrom_csv["ENSP"] == canonic_prot]
        chrom_gene_table = chrom_gene_table.reset_index(drop=True)
        
        #Handling indels
        indels_table = table_editing(chrom_gene_table)
        
        #A counter for frameshifts inside the domain
        protein_frameshifts_cnt = 0
        #A counter for validation errors inside the domain
        protein_errors_cnt = 0
        #A counter for ExAC filter-out inside the domain
        protein_filter_cnt = 0
        
        #Iterating over the amino-acids of the protein
        prot_len = domain_gene_table["length"].unique()[0]
        for protein_pos in range(1,prot_len+1):
    
            #Trying to match HMM-state, and retreive the aa from HMMER results
            (hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table) #TODO: what happens when two matches overlap? maybe sort to the best bit score?
                
            #If there's a match to HMM-state: find the corresponding codon bps chromosome positions
            if (hmm_state > 0):
                chrom_pos_list =find_chrom_bps(protein_pos, exon_table, chrom_raw_data)
                
                #Analysis of the amino-acid MAF and realted data, returned in a dictionary
                (info_dict, frameshift_cnt, errors_cnt, filter_cnt, aa_counts, aa_adj_counts) = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, indels_table, protein_pos, aa, chrom_raw_data, chrom)
                info_dict["ens_gene"] = ens_gene
                
                #Adding the dictionary to the HMM-state list
                states_dict[hmm_state].append(info_dict)
                
                #Adding the aa counts to the aa dictionary
                if (hmm_state not in states_aa_dict.keys()):
                    states_aa_dict[hmm_state] = array('l', [0] * len(amino_acids))
                    states_aa_adj_dict[hmm_state] = array('l', [0] * len(amino_acids))
                
                for aa in aa_counts.keys():
                    states_aa_dict[hmm_state][amino_acids.index(aa)] += aa_counts[aa]
                    states_aa_adj_dict[hmm_state][amino_acids.index(aa)] += aa_adj_counts[aa]
                
                #Adding the frameshifts to the global counter
                protein_frameshifts_cnt += frameshift_cnt
                
                #Adding the errors to the global counter
                protein_errors_cnt += errors_cnt
                
                #Adding the filtered to the global counter
                protein_filter_cnt += filter_cnt
        
        domain_ens_genes_frameshifts.append(protein_frameshifts_cnt)
        domain_ens_genes_errors.append(protein_errors_cnt)
        domain_ens_genes_filter.append(protein_filter_cnt)
        print "Finished protein "+ens_gene
                                
    print "Finished chromosome "+chrom

with open(my_path+'zinc_hmm_states_dict_filter_9.7.pik', 'wb') as handle:
    pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

Starting....
Finished protein ENSG00000116809
Finished protein ENSG00000117010
Finished protein ENSG00000121903
Finished protein ENSG00000125945
Finished protein ENSG00000127124
Finished protein ENSG00000135747
Finished protein ENSG00000142611
Finished protein ENSG00000143067
Finished protein ENSG00000160094
Finished protein ENSG00000160685
Finished protein ENSG00000162676
Finished protein ENSG00000164011
Finished protein ENSG00000171161
Finished protein ENSG00000176083
Finished protein ENSG00000179930
Finished protein ENSG00000181450
alterations_af_dict = 
defaultdict(<type 'list'>, {'V': [0.0030000000000000027]})
AF = 
0.003
***
alterations_af_adj_dict = 
defaultdict(<type 'list'>, {'V': [0.0029000000000000137]})
Adj AF = 
0.0029
***
Finished protein ENSG00000184677
Finished protein ENSG00000185278
Finished protein ENSG00000187801
Finished protein ENSG00000187815
Finished protein ENSG00000188295
Finished protein ENSG00000196418
Finished protein ENSG00000197472
Finished protein ENSG00

In [45]:
#Total number of validation errors, summed over the per-gene counters filled by the run above
sum(domain_ens_genes_errors)

0

In [46]:
#Total number of frameshifts, summed over the per-gene counters filled by the run above
sum(domain_ens_genes_frameshifts)

1360

In [47]:
#Total number of ExAC filtered-out entries, summed over the per-gene counters filled by the run above
sum(domain_ens_genes_filter)

5056

In [30]:
#Writing the repr of a previous cell output to output.py
#NOTE(review): _25 refers to the kernel output history, so this line fails on a fresh kernel - confirm which value was meant (the stored output shows 1360 was written)
%save "output" repr(_25)

The following commands were written to file `output.py`:
1360


### Debugging code

In [147]:
#Debugging setup: the same preparation as the full run, limited to one hard-coded chromosome
chrom = "10"
chrom_filename = "parsed_chrom"
chrom_path = curr_dir[0]+"/parsed/"

#Per HMM-state results: info dictionaries, amino-acid counts and adjusted amino-acid counts
states_dict = defaultdict(list)
states_aa_dict, states_aa_adj_dict = {}, {}
print("Starting....")

#Name used as the prefix of the error messages
functionNameAsString = sys._getframe().f_code.co_name

#All the ens genes, and the per-gene counters of frameshifts, validation errors and ExAC filtered-out
domain_ens_genes_all = []
domain_ens_genes_frameshifts = []
domain_ens_genes_errors = []
domain_ens_genes_filter = []

#Keeping only the domain rows of the chosen chromosome
is_chosen_chrom = sorted_domain_data["chrom_num"] == chrom
domain_chrom_data = sorted_domain_data[is_chosen_chrom]

#Reading the parsed ExAC table of the chromosome: ordered by position, fresh index,
#empty strings for missing values and an empty comments column
fields = ['chrom', 'pos', 'ref', 'alt', "filter", 'AC', 'AC_Adj', 'AF', 'AN', 'AN_Adj', 'gene', 'feature', 
          'feature_type', 'conseq', 'prot_pos', 'amino_acids', 'codons', 'strand', 'ENSP', 'exon', 
          'intron', 'domains']
chrom_csv = (
    pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0, usecols=fields)
    .sort_values(by=["pos"])
    .reset_index(drop=True)
    .fillna('')
)
chrom_csv["comments"] = ""

#The distinct ensembl gene ids that have domain matches on this chromosome
domain_ens_genes = domain_chrom_data["ensembl_id"].unique()

Starting....


In [141]:
#Inspecting the ensembl gene ids that have domain matches on the chromosome selected above
domain_ens_genes

array(['ENSG00000075407', 'ENSG00000122877', 'ENSG00000148516',
       'ENSG00000165512', 'ENSG00000169740', 'ENSG00000175395',
       'ENSG00000189180', 'ENSG00000196693', 'ENSG00000196793',
       'ENSG00000198105', 'ENSG00000198298'], dtype=object)

In [148]:
#Debugging run: the per-gene part of the full analysis, for one hard-coded gene of the chromosome loaded above
ens_gene = "ENSG00000196793"

#Filtering the domain data for this gene according to the canonical protein id
canonic_prot = canonic_protein[ens_gene]
domain_gene_table = domain_chrom_data[domain_chrom_data["prot_id"] == canonic_prot]
#Making sure that if two HMM-matches overlap, the higher bit score will come first in the table
domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)

#Getting the chosen protein transcript id for the ExAC filtering
#canonic_transcript_id = domain_gene_table["transcript_id"].tolist()[0]

#Creating a table of the exons for this gene, according to the canonical protein
chrom_raw_ids = domain_gene_table["chromosome_id"].unique()
chrom_raw_data = chrom_raw_ids[0] #there should be only one element here
if (len(chrom_raw_ids) > 1):
    print(functionNameAsString+" Error: "+ens_gene+": more than one chromosome raw data") #sanity check
targetid = domain_gene_table["#TargetID"].unique()[0]
exon_table = create_exon_pos_table(chrom_raw_data, targetid)

#Filtering the chromosome data to the gene exons region
last_exon_row = len(exon_table)-1
exons_start_pos = min(exon_table["start_pos"][0],exon_table["start_pos"][last_exon_row]) #in case of complement, the minimal position could be at the last row
exons_end_pos = max(exon_table["end_pos"][0],exon_table["end_pos"][last_exon_row]) #in case of complement, the maximal position could be at the first row
#One combined mask instead of three chained selections: the chained form reindexes the
#full-length masks against already-filtered frames (pandas warns about it) and copies the data three times
in_gene_region = ((chrom_csv["pos"] >= int(exons_start_pos)) &
                  (chrom_csv["pos"] <= int(exons_end_pos)) &
                  (chrom_csv["ENSP"] == canonic_prot))
chrom_gene_table = chrom_csv[in_gene_region].reset_index(drop=True)

#Handling indels
indels_table = table_editing(chrom_gene_table)

#A counter for frameshifts inside the domain
protein_frameshifts_cnt = 0
#A counter for validation errors inside the domain
protein_errors_cnt = 0
#A counter for ExAC filter-out inside the domain
protein_filter_cnt = 0

#Iterating over the amino-acids of the protein
prot_len = domain_gene_table["length"].unique()[0]
for protein_pos in range(1,prot_len+1):

    #Trying to match HMM-state, and retrieve the aa from HMMER results
    #TODO: confirm how overlapping matches are resolved; the table is sorted above so the higher bit score comes first
    (hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table)

    #If there's a match to HMM-state: find the corresponding codon bps chromosome positions
    if (hmm_state > 0):
        chrom_pos_list = find_chrom_bps(protein_pos, exon_table, chrom_raw_data)

        #Analysis of the amino-acid MAF and related data, returned in a dictionary
        (info_dict, frameshift_cnt, errors_cnt, filter_cnt, aa_counts, aa_adj_counts) = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, indels_table, protein_pos, aa, chrom_raw_data, chrom)
        info_dict["ens_gene"] = ens_gene
                
        #Adding the dictionary to the HMM-state list
        states_dict[hmm_state].append(info_dict)
        
        #Adding the aa counts to the aa dictionary (the arrays are created the first time a state is seen)
        #NOTE(review): the full run over all the chromosomes creates these arrays with typecode 'l' instead of 'f' - confirm which one is intended
        if (hmm_state not in states_aa_dict):
            states_aa_dict[hmm_state] = array('f', [0] * len(amino_acids))
            states_aa_adj_dict[hmm_state] = array('f', [0] * len(amino_acids))

        #A dedicated loop variable, so the aa returned by the HMM lookup above is not overwritten
        for counted_aa in aa_counts.keys():
            aa_idx = amino_acids.index(counted_aa)
            states_aa_dict[hmm_state][aa_idx] += aa_counts[counted_aa]
            states_aa_adj_dict[hmm_state][aa_idx] += aa_adj_counts[counted_aa]

        #Adding the frameshifts to the global counter
        protein_frameshifts_cnt += frameshift_cnt

        #Adding the errors to the global counter
        protein_errors_cnt += errors_cnt
        
        #Adding the filtered to the global counter
        protein_filter_cnt += filter_cnt
        
domain_ens_genes_frameshifts.append(protein_frameshifts_cnt)
domain_ens_genes_errors.append(protein_errors_cnt)
domain_ens_genes_filter.append(protein_filter_cnt)
print("Finished protein "+ens_gene)

Finished protein ENSG00000196793


In [153]:
#Inspecting the per-HMM-state amino-acid count arrays
#NOTE(review): the stored output contains nan entries, so some of the added aa counts were presumably NaN - worth checking where they come from
states_aa_dict

{1: array('f', [0.0, 0.0, 1.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0, 0.0, nan, 0.0, 0.0]),
 2: array('f', [0.0, 2.0, 2.0, 0.0, 0.0, nan, nan, 0.0, 106198.0, 0.0, 0.0, 424198.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
 3: array('f', [0.0, 0.0, 0.0, 0.0, nan, 0.0, 0.0, 59055.0, 0.0, 0.0, 0.0, 0.0, 0.0, 16.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]),
 4: array('f', [0.0, 0.0, 12.0, 212187.0, 80.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, nan, 0.0, 0.0, 106119.0, 0.0, 0.0]),
 5: array('f', [0.0, 24.0, 0.0, 1.0, 0.0, nan, nan, 0.0, 0.0, 0.0, 0.0, nan, 0.0, 0.0, 0.0, 0.0, 7.0, 0.0, 0.0, 0.0, 0.0]),
 6: array('f', [0.0, 1.0, 0.0, 0.0, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0]),
 7: array('f', [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
 8: array('f', [0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, nan, 106198.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [52]:
#Inspecting the exon positions table built by create_exon_pos_table
#NOTE(review): the stored output comes from an earlier execution (In [52]) than the debugging cells above - re-run before relying on it
exon_table

Unnamed: 0,start_pos,end_pos,length,first_bp_count
0,90611538,90611827,290,1
1,90616303,90616367,65,291
2,90617611,90617737,127,356


In [65]:
#Inspecting the ExAC rows that fall inside the gene exons region
#NOTE(review): the stored output lists gene ENSG00000067646, not the gene selected in the debugging cell above - the output looks stale, re-run before relying on it
chrom_gene_table

Unnamed: 0,pos,ref,alt,filter,AC,AC_Adj,AF,AN,AN_Adj,gene,...,conseq,prot_pos,amino_acids,codons,strand,ENSP,exon,intron,domains,comments
0,2822001,G,C,VQSRTrancheSNP99.95to100.00,28,0,0.000436,64300,29724,ENSG00000067646,...,missense_variant,8,L/F,ttG/ttC,1,ENSP00000155093,2/8,,,
1,2822010,A,G,VQSRTrancheSNP99.60to99.80,4,2,0.000062,64034,29803,ENSG00000067646,...,synonymous_variant,11,Q,caA/caG,1,ENSP00000155093,2/8,,,
2,2822046,T,C,PASS,2,1,0.000032,62478,29044,ENSG00000067646,...,splice_region_variant&intron_variant,,,,1,ENSP00000155093,,2/7,,
3,2822065,C,G,PASS,4,2,0.000065,61960,28329,ENSG00000067646,...,intron_variant,,,,1,ENSP00000155093,,2/7,,
4,2822077,C,T,PASS,2,1,0.000032,61812,27715,ENSG00000067646,...,intron_variant,,,,1,ENSP00000155093,,2/7,,
5,2822087,A,T,PASS,2,1,0.000032,61794,27276,ENSG00000067646,...,intron_variant,,,,1,ENSP00000155093,,2/7,,
6,2829082,T,C,PASS,2,1,0.000032,62564,28277,ENSG00000067646,...,intron_variant,,,,1,ENSP00000155093,,2/7,,
7,2829086,AT,A,VQSRTrancheINDEL96.00to97.00,2,1,0.000032,62640,28506,ENSG00000067646,...,intron_variant&feature_truncation,,,,1,ENSP00000155093,,2/7,,ignore for this position: d-1
8,2829226,A,T,PASS,2,1,0.000030,67526,30129,ENSG00000067646,...,missense_variant,58,D/V,gAt/gTt,1,ENSP00000155093,3/8,,,
9,2829239,A,G,VQSRTrancheSNP99.60to99.80,2,1,0.000029,67808,30134,ENSG00000067646,...,synonymous_variant,62,S,tcA/tcG,1,ENSP00000155093,3/8,,,
