In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt
import pickle
import sys
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Reading the Zinc-finger domain data
curr_dir = !pwd
my_path = curr_dir[0]+"/hmm_domains/"
filename = "zf-C2H2.csv"
zinc_finger = pd.read_csv(my_path+filename, sep='\t', index_col=0)
#Sort the zinc finger data
sorted_zinc = zinc_finger.sort_values(by=["chrom_num", "ensembl_id", "TargetStart"])
sorted_zinc = sorted_zinc.reset_index(drop=True)

In [3]:
#Get the canonic protein id for Zinc domain
with open(my_path+'zinc_canonic_prot.pik', 'rb') as handle:
    canonic_protein = pickle.load(handle)

In [4]:
chromosome_names = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y"]

In [5]:
chrom_names_list = sorted_zinc["chrom_num"].tolist()
starnge_chrom_sum = 0
for name in chrom_names_list:
    if (name not in chromosome_names):
        starnge_chrom_sum += 1
print "Leaving outside "+str(starnge_chrom_sum)+" out of "+str(len(chrom_names_list))

Leaving outside 74 out of 10492


In [6]:
#A function that get chromosome raw data from the hmmer results and return a table of the exons
def create_exon_pos_table(chrom_raw):
    exons_raw = chrom_raw
    
    #Removing the complement bracates if exist
    if (exons_raw.find("complement(") >= 0):
        exons_raw = exons_raw[exons_raw.find("complement(")+11:-1]
    
    #Removing the join bracates if exist
    if (exons_raw.find("join(") >= 0):
        exons_raw = exons_raw[exons_raw.find("join(")+5:-1]
        
    #In case there's only one exon, take everything after the second ":"
    else:
        exons_raw = exons_raw[exons_raw.find(":", chrom_raw.find(":")+1)+1:]
    
    exons_list = exons_raw.split(",")
    exon_pos = []
    for ex in exons_list:
        exon_pos.append(ex.split(".."))
    exon_df = pd.DataFrame(exon_pos)
    exon_df.columns = ["start_pos", "end_pos"]
    exon_len = []
    for index, exon in exon_df.iterrows():
        exon_len.append(int(exon[1]) - int(exon[0])+1)
    exon_df["length"] = exon_len
    first_bp_count = 1
    first_bp_list = []
    for index, exon in exon_df.iterrows():
        first_bp_list.append(first_bp_count)
        first_bp_count += int(exon[2])
    exon_df["first_bp_count"] = first_bp_list
    return(exon_df)

In [7]:
#A function that get chromosome position and table of exons, and return the protein position or -1 if it's not within any exon
def find_protein_pos(chrom_pos, exon_df, chrom_raw):
    for index, exon in exon_df.iterrows():
        start_pos = int(exon[0])
        end_pos = int(exon[1])
        first_bp_count = int(exon[3])
        if (chrom_pos >= start_pos and chrom_pos <= end_pos):
            
            #Calculate position for reverse complement strand: the protein is translated from the end position towards the start position of the exon
            if (chrom_raw.find("complement") >= 0):
                len_from_exon_start = end_pos - chrom_pos
            #Calculate position for forward starnd
            else:
                len_from_exon_start = chrom_pos - start_pos
            
            #Calculate the position on the mRNA transcript
            transcript_pos = len_from_exon_start + first_bp_count
            
            #Calculate the position on the protein sequence
            protein_pos = int(math.ceil(float(transcript_pos)/3))
            return protein_pos
    
    #If the position wasn't in the regions of any exon
    return -1

In [8]:
#A function that get protein position and table of exons, and return the chromosome positions of the corresponding codon
def find_chrom_bps(protein_pos, exon_table, chrom_raw_data):
    
    #calculate the mRNA transcript index of this protein position (the 1st bp in the triplet)
    transcript_pos = (protein_pos*3)-2
    
    #Iterating over all the gene exons
    for index, exon in exon_table.iterrows():  
        first_bp_count = int(exon["first_bp_count"])
        exon_length = int(exon["length"])
        last_bp_count = first_bp_count + exon_length - 1
        
        #Checking if the transcript position is within this exon
        if (first_bp_count <= transcript_pos and transcript_pos <= last_bp_count):
            
            start_pos = int(exon["start_pos"])
            end_pos = int(exon["end_pos"])
            
            len_from_exon_start = transcript_pos - first_bp_count
            
            #Calculate bps position for reverse complement strand: the protein is translated from the end position towards the start position of the exon
            if (chrom_raw_data.find("complement") >= 0):
                chrom_pos_1st = end_pos - len_from_exon_start
                
                chrom_pos_2nd = chrom_pos_1st - 1
                #If the exons ends here: move to the next exon
                if (chrom_pos_2nd < start_pos):
                    index += 1
                    chrom_pos_2nd = int(exon_table["end_pos"][index])
                    start_pos = int(exon_table["start_pos"][index])
                    end_pos = int(exon_table["end_pos"][index])
                
                #If the exons ends here: move to the next exon
                chrom_pos_3rd = chrom_pos_2nd - 1
                if (chrom_pos_3rd < start_pos):
                    index += 1
                    chrom_pos_3rd = int(exon_table["end_pos"][index])
                    
            #Calculate position for forward strand
            else:
                chrom_pos_1st = start_pos + len_from_exon_start
                
                chrom_pos_2nd = chrom_pos_1st + 1
                 #If the exons ends here: move to the next exon
                if (chrom_pos_2nd > end_pos):
                    index += 1
                    chrom_pos_2nd = int(exon_table["start_pos"][index])
                    start_pos = int(exon_table["start_pos"][index])
                    end_pos = int(exon_table["end_pos"][index])
                
                #If the exons ends here: move to the next exon
                chrom_pos_3rd = chrom_pos_2nd + 1
                if (chrom_pos_3rd > end_pos):
                    index += 1
                    chrom_pos_3rd = int(exon_table["start_pos"][index])
            
            return (chrom_pos_1st, chrom_pos_2nd, chrom_pos_3rd)
        

In [9]:
#Boolean function - determine if a given text can be converted to a number
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

In [10]:
#A function that return the hmm state of that protein position
# return -1 for positions outside of domains regions, -2 for matching insertion
#TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
def protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table):
    for index, row in domain_gene_table.iterrows():
        target_start = row["TargetStart"]
        target_end = row["TargetEnd"]
        
        #Check if the position is inside this domain instance of the gene
        if (protein_pos >= target_start and protein_pos <= target_end):
            
            #Find the HMM match state
            hmm_pos = (row["HMM_Pos"]).split(",")
            index_inside_match = (protein_pos - target_start)
            hmm_state_text = hmm_pos[index_inside_match]
            if is_number(hmm_state_text):
                hmm_state = int(hmm_state_text)
            else:
                #the position match insertion
                hmm_state = -2
                
            #Find the amino acid
            target_seq = list(row["Target_Seq"])
            aa = (target_seq[index_inside_match]).upper()
            
            return(hmm_state, aa)
            
    #The protein position isn't in any domain region        
    return (-1,'-')

In [25]:
#A function that create the new codon for the alteration
def create_alt_codon(exac_ref_bp, curr_alt_bp, ref_codon, alt_codon_pos, chrom_raw_data):
    
    #For error logging
    functionNameAsString = sys._getframe().f_code.co_name
    
    #Complement strand - transversing the bp to base-complement
    if (chrom_raw_data.find("complement") >= 0):
        new_bp = ""
        for c in curr_alt_bp:
            if (c == 'A'):
                new_bp = new_bp+'T'
            elif (c == 'T'):
                new_bp = new_bp+'A'
            elif (c == 'G'):
                new_bp = new_bp+'C'
            else:
                new_bp = new_bp+'G'
        
        exac_ref_bp_adj = ""
        for c in exac_ref_bp:
            if (c == 'A'):
                exac_ref_bp_adj = exac_ref_bp_adj+'T'
            elif (c == 'T'):
                exac_ref_bp_adj = exac_ref_bp_adj+'A'
            elif (c == 'G'):
                exac_ref_bp_adj = exac_ref_bp_adj+'C'
            else:
                exac_ref_bp_adj = exac_ref_bp_adj+'G'
        
    #Regular starnd
    else:
        new_bp = curr_alt_bp
        exac_ref_bp_adj = exac_ref_bp
        
    #Validation: making sure the ref bp from ExAC is indeed inside the ref codon sequence retrieved from hg19
    if (ref_codon.find(exac_ref_bp_adj) == -1):
        print functionNameAsString+" Error: ExAC ref sequence "+exac_ref_bp_adj+" isn't found in hg19 retrieved codon sequence "+ref_codon
        
    new_alt_codon = ref_codon[:alt_codon_pos]+new_bp+ref_codon[alt_codon_pos+len(exac_ref_bp_adj):]
    #new_alt_codon = ref_codon[:alt_codon_pos]+new_bp+ref_codon[alt_codon_pos+1:]
    
    return new_alt_codon

In [12]:
codon_table = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }

In [62]:
#Retrieve the codon base-parirs from the ref sequence
def retrieve_codon_seq(chrom_pos_list, chrom_raw_data, chrom):
    chromsome_name = "chr"+chrom
    chrom_pos_1st = chrom_pos_list[0]
    chrom_pos_2nd = chrom_pos_list[1]
    chrom_pos_3rd = chrom_pos_list[2]
    
    if (chrom_pos_1st < chrom_pos_3rd):
        chrom_pos_list_adj = chrom_pos_list
    else:
        chrom_pos_list_adj = reversed(chrom_pos_list) #For reverse strand
        
    seq = ""
    for chrom_pos in chrom_pos_list:
        seq_start = chrom_pos - 1
        query = !./twoBitToFa hg19.2bit stdout -seq=$chromsome_name -start=$seq_start -end=$chrom_pos
        seq = seq+query[1]
    
    #Complement strand - transversing the bp to base-complement
    if (chrom_raw_data.find("complement") >= 0):
        complement_seq = []
        for c in seq:
            if (c == 'A'):
                complement_seq.append('T')
            elif (c == 'T'):
                complement_seq.append('A')
            elif (c == 'G'):
                complement_seq.append('C')
            else:
                complement_seq.append('G')
        seq = complement_seq[0]+complement_seq[1]+complement_seq[2]
        
    return seq

In [56]:
print type(2)

<type 'int'>


In [59]:
#A function that return a dict with the MAF info for the protein position and corresponding chromosomal location
def calc_exac_maf_data(chrom_pos_list, chrom_gene_table, protein_pos, aa, chrom_raw_data, chrom):
    
    res_dict = {}
    res_dict["chrom"] = chrom
    res_dict["chrom_pos"] = chrom_pos_list
    res_dict["prot_pos"] = protein_pos
    res_dict["aa_ref"] = aa
    res_dict["bp_ref"] = retrieve_codon_seq(chrom_pos_list, chrom_raw_data, chrom)
    
    #For error logging
    functionNameAsString = sys._getframe().f_code.co_name
    print "chrom_pos_list = "+str(chrom_pos_list)
    #Validation: checking that the returned codon sequence from hg19 match the HMMER amino-acid
    translated_aa = codon_table[(res_dict["bp_ref"]).upper()]
    if (translated_aa != aa):
        print functionNameAsString+" Error: hg19 codon sequence retrieved "+(res_dict["bp_ref"]).upper()+"="+translated_aa+" doesn't match HMMER amino-acid "+aa
    
    alterations_af_dict = defaultdict(list)
    alterations_af_adj_dict = defaultdict(list)
    
    for i in range(len(chrom_pos_list)):
        chrom_pos = chrom_pos_list[i]
        alt_codon_pos = i
        #print str(chrom_pos)
            
        #Retreiving relevant ExAC entry
        chrom_alter = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]
        chrom_alter = chrom_alter.reset_index(drop=True)
                
        if (chrom_alter.shape[0] == 0):
            #No ExAC entry for this chromosome position - not adding alteration data
            continue
        else:
            exac_prot_data = True
            #Validation: the ExAC chromosome position is within the protein
            if (chrom_alter["prot_pos"][0] == ""):
                #print functionNameAsString+" Error: ExAC chromosome position "+str(chrom_pos)+" doesn't correspond to a protein"
                #We assume it's an error in ExAC and logging alteration anyway.
                exac_prot_data = False
            else:
                #Validation: the ExAC protein position match the HMMER protein position
                exac_prot_pos = chrom_alter["prot_pos"][0]
                if (exac_prot_pos.find("-") >=0):
                    first_exac_prot_pos = int(exac_prot_pos[:exac_prot_pos.find("-")])
                else:
                    first_exac_prot_pos = int(exac_prot_pos)
                #in case there's more than one position listed
                if (first_exac_prot_pos != protein_pos):
                    print "type(first_exac_prot_pos) = "+str(type(first_exac_prot_pos))
                    print "type(protein_pos) = "+str(type(protein_pos))
                    print functionNameAsString+" Error: ExAC protein position "+str(first_exac_prot_pos)+" doesn't match HMMER protein position "+str(protein_pos)

                #Validation: the ExAC aa match the HMMER aa
                exac_aa = chrom_alter["amino_acids"][0]
                exac_ref_aa = exac_aa[:1]
                exac_alt_aa = exac_aa[exac_aa.find("/")+1:]
                if (exac_ref_aa != aa):
                    print functionNameAsString+" Error: ExAC amino acid identity "+exac_ref_aa+" doesn't match HMMER amino-acid "+aa
                
                #Extracting aa codon data if exist
                exac_codons = chrom_alter["codons"][0]
                exac_ref_codon = exac_codons[:exac_codons.find("/")]
                exac_alt_codon = exac_codons[exac_codons.find("/")+1:]
                #Validation: the ExAC codon match the returned codon sequence from hg19
                if (exac_ref_codon.upper() != (res_dict["bp_ref"]).upper()):
                    print functionNameAsString+" Error: ExAC bp codon "+exac_ref_codon.upper()+" doesn't match hg19 codon sequence retrieved "+(res_dict["bp_ref"]).upper()
                
                #Getting the changed position inside the ExAC codon
                exac_alt_codon_pos = 0
                for c in exac_ref_codon:
                    if c.isupper():
                        break
                    exac_alt_codon_pos += 1
                #Validation: the ExAC alt position match my alt codon position calculation
                if (exac_alt_codon_pos != alt_codon_pos):
                    print functionNameAsString+" Error: the ExAC alt position in codon "+str(exac_alt_codon_pos)+" doesn't match my codon position calculation "+str(alt_codon_pos)

            #Creating lists in case there's more than one alteration in this chromosomal position
            alt_bp_list = (chrom_alter["alt"][0]).split(",")
            af_list = (chrom_alter["AF"][0]).split(",")
            an_adj = int(chrom_alter["AN_Adj"][0])
            ac_adj_list = (chrom_alter["AC_Adj"][0]).split(",")
            exac_ref_bp = chrom_alter["ref"][0]
                
            for j in range(len(alt_bp_list)):
                
                #Calculating the alteration relevant data
                curr_alt_bp = alt_bp_list[j]
                #print "curr_alt_bp = "+curr_alt_bp
                alt_codon = create_alt_codon(exac_ref_bp, curr_alt_bp, res_dict["bp_ref"], alt_codon_pos, chrom_raw_data)
                if (len(alt_codon) != 3):
                    alt_aa = "indel"
                else:
                    alt_aa = codon_table[alt_codon.upper()]
                #print "alt codon = "+alt_codon
                #print "alt aa = "+alt_aa
                
                #Validation: ExAC alt codon and aa for the first alteration match the claculated alt
                if (j == 0 and exac_prot_data and exac_alt_codon.upper() != alt_codon):
                    #print "alt_codon = "+alt_codon
                    #print "exac_alt_codon = "+exac_alt_codon.upper()
                    print functionNameAsString+" Error: the ExAC alt codon "+exac_alt_codon+" doesn't match my alt codon calculation "+alt_codon
                if (j == 0 and exac_prot_data and exac_alt_aa != alt_aa):
                    print functionNameAsString+" Error: the ExAC alt aa "+exac_alt_aa+" doesn't match my alt aa calculation "+alt_aa
                    
                if (alt_aa == res_dict["aa_ref"]):
                    #Not logging alteration for synonymous mutations
                    #print "Synonuymous"
                    continue
                
                #Non-synonymous(!!!) - logging the alteration
                else:
                    #print "Non-Synonuymous"
                    alterations_af_dict[alt_aa].append(float(af_list[j]))
                    af_adj = float(ac_adj_list[j])/float(an_adj)
                    af_adj_format = float('{:.3e}'.format(float(af_adj)))
                    alterations_af_adj_dict[alt_aa].append(af_adj_format)
    
    #Calculating the overall MAF from the alteration dicts
    res_dict["af"] = 0
    res_dict["af_adj"] = 0
    
    #Checking if any alteration is above 0.5, and changing the ref accordingly
    for aa in alterations_af_dict.keys():
        aa_sum = sum(alterations_af_dict[aa])
        aa_adj_sum = sum(alterations_af_adj_dict[aa])
        if (aa != "indel" and aa_sum > 0.5):
            res_dict["aa_ref"] = aa
            res_dict["af"] =(1 - aa_sum)
            res_dict["af_adj"] = (1 - aa_adj_sum)
            break
        else:
            res_dict["af"] += aa_sum
            res_dict["af_adj"] += aa_adj_sum
        
        #Fix the AF format
        res_dict["af"] = float('{:.3e}'.format(float(res_dict["af"])))
        res_dict["af_adj"] = float('{:.3e}'.format(float(res_dict["af_adj"])))
    
    return res_dict

In [70]:
ens_gene = "ENSG00000121903"
canonic_prot = canonic_protein[ens_gene]
domain_gene_table = domain_chrom_data[domain_chrom_data["prot_id"] == canonic_prot]
#Making sure that if two HMM-matches overlaps, the higher bit score will come first in the table
domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)

#Creating a table of the exons for this gene, according to the canonical protein
chrom_raw_data = domain_gene_table["chromosome_id"].unique()[0] #there should be only one element here
if (len(domain_gene_table["chromosome_id"].unique()) > 1):
    print functionNameAsString+" Error: "+ens_gene+": more than one chromosome raw data" #sanity check
exon_table = create_exon_pos_table(chrom_raw_data)

exons_start_pos = min(exon_table["start_pos"][0],exon_table["start_pos"][len(exon_table)-1]) #in case of complelemt, the minimal position could be at the last row
exons_end_pos = max(exon_table["end_pos"][0],exon_table["end_pos"][len(exon_table)-1]) #in case of complelemt, the maximal position could be at the first row
chrom_gene_table = chrom_csv[chrom_csv["pos"] >= int(exons_start_pos)][chrom_csv["pos"] <= int(exons_end_pos)]
chrom_gene_table = chrom_gene_table.reset_index(drop=True)
chrom_gene_size = chrom_gene_table.shape[0]

In [63]:
chrom_path = curr_dir[0]+"/parsed/"
chrom_filename = "parsed_chrom"
states_dict = defaultdict(list)

#For error logging
functionNameAsString = sys._getframe().f_code.co_name

for chrom in chromosome_names:
    
    #Filtering the domain data relevant to this chromosome
    domain_chrom_data = sorted_zinc[sorted_zinc["chrom_num"] == chrom]
    
    #Loading the ExAC parsed data of this chromosome
    chrom_csv = pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0)
    chrom_csv = chrom_csv.sort_values(by=["pos"])
    chrom_csv = chrom_csv.reset_index(drop=True)
    chrom_csv.fillna('', inplace=True)
    
    #Getting a list of all the relevant ensembl gene ids for this chromosome
    domain_ens_genes = (domain_chrom_data["ensembl_id"]).unique()
    
    #For each ensembl gene in the domain data - finding all the ExAC alterations
    for ens_gene in domain_ens_genes:
        
        #Filtering the domain data for this gene according to the canonical protein id
        canonic_prot = canonic_protein[ens_gene]
        domain_gene_table = domain_chrom_data[domain_chrom_data["prot_id"] == canonic_prot]
        #Making sure that if two HMM-matches overlaps, the higher bit score will come first in the table
        domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)
        
        #Creating a table of the exons for this gene, according to the canonical protein
        chrom_raw_data = domain_gene_table["chromosome_id"].unique()[0] #there should be only one element here
        if (len(domain_gene_table["chromosome_id"].unique()) > 1):
            print functionNameAsString+" Error: "+ens_gene+": more than one chromosome raw data" #sanity check
        exon_table = create_exon_pos_table(chrom_raw_data)
        
        #Filtering the chromosome data to the exons region
        exons_start_pos = min(exon_table["start_pos"][0],exon_table["start_pos"][len(exon_table)-1]) #in case of complelemt, the minimal position could be at the last row
        exons_end_pos = max(exon_table["end_pos"][0],exon_table["end_pos"][len(exon_table)-1]) #in case of complelemt, the maximal position could be at the first row
        chrom_gene_table = chrom_csv[chrom_csv["pos"] >= int(exons_start_pos)][chrom_csv["pos"] <= int(exons_end_pos)]
        chrom_gene_table = chrom_gene_table.reset_index(drop=True)
        chrom_gene_size = chrom_gene_table.shape[0]
        
        #Iterating over the amino-acids of the protein
        prot_len = domain_gene_table["length"].unique()[0]
        for protein_pos in range(1,prot_len+1):
    
            #Trying to match HMM-state, and retreive the aa from HMMER results
            (hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table) #TODO: what happens when two matches overlap? maybe sort to the best bit score?
                
            #If there's a match to HMM-state: find the corresponding codon bps chromosome positions
            if (hmm_state > 0):
                chrom_pos_list =find_chrom_bps(protein_pos, exon_table, chrom_raw_data)
                
                #Analysis of the amino-acid MAF and realted data, returned in a dictionary
                info_dict = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, protein_pos, aa, chrom_raw_data, chrom)
                
                #Adding the dictionary to the HMM-state list
                states_dict[hmm_state].append(info_dict)
                    
    print "Finished chromosome "+chrom

with open(my_path+'zinc_hmm_states_dict.pik', 'wb') as handle:
    pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

chrom_pos_list = (16271443, 16271442, 16271441)
chrom_pos_list = (16271440, 16271439, 16271438)
chrom_pos_list = (16271437, 16271436, 16271435)
chrom_pos_list = (16271434, 16271433, 16271432)
chrom_pos_list = (16271334, 16271333, 16271332)
chrom_pos_list = (16271331, 16271330, 16271329)
chrom_pos_list = (16271328, 16271327, 16271326)
chrom_pos_list = (16271325, 16271324, 16271323)
chrom_pos_list = (16271322, 16271321, 16271320)
chrom_pos_list = (16271319, 16271318, 16271317)
chrom_pos_list = (16271316, 16271315, 16271314)
chrom_pos_list = (16271313, 16271312, 16271311)
chrom_pos_list = (16271310, 16271309, 16271308)
chrom_pos_list = (16271307, 16271306, 16271305)
chrom_pos_list = (16271304, 16271303, 16271302)
chrom_pos_list = (16271301, 16271300, 16271299)
chrom_pos_list = (16271298, 16271297, 16271296)
chrom_pos_list = (16271295, 16271294, 16271293)
chrom_pos_list = (16271292, 16271291, 16271290)
chrom_pos_list = (16271289, 16271288, 16271287)
chrom_pos_list = (16271286, 16271285, 16

KeyboardInterrupt: 

In [72]:
exon_table

Unnamed: 0,start_pos,end_pos,length,first_bp_count
0,33944890,33945306,417,1
1,33954065,33954251,187,418
2,33954715,33954791,77,605
3,33955118,33955202,85,682
4,33956625,33957302,678,767
5,33958787,33959215,429,1445
6,33959818,33961076,1259,1874


In [71]:
domain_gene_table

Unnamed: 0,#TargetID,Hugo_symbol,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,Target_Seq,HMM_Pos,prot_id,ensembl_id,transcript_id,chromosome_id,chrom_num,length,refseq,hmm_start,hmm_end
34,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,9.8e-11,36.6,794,816,ykCpdCgksFkrksnLkrHirtH,YKCGECWKSFNQSSNLLKHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
32,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,2e-10,35.6,766,788,ykCpdCgksFkrksnLkrHirtH,YKCLECGKSFSDHSNLITHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
36,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,1.9e-10,35.6,903,925,ykCpdCgksFkrksnLkrHirtH,YECAECGKSFSKSSTLANHQRTH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
31,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,2.3e-10,35.3,738,760,ykCpdCgksFkrksnLkrHirtH,YKCLECGKNFSDRSNLNTHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
35,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,2.3e-10,35.3,875,897,ykCpdCgksFkrksnLkrHirtH,YECSECGRSFSKSSALISHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
39,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,4.4e-10,34.4,987,1009,ykCpdCgksFkrksnLkrHirtH,YKCRECGKCFNQSSSLIIHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
38,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,5e-10,34.2,959,981,ykCpdCgksFkrksnLkrHirtH,YKCLECGKFFRDRSNLITHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
37,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,1.5e-09,32.6,931,953,ykCpdCgksFkrksnLkrHirtH,YKCVDCGKCFSERSKLITHQRVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
40,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,2.7e-09,31.8,1015,1037,ykCpdCgksFkrksnLkrHirtH,YKCTECGKDFNNSSHFSAHRRTH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23
30,ZSCAN20.001,ZSCAN20,PF00096,zf-C2H2,2.7e-08,28.5,710,732,ykCpdCgksFkrksnLkrHirtH,YKCDTCMKSFSRSSHFIAHQRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000355053,ENSG00000121903,ENST00000361328,"GRCh37.75:1:join(33944890..33945306,33954065.....",1,1043,NP_660281,1,23


In [76]:
protein_pos = 730
(hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table)
chrom_pos_list = find_chrom_bps(protein_pos, exon_table, chrom_raw_data)
info_dict = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, protein_pos, aa, chrom_raw_data, chrom)

chrom_pos_list = (33960132, 33960133, 33960134)
calc_exac_maf_data Error: the ExAC alt aa * doesn't match my alt aa calculation _


In [934]:
info_dict

{'aa_ref': 'K',
 'af': 0,
 'af_adj': 0,
 'bp_ref': 'AAG',
 'chrom': '1',
 'chrom_pos': (41012506, 41012507, 41012508),
 'prot_pos': 171}

In [69]:
chrom_csv[chrom_csv["pos"] == 33960184]

Unnamed: 0,chrom,pos,id,ref,alt,qual,filter,AC,AC_Adj,AF,...,DP,gene,conseq,prot_pos,amino_acids,codons,SWISSPROT,SIFT,domains,clin_sig
212234,1,33960184,.,TTA,T,2685.49,PASS,1,1,9.431e-06,...,1501809,ENSG00000121903,frameshift_variant&missense_variant&feature_tr...,747-748,FS/LX,ttTAgt/ttgt,ZSC20_HUMAN,,Pfam_domain:PF00096&PROSITE_profiles:PS50157&S...,


In [65]:
chrom

'1'