In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt
import pickle
from IPython.core.display import HTML
#Build a CSS override that stretches the notebook container to the full page width
#(it only takes effect when this HTML object is rendered as the cell's output)
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Reading the Zinc-finger domain data
#"!pwd" is an IPython shell capture: curr_dir holds the command's output lines, so curr_dir[0] is the working directory
#NOTE: this line is IPython-only syntax and the paths below are built relative to the directory the kernel was started in
curr_dir = !pwd
my_path = curr_dir[0]+"/hmm_domains/"
filename = "zf-C2H2.csv"
#The file is read as tab-separated (despite the .csv extension) and its first column becomes the index
zinc_finger = pd.read_csv(my_path+filename, sep='\t', index_col=0)
#Sort the zinc finger data by chromosome, then gene, then domain start position on the protein
sorted_zinc = zinc_finger.sort_values(by=["chrom_num", "ensembl_id", "TargetStart"])
#Renumber the rows 0..n-1 in the sorted order (the previous index is discarded)
sorted_zinc = sorted_zinc.reset_index(drop=True)

In [3]:
#Get the canonic protein id for Zinc domain
#canonic_protein is later indexed by ensembl gene id (canonic_protein[ens_gene]) to pick one protein per gene
#NOTE(review): pickle.load can execute arbitrary code - only load a .pik file that was produced by this project
with open(my_path+'zinc_canonic_prot.pik', 'rb') as handle:
    canonic_protein = pickle.load(handle)

In [4]:
#Names of the chromosomes that are analysed: "1".."22" followed by "X" and "Y"
chromosome_names = list(map(str, range(1, 23))) + ["X", "Y"]

In [5]:
#Count the domain rows whose chromosome is not one of the names in chromosome_names
chrom_names_list = sorted_zinc["chrom_num"].tolist()
starnge_chrom_sum = sum(1 for name in chrom_names_list if name not in chromosome_names)
print("Leaving outside "+str(starnge_chrom_sum)+" out of "+str(len(chrom_names_list)))

Leaving outside 74 out of 10492


In [6]:
#A function that gets the raw chromosome location from the hmmer results and returns a table of the exons
def create_exon_pos_table(chrom_raw):
    """Parse a raw chromosome-location string into a table of exons.

    chrom_raw is expected to look like "<assembly>:<chromosome>:<location>",
    where <location> is "start..end", "join(start..end,start..end,...)", or
    either of these wrapped in "complement(...)".

    Returns a DataFrame with one row per exon, in the order the exons are
    listed in chrom_raw, with the columns:
        start_pos      - first coordinate of the exon (kept as a string)
        end_pos        - second coordinate of the exon (kept as a string)
        length         - number of bases in the exon (int)
        first_bp_count - 1-based running index of the exon's first base
                         within the concatenation of all listed exons (int)

    Raises ValueError if a coordinate cannot be converted to an integer.
    """
    exons_raw = chrom_raw

    #Removing the complement brackets if they exist (the last char is the closing bracket)
    complement_idx = exons_raw.find("complement(")
    if (complement_idx >= 0):
        exons_raw = exons_raw[complement_idx+len("complement("):-1]

    #Removing the join brackets if they exist (the last char is the closing bracket)
    join_idx = exons_raw.find("join(")
    if (join_idx >= 0):
        exons_raw = exons_raw[join_idx+len("join("):-1]

    #In case there's only one exon, take everything after the second ":"
    #(when the complement brackets were already removed there is no ":" left and the text is kept whole)
    else:
        exons_raw = exons_raw.split(":", 2)[-1]

    exon_pos = []
    for ex in exons_raw.split(","):
        bounds = ex.split("..")
        #An exon written as a single coordinate (no "..") starts and ends on the same base
        exon_pos.append([bounds[0], bounds[-1]])
    exon_df = pd.DataFrame(exon_pos, columns=["start_pos", "end_pos"])

    #Label-based, vectorised arithmetic: positional access such as exon[0] on a
    #string-labelled row is deprecated in pandas
    exon_len = exon_df["end_pos"].astype("int64") - exon_df["start_pos"].astype("int64") + 1
    exon_df["length"] = exon_len

    #The first base of each exon comes right after all the bases of the exons listed before it
    exon_df["first_bp_count"] = exon_len.cumsum() - exon_len + 1
    return(exon_df)

In [7]:
#A function that gets a chromosome position and a table of exons, and returns the protein position or -1 if it's not within any exon
def find_protein_pos(chrom_pos, exon_df, chrom_raw):
    """Translate a chromosome position into a 1-based protein position.

    chrom_pos - chromosome coordinate (int)
    exon_df   - exon table with the columns "start_pos", "end_pos" and
                "first_bp_count" (as built by create_exon_pos_table)
    chrom_raw - the raw location string the exon table was built from; the
                gene is treated as reverse-strand when it contains "complement"

    Returns the protein position of the codon that contains chrom_pos, or -1
    when chrom_pos is not inside any exon of the table.
    """
    #str.find returns -1 (which is truthy) when the text is missing, so the strand
    #has to be tested with "in" and not with the truth value of find()
    is_complement = "complement" in chrom_raw

    for index, exon in exon_df.iterrows():
        start_pos = int(exon["start_pos"])
        end_pos = int(exon["end_pos"])
        first_bp_count = int(exon["first_bp_count"])
        if (start_pos <= chrom_pos <= end_pos):

            #Reverse complement strand: the protein is translated from the end position towards the start position of the exon
            if (is_complement):
                len_from_exon_start = end_pos - chrom_pos
            #Forward strand
            else:
                len_from_exon_start = chrom_pos - start_pos

            #Calculate the position on the mRNA transcript
            transcript_pos = len_from_exon_start + first_bp_count

            #Calculate the position on the protein sequence (3 bases per amino acid, rounded up)
            protein_pos = int(math.ceil(float(transcript_pos)/3))
            return protein_pos

    #If the position wasn't in the regions of any exon
    return -1

In [50]:
#A function that gets a protein position and a table of exons, and returns the chromosome positions of the 3 bps of the codon
def find_chrom_bps(protein_pos, exon_table, chrom_raw_data):
    """Find the chromosome positions of the three bases that encode a protein position.

    protein_pos    - 1-based position on the protein
    exon_table     - exon table with the columns "start_pos", "end_pos",
                     "length" and "first_bp_count"; the rows must be in
                     transcript order (as built by create_exon_pos_table)
    chrom_raw_data - the raw location string the exon table was built from;
                     the gene is treated as reverse-strand when it contains
                     "complement"

    Returns a tuple (chrom_pos_1st, chrom_pos_2nd, chrom_pos_3rd) of ints.
    A codon that spans an exon boundary continues in the next row of the
    table. Returns None when the three bases are not all inside the exons.
    """
    #str.find returns -1 (which is truthy) when the text is missing, so the strand
    #has to be tested with "in" and not with the truth value of find()
    is_complement = "complement" in chrom_raw_data

    #calculate the mRNA transcript index of this protein position (the 1st bp in the triplet)
    first_transcript_pos = (protein_pos*3)-2

    codon_positions = []

    #Iterating over all the gene exons
    for index, exon in exon_table.iterrows():
        first_bp_count = int(exon["first_bp_count"])
        last_bp_count = first_bp_count + int(exon["length"]) - 1
        start_pos = int(exon["start_pos"])
        end_pos = int(exon["end_pos"])

        #Collect every codon base that still needs a position and falls inside this exon
        while (len(codon_positions) < 3):
            transcript_pos = first_transcript_pos + len(codon_positions)
            if not (first_bp_count <= transcript_pos <= last_bp_count):
                #Not in this exon: the remaining bases (if any) are looked for in the next exon
                break

            len_from_exon_start = transcript_pos - first_bp_count

            #Reverse complement strand: the protein is translated from the end position towards the start position of the exon
            if (is_complement):
                codon_positions.append(end_pos - len_from_exon_start)
            #Forward strand
            else:
                codon_positions.append(start_pos + len_from_exon_start)

        if (len(codon_positions) == 3):
            return tuple(codon_positions)

    #The codon is not (entirely) covered by the exons
    return None
        

In [8]:
#Boolean function - tells whether a given text can be converted to a number
def is_number(s):
    """Return True when float(s) succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    return True

In [94]:
#A function that returns the hmm state and the amino acid of a protein position
#TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
def protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table):
    """Look up the HMM state and the amino acid of a protein position.

    The rows of domain_gene_table are scanned in order and the first row whose
    [TargetStart, TargetEnd] range contains protein_pos is used. The offset of
    the position inside that range selects an entry of the comma-separated
    "HMM_Pos" text and a character of "Target_Seq".

    Returns (hmm_state, aa):
        hmm_state is the numeric HMM_Pos entry, or -2 when the entry is not a
                  number (the position matches an insertion);
        (-1, '-') is returned when the position is not inside any row.
    """
    for _, match in domain_gene_table.iterrows():
        domain_start = match["TargetStart"]
        domain_end = match["TargetEnd"]

        #Skip domain instances that don't contain the position
        if not (protein_pos >= domain_start and protein_pos <= domain_end):
            continue

        offset_in_match = (protein_pos - domain_start)

        #Find the HMM match state; a non-numeric entry means the position matches an insertion
        state_text = (match["HMM_Pos"]).split(",")[offset_in_match]
        hmm_state = int(state_text) if is_number(state_text) else -2

        #Find the amino acid
        residue = list(match["Target_Seq"])[offset_in_match]

        return(hmm_state, residue)

    #The protein position isn't in any domain region
    return (-1,'-')

In [203]:
#A function that creates the new codon for the alteration
def create_alt_codon(curr_alt_bp, ref_codon, alt_codon_pos, chrom_raw_data):
    """Build the altered codon by replacing one base of the reference codon.

    curr_alt_bp    - the alternative base as given in the ExAC "alt" field
    ref_codon      - the reference codon
    alt_codon_pos  - 0-based index of the base to replace inside ref_codon
    chrom_raw_data - raw location string of the gene; when it contains
                     "complement" the alternative base is replaced by its
                     complementary base before it is inserted

    Returns the new codon as a string.
    """
    #str.find returns -1 (which is truthy) when the text is missing, so the strand
    #has to be tested with "in" and not with the truth value of find()
    if ("complement" in chrom_raw_data):
        #Complement strand - converting the bp to its base-complement
        base_complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
        #NOTE(review): as before, any other allele (e.g. a multi-base indel allele) is
        #also replaced by 'G'; such alleles can't be expressed as a single-base change
        #and should probably be filtered out by the caller
        new_bp = base_complement.get(curr_alt_bp, 'G')
    #Regular strand
    else:
        new_bp = curr_alt_bp

    new_alt_codon = ref_codon[:alt_codon_pos]+new_bp+ref_codon[alt_codon_pos+1:]

    return new_alt_codon

In [212]:
#Translation table: DNA codon (uppercase) -> one-letter amino acid code
#The three stop codons (TAA, TAG, TGA) are mapped to '_'
#Lookups are done with an upper-cased codon (see codon_table[alt_codon.upper()] in calc_exac_maf_data)
codon_table = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }

In [264]:
#A function that returns a dict with the MAF info for the protein position and corresponding chromosomal location
def calc_exac_maf_data(chrom_pos_list, chrom_gene_table, protein_pos, aa, chrom_raw_data, chrom=None):
    """Collect the ExAC allele-frequency data of one protein position.

    chrom_pos_list   - chromosome positions of the codon's bases (the tuple
                       returned by find_chrom_bps)
    chrom_gene_table - ExAC rows of the gene region; the columns read here are
                       "pos", "prot_pos", "amino_acids", "alt", "AF",
                       "AN_Adj", "AC_Adj" and "codons"
    protein_pos      - 1-based protein position (HMMER numbering)
    aa               - reference amino acid at that position (from HMMER)
    chrom_raw_data   - raw location string of the gene,
                       "<assembly>:<chromosome>:<location>"
    chrom            - chromosome name to store in the result; when omitted it
                       is taken from the second ":"-separated field of
                       chrom_raw_data (None if that field doesn't exist)

    Returns a dict with the keys:
        "chrom", "chrom_pos", "prot_pos" - echo of the inputs
        "aa_ref" - reference amino acid; replaced by an alternative amino acid
                   when that alternative's summed AF is above 0.5
        "af"     - summed AF of the non-synonymous alterations, or
                   1 - (AF of the new reference) when the reference was replaced
        "af_adj" - the same, based on AC_Adj/AN_Adj

    Validation problems are printed and don't raise. Only the first ExAC row
    of every chromosome position is used.
    """
    #The chromosome used to be read from a notebook-level variable; derive it from the raw location instead
    if (chrom is None):
        raw_fields = chrom_raw_data.split(":")
        if (len(raw_fields) > 2):
            chrom = raw_fields[1]

    res_dict = {}
    res_dict["chrom"] = chrom
    res_dict["chrom_pos"] = tuple(chrom_pos_list)
    res_dict["prot_pos"] = protein_pos
    res_dict["aa_ref"] = aa

    #amino acid -> list of frequencies, one entry per alteration that produces that amino acid
    alterations_af_dict = defaultdict(list)
    alterations_af_adj_dict = defaultdict(list)
    for chrom_pos in chrom_pos_list:
        #Retrieving relevant ExAC entry
        chrom_alter = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]

        #No ExAC entry for this chromosome position
        if (chrom_alter.shape[0] == 0):
            continue
        exac_row = chrom_alter.iloc[0]

        #Validation: the ExAC chromosome position is within the protein
        exac_prot_pos = exac_row["prot_pos"]
        if (pd.isnull(exac_prot_pos) or exac_prot_pos == ""):
            print("Error: ExAC chromosome position doesn't correspond to a protein")
            #Not adding any alteration data
            continue

        #Validation: the ExAC protein position match the HMMER protein position
        if (int(exac_prot_pos) != protein_pos):
            print("Error: ExAC protein position mismatch")

        #Validation: the ExAC aa match the HMMER aa
        #("ref/alt" text; when there's no "/" the whole text is used for both, i.e. synonymous)
        exac_aa = exac_row["amino_acids"]
        exac_aa_ref = exac_aa[:1]
        exac_aa_alt = exac_aa[exac_aa.find("/")+1:]
        if (exac_aa_ref != aa):
            print("Error: ExAC amino acid identity mismatch")

        #Creating lists in case there's more than one alteration in this chromosomal position
        alt_bp_list = str(exac_row["alt"]).split(",")
        af_list = str(exac_row["AF"]).split(",")
        an_adj = float(exac_row["AN_Adj"])
        ac_adj_list = str(exac_row["AC_Adj"]).split(",")
        codons = exac_row["codons"]
        ref_codon = codons[:codons.find("/")]

        #The altered base is the first uppercase letter of the reference codon
        alt_codon_pos = 0
        for c in ref_codon:
            if c.isupper():
                break
            alt_codon_pos += 1

        for j, curr_alt_bp in enumerate(alt_bp_list):
            #The first alteration has ExAC meta-data
            if (j == 0):
                alt_aa = exac_aa_alt
            #Additional alterations don't have ExAC meta-data: translating the altered codon
            else:
                alt_codon = create_alt_codon(curr_alt_bp, ref_codon, alt_codon_pos, chrom_raw_data)
                alt_aa = codon_table.get(alt_codon.upper())
                if (alt_aa is None):
                    print("Error: cannot translate alternative codon "+alt_codon)
                    continue

            #Synonymous: not adding any alteration data
            if (alt_aa == exac_aa_ref):
                continue

            #Non-synonymous: the frequencies are stored as numbers so they can be summed below
            alterations_af_dict[alt_aa].append(float(af_list[j]))
            #No adjusted allele number means there were no adjusted observations at all
            if (an_adj):
                af_adj = float(ac_adj_list[j])/an_adj
            else:
                af_adj = 0.0
            af_adj_format = float('{:.3e}'.format(af_adj))
            alterations_af_adj_dict[alt_aa].append(af_adj_format)

    #Calculating the overall MAF from the alteration dicts
    res_dict["af"] = 0
    res_dict["af_adj"] = 0

    #Summing all the alterations, and remembering an alteration that is above 0.5 (if any)
    major_aa = None
    for alt_aa in alterations_af_dict.keys():
        aa_sum = sum(alterations_af_dict[alt_aa])
        if (aa_sum > 0.5):
            major_aa = alt_aa
        res_dict["af"] += aa_sum
        res_dict["af_adj"] += sum(alterations_af_adj_dict[alt_aa])

    #An alteration above 0.5 becomes the reference; everything else is then the minor fraction.
    #This is decided after the loop so the result doesn't depend on the dict iteration order
    if (major_aa is not None):
        res_dict["aa_ref"] = major_aa
        res_dict["af"] = (1 - sum(alterations_af_dict[major_aa]))
        res_dict["af_adj"] = (1 - sum(alterations_af_adj_dict[major_aa]))

    return res_dict

In [267]:
#Inspect the domain rows of the gene that is currently held in the kernel
#NOTE: in the visible notebook domain_gene_table is only assigned inside the per-chromosome loop cell, so that cell must have run first
domain_gene_table

Unnamed: 0,#TargetID,Hugo_symbol,pfam_id,domain_name,E-value,BitScore,TargetStart,TargetEnd,HMM_Seq,Target_Seq,HMM_Pos,prot_id,ensembl_id,transcript_id,chromosome_id,chrom_num,length,refseq,hmm_start,hmm_end
7,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,2.7e-10,35.1,390,412,ykCpdCgksFkrksnLkrHirtH,YRCEDCGKLFTTSGNLKRHQLVH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
15,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,9.1e-10,33.4,474,496,ykCpdCgksFkrksnLkrHirtH,LKCRECGKQFTTSGNLKRHLRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
2,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,1.5e-09,32.7,306,328,ykCpdCgksFkrksnLkrHirtH,HKCEDCGKEFTHTGNFKRHIRIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
21,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,1.6e-09,32.5,558,580,ykCpdCgksFkrksnLkrHirtH,YVCERCGKRFVQSSQLANHIRHH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
10,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,4.8e-09,30.9,418,440,ykCpdCgksFkrksnLkrHirtH,YQCDYCGRSFSDPTSKMRHLETH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
12,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,9.6e-09,30.0,446,468,ykCpdCgksFkrksnLkrHirtH,HKCPHCDKKFNQVGNLKAHLKIH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
18,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,1.4e-08,29.4,530,552,ykCpdCgksFkrksnLkrHirtH,CQCVMCGKAFTQASSLIAHVRQH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23
23,ZBTB17.002,ZBTB17,PF00096,zf-C2H2,4.5e-08,27.7,614,637,ykCpdCgksFkrksnLkrHirt.H,YLCDKCGRGFNRVDNLRSHVKTvH,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...",ENSP00000364895,ENSG00000116809,ENST00000375743,GRCh37.75:1:complement(join(16274786..16274990...,1,803,NP_003434,1,23


In [268]:
#Debugging a single residue: protein position 550 of the gene whose tables are currently in the kernel
#NOTE: domain_gene_table, exon_table, chrom_raw_data and chrom_gene_table are not defined in this cell;
#in the visible notebook they are only assigned inside the per-chromosome loop cell, so that cell must have run first
protein_pos = 550
#HMM state and reference amino acid of the position (from the HMMER domain rows)
(hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table)
#Chromosome positions of the three bases of the codon
chrom_pos_list = find_chrom_bps(protein_pos, exon_table, chrom_raw_data)
#ExAC frequency data of the codon
info_dict = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, protein_pos, aa, chrom_raw_data)

Error: ExAC chromosome position doesn't correspond to a protein


In [282]:
#Display the amino acid currently stored in the notebook-level variable aa
aa

'R'

In [274]:
#Display the exon table of the gene that is currently held in the kernel
exon_table

Unnamed: 0,start_pos,end_pos,length,first_bp_count
0,16274786,16274990,205,1
1,16273430,16273618,189,206
2,16272649,16272789,141,395
3,16272210,16272335,126,536
4,16271432,16271697,266,662
5,16271192,16271334,143,928
6,16270795,16271095,301,1071
7,16270303,16270390,88,1372
8,16270094,16270210,117,1460
9,16269894,16270014,121,1577


In [270]:
#Display the chromosome positions of the codon computed by find_chrom_bps
chrom_pos_list

(16269943, 16269942, 16269941)

In [281]:
#Look up the ExAC row at chromosome position 16269941 (hard-coded position under investigation)
chrom_gene_table[chrom_gene_table["pos"] == 16269941]

Unnamed: 0,chrom,pos,id,ref,alt,qual,filter,AC,AC_Adj,AF,...,DP,gene,conseq,prot_pos,amino_acids,codons,SWISSPROT,SIFT,domains,clin_sig
175,1,16269941,.,G,C,1631.48,PASS,1,1,9.416e-06,...,1363977,ENSG00000116809,non_coding_transcript_exon_variant&non_coding_...,,,,,,,


In [279]:
#Keep the ExAC row(s) of the position under investigation for a closer look
problem = chrom_gene_table[chrom_gene_table["pos"] == 16269941]

In [280]:
#Dump the row as a dict to see the exact value of every field (e.g. empty strings)
problem.to_dict()

{'AC': {175: '1'},
 'AC_Adj': {175: '1'},
 'AF': {175: '9.416e-06'},
 'AN': {175: 106202},
 'AN_Adj': {175: 101894},
 'DP': {175: 1363977},
 'SIFT': {175: ''},
 'SWISSPROT': {175: ''},
 'alt': {175: 'C'},
 'amino_acids': {175: ''},
 'chrom': {175: 1},
 'clin_sig': {175: ''},
 'codons': {175: ''},
 'conseq': {175: 'non_coding_transcript_exon_variant&non_coding_transcript_variant'},
 'domains': {175: ''},
 'filter': {175: 'PASS'},
 'gene': {175: 'ENSG00000116809'},
 'id': {175: '.'},
 'pos': {175: 16269941},
 'prot_pos': {175: ''},
 'qual': {175: 1631.48},
 'ref': {175: 'G'}}

In [255]:
#Scratch cell: runs the first part of calc_exac_maf_data on the notebook-level variables
#NOTE: chrom, chrom_pos_list, protein_pos, aa and chrom_gene_table must already exist in the kernel
res_dict = {}
res_dict["chrom"] = chrom
#find_chrom_bps returns the three codon positions as one tuple, so that tuple is stored directly
res_dict["chrom_pos"] = tuple(chrom_pos_list)
res_dict["prot_pos"] = protein_pos
res_dict["aa_ref"] = aa

alterations_af_dict = defaultdict(list)
alterations_af_adj_dict = defaultdict(list)

for chrom_pos in chrom_pos_list:
    #Retrieving relevant ExAC entry
    chrom_alter = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]
    #Renumbering the rows of the entry that was just filtered (and not of a stale variable)
    chrom_alter = chrom_alter.reset_index(drop=True)

    #No ExAC entry for this chromosome position
    if (chrom_alter.shape[0] == 0):
        continue

    #Validation: the ExAC chromosome position is within the protein
    if (chrom_alter["prot_pos"][0] == ""):
        print("Error: ExAC chromosome position doesn't correspond to a protein")
        #Not adding any alteration data
        continue

In [253]:
#Scratch check: is there an ExAC entry for the third base of the codon?
chrom_pos = chrom_pos_list[2]
chrom_alter = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]

if (len(chrom_alter.index) == 0):
    print("No ExAC")

No ExAC


In [252]:
#Scratch cell: calculating the overall MAF from the alteration dicts
#NOTE: res_dict, alterations_af_dict and alterations_af_adj_dict must already exist in the kernel
res_dict["af"] = 0
res_dict["af_adj"] = 0

#Summing all the alterations, and remembering an alteration that is above 0.5 (if any)
#The loop variable is not called "aa" so the notebook-level reference amino acid isn't overwritten
major_aa = None
for alt_aa in alterations_af_dict.keys():
    aa_sum = sum(alterations_af_dict[alt_aa])
    aa_adj_sum = sum(alterations_af_adj_dict[alt_aa])
    if (aa_sum > 0.5):
        major_aa = alt_aa
    res_dict["af"] += aa_sum
    res_dict["af_adj"] += aa_adj_sum

#An alteration above 0.5 becomes the reference; everything else is then the minor fraction.
#This is decided after the loop so the result doesn't depend on the dict iteration order
if (major_aa is not None):
    res_dict["aa_ref"] = major_aa
    res_dict["af"] = (1 - sum(alterations_af_dict[major_aa]))
    res_dict["af_adj"] = (1 - sum(alterations_af_adj_dict[major_aa]))

In [None]:
#Main analysis: for every chromosome and every gene with a zinc-finger domain, collect the
#ExAC frequency data of each protein position that matches an HMM state.
#Result: states_dict maps an HMM state to the list of info dicts of all the positions matching it
chrom_path = curr_dir[0]+"/parsed/"
chrom_filename = "parsed_chrom"
states_dict = defaultdict(list)

for chrom in chromosome_names:

    #Filtering the domain data relevant to this chromosome
    domain_chrom_data = sorted_zinc[sorted_zinc["chrom_num"] == chrom]

    #Loading the ExAC parsed data of this chromosome (tab-separated), sorted by position;
    #missing values become empty strings, which is what calc_exac_maf_data tests for
    chrom_csv = pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0)
    chrom_csv = chrom_csv.sort_values(by=["pos"]).reset_index(drop=True).fillna('')

    #Getting a list of all the relevant ensembl gene ids for this chromosome
    domain_ens_genes = (domain_chrom_data["ensembl_id"]).unique()

    #For each ensembl gene in the domain data - finding all the ExAC alterations
    for ens_gene in domain_ens_genes:

        #Filtering the domain data for this gene according to the canonical protein id
        canonic_prot = canonic_protein[ens_gene]
        domain_gene_table = domain_chrom_data[domain_chrom_data["prot_id"] == canonic_prot]
        #Making sure that if two HMM-matches overlaps, the higher bit score will come first in the table
        domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)

        #Creating a table of the exons for this gene, according to the canonical protein
        raw_locations = domain_gene_table["chromosome_id"].unique()
        if (len(raw_locations) == 0):
            #No domain row belongs to the canonical protein: nothing to analyse for this gene
            print("Error: "+ens_gene+": no domain data for the canonical protein")
            continue
        if (len(raw_locations) > 1):
            print("Error: "+ens_gene+": more than one chromosome raw data") #sanity check
        chrom_raw_data = raw_locations[0] #there should be only one element here
        exon_table = create_exon_pos_table(chrom_raw_data)

        #Filtering the chromosome data to the exons region.
        #The bounds are compared as integers (comparing the raw strings is lexicographic) and over all
        #the exons, so the row order of the table (forward / complement) doesn't matter
        exons_start_pos = int(exon_table["start_pos"].astype("int64").min())
        exons_end_pos = int(exon_table["end_pos"].astype("int64").max())
        in_exons_region = (chrom_csv["pos"] >= exons_start_pos) & (chrom_csv["pos"] <= exons_end_pos)
        chrom_gene_table = chrom_csv[in_exons_region]
        chrom_gene_table = chrom_gene_table.reset_index(drop=True)
        chrom_gene_size = chrom_gene_table.shape[0]

        #Iterating over the amino-acids of the protein
        prot_len = int(domain_gene_table["length"].unique()[0])
        for protein_pos in range(1,prot_len+1):

            #Trying to match HMM-state, and retrieve the aa from HMMER results
            #(when two matches overlap, the row with the higher bit score is found first)
            (hmm_state, aa) = protein_pos_to_hmm_state_and_aa(protein_pos, domain_gene_table)

            #If there's a match to HMM-state: find the corresponding codon bps chromosome positions
            if (hmm_state > 0):
                chrom_pos_list = find_chrom_bps(protein_pos, exon_table, chrom_raw_data)
                if (chrom_pos_list is None):
                    #The codon isn't covered by the exons: there's no chromosome position to look up
                    print("Error: "+ens_gene+": protein position "+str(protein_pos)+" is outside the exons")
                    continue

                #Analysis of the amino-acid MAF and related data, returned in a dictionary
                info_dict = calc_exac_maf_data(chrom_pos_list, chrom_gene_table, protein_pos, aa, chrom_raw_data)

                #Adding the dictionary to the HMM-state list
                states_dict[hmm_state].append(info_dict)

    print("Finished chromosome "+chrom)

In [14]:
#Save the HMM-state -> position info dictionary next to the domain data
#The file is written in binary mode with the highest pickle protocol available to this interpreter,
#so it may not be readable by an older Python version
with open(my_path+'zinc_hmm_states_dict.pik', 'wb') as handle:
    pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)