In [1]:
import pandas as pd
import numpy as np
import math
from collections import defaultdict
import matplotlib.pyplot as plt
import pickle
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Reading the Zinc-finger (zf-C2H2) domain data
#"!pwd" is an IPython shell capture: it returns the command output as a list of lines, so curr_dir[0] is the working directory
curr_dir = !pwd
my_path = curr_dir[0]+"/hmm_domains/"
filename = "zf-C2H2.csv"
#The file is read as tab-separated (despite the .csv extension) and its first column is used as the index
zinc_finger = pd.read_csv(my_path+filename, sep='\t', index_col=0)
#Sort the zinc finger data by chrom_num, then ensembl_id, then TargetStart, and renumber the rows from 0
sorted_zinc = zinc_finger.sort_values(by=["chrom_num", "ensembl_id", "TargetStart"])
sorted_zinc = sorted_zinc.reset_index(drop=True)

In [3]:
#Get the canonic protein id for Zinc domain
#canonic_protein is later indexed by ensembl gene id and compared against the "prot_id" column
#NOTE(review): pickle.load can execute arbitrary code - only load pickle files that were produced by a trusted source
with open(my_path+'zinc_canonic_prot.pik', 'rb') as handle:
    canonic_protein = pickle.load(handle)

In [4]:
#The chromosomes that are processed: the autosomes "1".."22" followed by the sex chromosomes "X" and "Y"
chromosome_names = [str(autosome) for autosome in range(1, 23)] + ["X", "Y"]

In [5]:
#Counting the domain rows whose chromosome name isn't listed in chromosome_names (these rows are left out of the analysis)
chrom_names_list = sorted_zinc["chrom_num"].tolist()
starnge_chrom_sum = sum(1 for name in chrom_names_list if name not in chromosome_names)
print("Leaving outside "+str(starnge_chrom_sum)+" out of "+str(len(chrom_names_list)))

Leaving outside 74 out of 10492


In [6]:
#A function that gets the chromosome raw data from the hmmer results and returns a table of the exons
def create_exon_pos_table(chrom_raw):
    """Parse a raw chromosome location string into a table of exons.

    The string may wrap the exon list in "complement(...)" and/or "join(...)";
    without "join(" the single exon is whatever follows the second ":".
    Exons are comma separated and written as "start..end".

    Parameters:
        chrom_raw: the raw location string, e.g.
            "GRCh37:1:complement(join(100..200,300..350))".

    Returns:
        A DataFrame with one row per exon, in the order the exons are listed:
        "start_pos" and "end_pos" (kept as the original strings),
        "length" (end - start + 1) and "first_bp_count" (1-based running
        offset of the exon's first base: 1 plus the lengths of the previous rows).
    """
    exons_raw = chrom_raw

    #Removing the complement brackets if they exist (the trailing ")" is dropped by the -1)
    complement_idx = exons_raw.find("complement(")
    if (complement_idx >= 0):
        exons_raw = exons_raw[complement_idx+len("complement("):-1]

    #Removing the join brackets if they exist
    join_idx = exons_raw.find("join(")
    if (join_idx >= 0):
        exons_raw = exons_raw[join_idx+len("join("):-1]

    #In case there's only one exon, take everything after the second ":"
    #(when the prefix was already stripped with "complement(" no ":" is found and the text is kept whole)
    else:
        exons_raw = exons_raw[exons_raw.find(":", chrom_raw.find(":")+1)+1:]

    exon_pos = [ex.split("..") for ex in exons_raw.split(",")]
    exon_df = pd.DataFrame(exon_pos)
    exon_df.columns = ["start_pos", "end_pos"]

    #Reading the columns by label: integer keys on a label-indexed row are deprecated in recent pandas
    exon_len = []
    first_bp_list = []
    first_bp_count = 1
    for start_text, end_text in zip(exon_df["start_pos"], exon_df["end_pos"]):
        length = int(end_text) - int(start_text) + 1
        exon_len.append(length)
        first_bp_list.append(first_bp_count)
        first_bp_count += length

    exon_df["length"] = exon_len
    exon_df["first_bp_count"] = first_bp_list
    return(exon_df)

In [7]:
#A function that gets a chromosome position and a table of exons, and returns the protein position or -1 if it's not within any exon
def find_protein_pos(chrom_pos, exon_df, chrom_raw):
    """Translate a chromosome position into a 1-based protein position.

    Parameters:
        chrom_pos: the chromosome position to translate.
        exon_df: exon table whose columns are, by position, 0 = exon start,
            1 = exon end and 3 = running offset of the exon's first base
            (the layout produced by create_exon_pos_table).
        chrom_raw: the raw chromosome location string; when it contains
            "complement" the offset is measured from the exon end.

    Returns:
        The protein position (transcript position divided by 3, rounded up),
        or -1 when the position isn't inside any exon of the table.
    """
    #str.find returns -1 (truthy) when the text is absent and 0 (falsy) for a match at index 0,
    #so the strand must be decided with an explicit membership test
    is_complement = "complement" in chrom_raw

    #itertuples keeps the positional column access without relying on integer keys of a label-indexed row
    for exon in exon_df.itertuples(index=False):
        start_pos = int(exon[0])
        end_pos = int(exon[1])
        first_bp_count = int(exon[3])
        if not (chrom_pos >= start_pos and chrom_pos <= end_pos):
            continue

        #Reverse complement strand: the protein is translated from the end position towards the start position of the exon
        #NOTE(review): first_bp_count is accumulated in the listed exon order - confirm that order matches the transcription order for complement genes
        if is_complement:
            len_from_exon_start = end_pos - chrom_pos
        #Forward strand
        else:
            len_from_exon_start = chrom_pos - start_pos

        #Calculate the position on the mRNA transcript
        transcript_pos = len_from_exon_start + first_bp_count

        #Calculate the position on the protein sequence (3 bases per amino acid, rounded up)
        return int(math.ceil(float(transcript_pos)/3))

    #If the position wasn't in the regions of any exon
    return -1

In [8]:
#Boolean function - tells whether a given text can be parsed by float()
def is_number(s):
    """Return True when float(s) succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

In [9]:
#A function that returns the hmm state of that protein position
# returns -1 for positions outside of domains regions, -2 for matching insertion
#TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
def protein_pos_to_hmm_state(protein_pos, domain_gene_table):
    """Look up the HMM state that a protein position is aligned to.

    Parameters:
        protein_pos: the position on the protein sequence.
        domain_gene_table: table of domain matches. The columns are read by
            position: 6 = match start, 7 = match end (both inclusive) and
            10 = comma separated states, one entry per position of the match
            (presumably TargetStart / TargetEnd / HMM positions - verify
            against the column order of the domain file).

    Returns:
        The integer HMM state of the first row (in table order) whose range
        contains the position, -2 when the entry at that position isn't
        numeric (insertion), or -1 when no row contains the position.
    """
    #itertuples keeps the positional column access without relying on integer keys of a label-indexed row
    for row in domain_gene_table.itertuples(index=False):
        target_start = row[6]
        target_end = row[7]
        if not (protein_pos >= target_start and protein_pos <= target_end):
            continue

        #Only the matching row needs its state list to be split
        hmm_pos = (row[10]).split(",")
        #int() keeps the list index valid even if the range columns were parsed as floats
        index_inside_match = int(protein_pos - target_start)
        hmm_state_text = hmm_pos[index_inside_match]
        if is_number(hmm_state_text):
            return int(hmm_state_text)

        #the position matches an insertion
        return -2

    #The protein position isn't in any domain region
    return -1

In [10]:
chrom_path = curr_dir[0]+"/parsed/"
chrom_filename = "parsed_chrom"
#Maps each HMM state to the list of per-position information dictionaries that matched it
states_dict = defaultdict(list)


def _exac_alteration_info(chrom_alter, chrom_pos, chrom):
    """Build the information dictionary of a position that has ExAC data.

    Parameters:
        chrom_alter: the ExAC rows of this position, indexed from 0; only the
            first row is used.
        chrom_pos: the chromosome position.
        chrom: the chromosome name.

    Returns:
        A dictionary with "an", "an_adj", "ref", "alt", "qual", "pos", "chrom"
        and the sums over all the listed alleles: "ac", "af", "ac_adj", "af_adj".
    """
    info_dict = {}
    info_dict["an"] = int(chrom_alter["AN"].iloc[0])
    info_dict["an_adj"] = int(chrom_alter["AN_Adj"].iloc[0])
    info_dict["ref"] = chrom_alter["ref"].iloc[0]
    info_dict["alt"] = chrom_alter["alt"].iloc[0]
    info_dict["qual"] = chrom_alter["qual"].iloc[0]
    info_dict["pos"] = int(chrom_pos)
    info_dict["chrom"] = chrom

    #In case there is more than one alteration in this position the fields are comma separated.
    #str() protects against a single-allele value that pandas parsed as a number.
    ac_list = str(chrom_alter["AC"].iloc[0]).split(",")
    af_list = str(chrom_alter["AF"].iloc[0]).split(",")
    ac_adj_list = str(chrom_alter["AC_Adj"].iloc[0]).split(",")
    info_dict["ac"] = 0
    info_dict["af"] = 0
    info_dict["ac_adj"] = 0
    info_dict["af_adj"] = 0
    for j in range(len(ac_list)):
        allele_ac_adj = int(ac_adj_list[j])
        info_dict["ac"] += int(ac_list[j])
        info_dict["af"] += float(af_list[j])
        info_dict["ac_adj"] += allele_ac_adj
        #The adjusted AF of each allele comes from its own count (not from the running sum,
        #which would count the earlier alleles more than once); AN_Adj of 0 contributes nothing
        if (info_dict["an_adj"] > 0):
            af_adj = float(allele_ac_adj)/float(info_dict["an_adj"])
            af_adj_format = float('{:.3e}'.format(float(af_adj)))
            info_dict["af_adj"] += af_adj_format
    return info_dict


for chrom in chromosome_names:

    #Filtering the domain data relevant to this chromosome
    domain_chrom_data = sorted_zinc[sorted_zinc["chrom_num"] == chrom]

    #Loading the ExAC parsed data of this chromosome
    chrom_csv = pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0)

    #Getting a list of all the relevant ensembl gene ids for this chromosome
    domain_ens_genes = (domain_chrom_data["ensembl_id"]).unique()

    #For each ensembl gene in the domain data - finding all the ExAC alterations
    for ens_gene in domain_ens_genes:

        #Filtering the domain data for this gene according to the canonical protein id
        canonic_prot = canonic_protein[ens_gene]
        domain_gene_table = domain_chrom_data[domain_chrom_data["prot_id"] == canonic_prot]
        #Making sure that if two HMM-matches overlap, the higher bit score will come first in the table
        domain_gene_table = domain_gene_table.sort_values(by="BitScore", ascending=False)

        #Creating a table of the exons for this gene, according to the canonical protein
        chrom_raw_values = domain_gene_table["chromosome_id"].unique()
        if (len(chrom_raw_values) == 0):
            #No domain rows for the canonical protein: nothing to map for this gene
            print("Error: "+ens_gene+": no domain data for the canonical protein")
            continue
        if (len(chrom_raw_values) > 1):
            print("Error: "+ens_gene+": more than one chromosome raw data") #sanity check
        chrom_raw_data = chrom_raw_values[0] #there should be only one element here
        exon_table = create_exon_pos_table(chrom_raw_data)

        #Filtering the chromosome data to the exons region.
        #The positions are stored as text, so they're converted before comparing (as text, "99" > "100")
        exons_start_pos = min(int(pos) for pos in exon_table["start_pos"])
        exons_end_pos = max(int(pos) for pos in exon_table["end_pos"])
        #A single combined mask avoids filtering an already filtered table with a full-length mask
        in_exons_region = (chrom_csv["pos"] >= exons_start_pos) & (chrom_csv["pos"] <= exons_end_pos)
        chrom_gene_table = chrom_csv[in_exons_region]
        chrom_gene_table = chrom_gene_table.reset_index(drop=True)

        #Iterating over the exons of this gene
        for i, row in exon_table.iterrows():
            ex_begin = int(row["start_pos"])
            ex_end = int(row["end_pos"])

            #Iterating over the exon positions
            for chrom_pos in range(ex_begin, ex_end+1):

                #calculate the protein position
                protein_pos = find_protein_pos(chrom_pos, exon_table, chrom_raw_data)

                if (protein_pos < 0):
                    print("sanity check: exon position not inside the protein? "+str(chrom_pos)+" "+ens_gene)
                    continue

                #Trying to match HMM-state (overlapping matches resolve to the higher bit score, see the sort above)
                hmm_state = protein_pos_to_hmm_state(protein_pos, domain_gene_table)

                #Only positions that match an HMM-state are saved (-1: outside the domains, -2: insertion)
                if (hmm_state <= 0):
                    continue

                #Checking if there's an ExAC alteration at that position
                chrom_alter = chrom_gene_table[chrom_gene_table["pos"] == chrom_pos]
                chrom_alter = chrom_alter.reset_index(drop=True)

                #In case no alteration is found, we assume AF = 0
                if (chrom_alter.shape[0] == 0):
                    info_dict = {}
                    info_dict["af"] = 0
                    info_dict["af_adj"] = 0
                    info_dict["pos"] = int(chrom_pos)
                    info_dict["chrom"] = chrom

                #In case there is ExAC alteration, we save the relevant AF and other parameters
                else:
                    info_dict = _exac_alteration_info(chrom_alter, chrom_pos, chrom)

                #Adding the dictionary to the HMM-state list
                states_dict[hmm_state].append(info_dict)

    print("Finished chromosome "+chrom)

protein pos 636 matched insertion
protein pos 636 matched insertion
protein pos 636 matched insertion
protein pos 1805 matched insertion
protein pos 1805 matched insertion
protein pos 1805 matched insertion
protein pos 1804 matched insertion
protein pos 1804 matched insertion
protein pos 1804 matched insertion
protein pos 359 matched insertion
protein pos 359 matched insertion
protein pos 359 matched insertion
protein pos 1001 matched insertion
protein pos 1001 matched insertion
protein pos 1001 matched insertion
protein pos 316 matched insertion
protein pos 316 matched insertion
protein pos 316 matched insertion
protein pos 315 matched insertion
protein pos 315 matched insertion
protein pos 315 matched insertion
protein pos 277 matched insertion
protein pos 277 matched insertion
protein pos 277 matched insertion
protein pos 357 matched insertion
protein pos 357 matched insertion
protein pos 357 matched insertion
protein pos 391 matched insertion
protein pos 391 matched insertion
prote

  interactivity=interactivity, compiler=compiler, result=result)



protein pos 148 matched insertion
protein pos 148 matched insertion
protein pos 148 matched insertion
protein pos 147 matched insertion
protein pos 147 matched insertion
protein pos 147 matched insertion
protein pos 178 matched insertion
protein pos 178 matched insertion
protein pos 178 matched insertion
protein pos 177 matched insertion
protein pos 177 matched insertion
protein pos 177 matched insertion
protein pos 401 matched insertion
protein pos 401 matched insertion
protein pos 401 matched insertion
protein pos 400 matched insertion
protein pos 400 matched insertion
protein pos 400 matched insertion
protein pos 431 matched insertion
protein pos 431 matched insertion
protein pos 431 matched insertion
protein pos 430 matched insertion
protein pos 430 matched insertion
protein pos 430 matched insertion
protein pos 200 matched insertion
protein pos 200 matched insertion
protein pos 200 matched insertion
protein pos 199 matched insertion
protein pos 199 matched insertion
protein pos 1

In [14]:
#Saving the HMM-states dictionary (states_dict) as a pickle file in the domains folder, using the highest pickle protocol available
with open(my_path+'zinc_hmm_states_dict.pik', 'wb') as handle:
    pickle.dump(states_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)