In [609]:
#Import packages
import json
import math
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from IPython.core.display import HTML
#Inject a CSS rule that sets the notebook's .container element to 100% width
HTML("<style>.container { width:100% !important; }</style>")

In [20]:
#Reading the Zinc-finger domain data
#The working directory is taken with os.getcwd() instead of the shell-only "!pwd";
#curr_dir stays a one-element list because the path is read elsewhere as curr_dir[0]
curr_dir = [os.getcwd()]
my_path = curr_dir[0]+"/hmm_domains/"
filename = "zf-C2H2.csv"
zinc_finger = pd.read_csv(my_path+filename, sep='\t', index_col=0)
#Sort the zinc finger data by chromosome, gene id and target start, then renumber the rows from 0
sorted_zinc = zinc_finger.sort_values(by=["chrom_num", "ensembl_id", "TargetStart"])
sorted_zinc = sorted_zinc.reset_index(drop=True)

In [682]:
#Get the canonic transcript for Zinc domain
#The loaded mapping is indexed by ensembl gene id; entry [0] is used as the transcript id and [1] as the protein id
#open() inside a with-block replaces the Python-2-only file() and guarantees the handle is closed
with open(my_path+'zinc_canonic_trans_prot.json') as canonic_file:
    canonic_trans_prot = json.load(canonic_file)

In [535]:
#Chromosome labels in processing order: "1" to "22", followed by "X" and "Y"
chromosome_names = list(map(str, range(1, 23))) + ["X", "Y"]

In [661]:
#A function that gets the raw chromosome data from the hmmer results and returns a table of the exons
def create_exon_pos_table(chrom_raw):
    """Parse a raw chromosome location string into a table of exons.

    A "complement(...)" wrapper and a "join(...)" wrapper are stripped when
    present. When there is no "join(" wrapper the location holds a single
    exon, and everything after the second ":" is taken as that exon.

    Parameters
    ----------
    chrom_raw : str
        Raw location text; exons are written "start..end" and separated by
        commas, e.g. "prefix:1:join(100..199,300..349)" (illustrative).

    Returns
    -------
    pandas.DataFrame
        One row per exon, in the order listed in the text, with columns:
        "start_pos", "end_pos" - the coordinates as the text pieces split from the input;
        "length"               - int, end - start + 1;
        "first_bp_count"       - int, 1-based running offset of the exon
                                 within the concatenation of the listed exons.
    """
    exons_raw = chrom_raw

    #Removing the complement brackets if they exist
    complement_tag = "complement("
    complement_at = exons_raw.find(complement_tag)
    if complement_at >= 0:
        exons_raw = exons_raw[complement_at + len(complement_tag):-1]

    #Removing the join brackets if they exist
    join_tag = "join("
    join_at = exons_raw.find(join_tag)
    if join_at >= 0:
        exons_raw = exons_raw[join_at + len(join_tag):-1]

    #In case there's only one exon, take everything after the second ":"
    else:
        #Both colons are located in the same string that gets sliced, so the
        #offsets stay valid even after the complement wrapper was removed.
        #When fewer than two colons exist find() gives -1 and the whole text is kept.
        first_colon = exons_raw.find(":")
        second_colon = exons_raw.find(":", first_colon + 1)
        exons_raw = exons_raw[second_colon + 1:]

    exon_pos = [ex.split("..") for ex in exons_raw.split(",")]
    exon_df = pd.DataFrame(exon_pos)
    exon_df.columns = ["start_pos", "end_pos"]

    #Lengths are computed straight from the two columns (inclusive coordinates)
    exon_lengths = [int(end) - int(start) + 1
                    for start, end in zip(exon_df["start_pos"], exon_df["end_pos"])]
    exon_df["length"] = exon_lengths

    #Running 1-based offset of each exon's first base among the listed exons
    first_bp_list = []
    first_bp_count = 1
    for exon_length in exon_lengths:
        first_bp_list.append(first_bp_count)
        first_bp_count += exon_length
    exon_df["first_bp_count"] = first_bp_list
    return exon_df

In [279]:
#A function that gets a chromosome position and a table of exons, and returns the protein position, or -1 if the position is not within any exon
def find_protein_pos(chrom_pos, exon_df, chrom_raw):
    """Translate a chromosome position into a 1-based protein position.

    Parameters
    ----------
    chrom_pos : int
        Chromosome coordinate to look up.
    exon_df : pandas.DataFrame
        Exon table whose columns are read by position: column 0 is the exon
        start, column 1 the exon end (both inclusive) and column 3 the
        1-based transcript offset of the exon.
    chrom_raw : str
        Raw location text; the strand is treated as reverse-complement when
        it contains "complement".

    Returns
    -------
    int
        ceil(transcript position / 3) for the first exon containing
        chrom_pos, or -1 when no exon contains it.
    """
    #str.find returns -1 (which is truthy) when the text is absent and 0 (falsy)
    #when it is at the very start, so the strand is decided with a membership test
    is_complement = "complement" in chrom_raw

    #NOTE(review): the exon offset in column 3 is applied as-is on both strands;
    #confirm that reverse-strand records list their exons in transcript order.
    for exon in exon_df.itertuples(index=False):
        start_pos = int(exon[0])
        end_pos = int(exon[1])
        first_bp_count = int(exon[3])
        if not (chrom_pos >= start_pos and chrom_pos <= end_pos):
            continue

        #Reverse complement strand: the protein is translated from the end
        #position towards the start position of the exon
        if is_complement:
            len_from_exon_start = end_pos - chrom_pos
        #Forward strand
        else:
            len_from_exon_start = chrom_pos - start_pos

        #Position on the mRNA transcript, then on the protein sequence (3 bases per residue)
        transcript_pos = len_from_exon_start + first_bp_count
        return int(math.ceil(float(transcript_pos)/3))

    #If the position wasn't in the regions of any exon
    return -1

In [240]:
#Boolean function - determine if a given text can be converted to a number
def is_number(s):
    """Return True when float(s) succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

In [568]:
#A function that returns the hmm state of the given protein position
# returns -1 for positions outside of the domain regions, -2 for positions that match an insertion
#TODO: do we need also transcript id? do we want to consider more than 1 transcript per gene?
def protein_pos_to_hmm_state(protein_pos, domain_gene_table):
    """Look up the HMM state aligned to a protein position.

    The domain table is read by column position: column 6 is the start of a
    domain match on the protein, column 7 its end (inclusive) and column 10
    a comma-separated list with one entry per protein position of the match.
    (Presumably the TargetStart / TargetEnd / HMM_Pos columns - confirm
    against the table layout.)

    Returns the entry as an int when it is numeric, -2 when the entry is not
    numeric (the position matches an insertion), and -1 when no row's range
    contains protein_pos.
    """
    for domain_row in domain_gene_table.itertuples(index=False):
        match_start = domain_row[6]
        match_end = domain_row[7]
        state_labels = domain_row[10].split(",")

        #Skip rows whose range does not contain the position
        if not (protein_pos >= match_start and protein_pos <= match_end):
            continue

        #Offset of the position inside the match selects the state label
        state_label = state_labels[protein_pos - match_start]
        if not is_number(state_label):
            #the position match insertion
            return -2
        return int(state_label)

    #The protein position isn't in any domain region
    return -1

In [683]:
chrom_path = curr_dir[0]+"/parsed/"
chrom_filename = "parsed_chrom"
#Maps an HMM state number to the list of alteration records (dicts) that fall on that state
states_dict = defaultdict(list)

for chrom in chromosome_names:

    #Filtering the domain data relevant to this chromosome
    domain_chrom_data = sorted_zinc[sorted_zinc["chrom_num"] == chrom]

    #Loading the ExAC parsed data of this chromosome
    chrom_csv = pd.read_csv(chrom_path+chrom_filename+chrom+".csv", sep='\t', index_col=0)

    #Getting a list of all the relevant ensembl gene ids for this chromosome
    domain_ens_genes = (domain_chrom_data["ensembl_id"]).unique()

    #For each ensembl gene in the domain data - finding all the ExAC alterations
    for ens_gene in domain_ens_genes:

        #Filtering the domain data for this gene according to the canonical transcript id
        canonic_trans = canonic_trans_prot[ens_gene][0]
        canonic_prot = canonic_trans_prot[ens_gene][1]
        domain_gene_table = domain_chrom_data[domain_chrom_data["transcript_id"] == canonic_trans]
        domain_gene_table = domain_gene_table[domain_gene_table["prot_id"] == canonic_prot]

        #Raw chromosome data of the canonical transcript; there should be exactly one distinct value
        chrom_raw_values = domain_gene_table["chromosome_id"].unique()
        if (len(chrom_raw_values) == 0):
            #No domain rows on the canonical transcript: there is no exon layout to map with,
            #so the gene is skipped instead of failing on an empty array
            print(ens_gene+": no domain rows for the canonical transcript, skipping")
            continue
        if (len(chrom_raw_values) > 1):
            print(ens_gene+": more than one chromosome raw data") #sanity check
        chrom_raw_data = chrom_raw_values[0]

        #Creating a table of the exons for this gene, according to the canonical transcript
        exon_table = create_exon_pos_table(chrom_raw_data)

        #Filtering the chromosome data to this gene; the index is reset so rows can be addressed 0..n-1
        chrom_gene_table = chrom_csv[chrom_csv["gene"] == ens_gene]
        chrom_gene_table = chrom_gene_table.reset_index(drop=True)
        chrom_gene_size = chrom_gene_table.shape[0]

        #Iterating over the gene alterations in the ExAC data to see if they match to HMM-states
        for i in range(chrom_gene_size):
            #Getting the alteration chromosome position
            chrom_pos = chrom_gene_table["pos"][i]
            protein_pos = find_protein_pos(chrom_pos, exon_table, chrom_raw_data)

            #Positions outside every exon come back as -1
            if (protein_pos <= 0):
                continue

            #Trying to match HMM-state; -1 (outside domains) and -2 (insertion) are not recorded
            hmm_state = protein_pos_to_hmm_state(protein_pos, domain_gene_table)
            if (hmm_state <= 0):
                continue

            #Extracting the relevant information from the ExAC data table
            #The per-allele fields are comma-separated, one value per alternative allele
            ac_list = (chrom_gene_table["AC"][i]).split(",")
            an = chrom_gene_table["AN"][i]
            af_list = (chrom_gene_table["AF"][i]).split(",")
            ac_adj_list = (chrom_gene_table["AC_Adj"][i]).split(",")
            an_adj = chrom_gene_table["AN_Adj"][i]
            ref = chrom_gene_table["ref"][i]
            alt_list = (chrom_gene_table["alt"][i]).split(",")
            qual = chrom_gene_table["qual"][i]

            #In case there is more than one alteration in this position: adding them as different entries to the list
            for j in range(len(ac_list)):
                info_dict = {}
                info_dict["ac"] = int(ac_list[j])
                info_dict["an"] = int(an)
                info_dict["af"] = float(af_list[j])
                info_dict["ac_adj"] = int(ac_adj_list[j])
                info_dict["an_adj"] = int(an_adj)
                #NOTE(review): this division raises ZeroDivisionError when AN_Adj is 0 -
                #confirm such rows are filtered out of the parsed files
                af_adj = float(info_dict["ac_adj"])/float(info_dict["an_adj"])
                #Adjusted frequency is kept with 4 significant digits
                af_adj_format = float('{:.3e}'.format(float(af_adj)))
                info_dict["af_adj"] = af_adj_format
                info_dict["ref"] = ref
                info_dict["alt"] = alt_list[j]
                info_dict["qual"] = qual
                info_dict["pos"] = int(chrom_pos)
                states_dict[hmm_state].append(info_dict)

    print("Finished chromosome "+chrom)

Finished chromosome 1
Finished chromosome 2
Finished chromosome 3
Finished chromosome 4
Finished chromosome 5
Finished chromosome 6
Finished chromosome 7
Finished chromosome 8
Finished chromosome 9
Finished chromosome 10
Finished chromosome 11
Finished chromosome 12
Finished chromosome 13
Finished chromosome 14
Finished chromosome 15
Finished chromosome 16
Finished chromosome 17
Finished chromosome 18
Finished chromosome 19
Finished chromosome 20
Finished chromosome 21
Finished chromosome 22
Finished chromosome X
Finished chromosome Y


In [684]:
#Save the per-state alteration records; the with-block closes the file so the data is flushed to disk
#(open() replaces the Python-2-only file() builtin)
with open(my_path+'zinc_hmm_states_dict.json', 'w') as states_file:
    json.dump(states_dict, states_file)

In [309]:
#Exploratory check: among the rows whose HMM_Pos is not the plain 1..23 state string,
#show the HMM_Pos value of the row labelled 136
#NOTE(review): chrom1_zinc is not defined in any cell visible in this notebook - this cell
#fails on a fresh kernel unless that table is created elsewhere; confirm or remove
chrom1_zinc[chrom1_zinc["HMM_Pos"] != "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23"]["HMM_Pos"][136]

'1,2,3,4,a4-0,a4-1,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23'