# Parse the HMM results, filter by domain

Read and parse the HMMER results obtained from Shilpa. Then filter the instances of each domain and save every domain separately.

In [1]:
#Import packages
import pandas as pd
import numpy as np
import unicodedata
from IPython.core.display import HTML
from get_domain_func import process_hmmer_results
from collections import defaultdict
import pickle
import os.path
from IPython.core.display import HTML
# Emit a CSS rule that makes elements of class "container" use 100% of the available width
HTML("<style>.container { width:100% !important; }</style>")

In [3]:
#Constants
# Working directory of the notebook. It is kept as a one-element list so that every
# existing `curr_dir[0]` lookup keeps working; os.getcwd() replaces the IPython-only
# `!pwd` shell escape, which is not valid Python and is unavailable on Windows.
curr_dir = [os.getcwd()]
input_path = curr_dir[0]+"/from_shilpa/"
out_path = curr_dir[0]+"/allhmm_parsed/"
filename = "allhmmresbyprot-v30.tsv"
# Amino-acid letters; position i in this list is matched with position i of each
# per-state emission-probability list when looking for the major allele.
pfam_aa_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Reading the HMMER results
# The parsed table is cached on disk: reuse it when present, otherwise build it from the raw file.
parsed_path = out_path+"allhmm_parsed-v30.csv"
if os.path.exists(parsed_path):
    allhmm = pd.read_csv(parsed_path, sep='\t', index_col=0)
else:
    # The first two lines of the raw file are skipped; the next line is used as the header
    allhmm = pd.read_csv(input_path+filename, sep='\t', skiprows=[0,1], header=0)

    # A little more processing to the data to look better in the data-frame
    allhmm = process_hmmer_results(allhmm)

    #Saving the processed data-frame
    # Create the cache directory first so the processing work is not lost to a missing folder
    if not os.path.isdir(out_path):
        os.makedirs(out_path)
    allhmm.to_csv(parsed_path, sep='\t')

#Reading the table of Pfam domains gathering threshold
domains_GA = pd.read_csv(curr_dir[0]+"/../2.parse_Pfam/v30/domains_GA.csv", sep='\t', index_col=0)

#Reading the dictionary of HMM probabilities
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles that were produced by this project's own pipeline.
with open(curr_dir[0]+'/../2.parse_Pfam/v30/domains_hmm_prob_dict.pik', 'rb') as handle:
    domains_hmm_prob_dict = pickle.load(handle)

In [11]:
def _conserved_major_alleles(domain_hmm_prob, con_threshold, aa_order):
    """Map each conserved HMM state to the letter of its most probable emission."""
    con_states = {}
    for state, prob_list in domain_hmm_prob.items():
        if len(prob_list) == 0:
            continue
        # The major allele is the single most probable emission. A state is conserved
        # exactly when that probability exceeds the threshold, which selects the same
        # set of states as testing every emission against the threshold.
        best_i = max(range(len(prob_list)), key=lambda i: prob_list[i])
        if prob_list[best_i] > con_threshold:
            con_states[state] = aa_order[best_i]
    return con_states


def _has_conserved_alleles(target_seq, hmm_pos, con_states):
    """Tell whether one aligned instance carries the major allele at every conserved state."""
    positions = hmm_pos.split(",")
    for state, major_allele in con_states.items():
        try:
            state_idx = positions.index(str(state))
        except ValueError:
            #The conserved state is not even in the alignment
            return False
        # A sequence shorter than its position list cannot match (and must not raise)
        if state_idx >= len(target_seq) or target_seq[state_idx] != major_allele:
            return False
    return True


def domain_conserved_states_filter(domain_data, domain_hmm_prob, con_threshold, aa_order=None):
    """
    Filter the given domain data to the domain instances that contain the major allele of the conserved states.
    Conserved states are determined as having Pfam hmm emission prob. above the "con_threshold" given.

    Parameters:
        domain_data: DataFrame of domain instances. Must have a "Target_Seq" column (string) and an
            "HMM_Pos" column (comma-separated string); the k-th entry of "HMM_Pos" is the HMM state
            aligned to the k-th character of "Target_Seq".
        domain_hmm_prob: dict mapping an HMM state to its list of emission probabilities. States are
            compared to the "HMM_Pos" entries through str(state).
        con_threshold: a state is conserved when its highest emission probability is above this value.
        aa_order: letters matched by index to every probability list. Defaults to the module-level
            "pfam_aa_order" when omitted.

    Returns:
        When at least one state is conserved: a new DataFrame with the same columns and dtypes holding
        only the matching instances, re-indexed from 0. Otherwise "domain_data" itself, unchanged.
    """
    if aa_order is None:
        aa_order = pfam_aa_order

    #Find the conserved states and their major allele:
    con_states_dict = _conserved_major_alleles(domain_hmm_prob, con_threshold, aa_order)

    #Nothing is conserved, so there is nothing to filter on
    if not con_states_dict:
        return domain_data

    #Collect the row positions of the instances that keep every conserved state.
    #Selecting them in one step avoids growing a frame row by row and keeps the column dtypes.
    kept_rows = [
        row_i
        for row_i, (target_seq, hmm_pos) in enumerate(zip(domain_data["Target_Seq"], domain_data["HMM_Pos"]))
        if _has_conserved_alleles(target_seq, hmm_pos, con_states_dict)
    ]
    return domain_data.iloc[kept_rows].reset_index(drop=True)

In [34]:
# Sorted array of every distinct domain name found in the parsed HMMER table
all_domains = np.sort(allhmm["domain_name"].unique())
# Number of distinct domains
len(all_domains)

12246

In [35]:
%%time
con_threshold = 0.8 #Conservation threshold

#Filtering to the domains in the input list
#Each domain goes through three filters in turn (gathering threshold, full HMM span,
#conserved states) and is written to its own file when any instance survives.
for dom_sym in all_domains:
    #Filter the table to the domain
    domain_data = allhmm[allhmm["domain_name"] == dom_sym]
    domain_data = domain_data.reset_index(drop=True)

    #Get the domain gathering threshold
    #Take the first matching row explicitly; float() on a whole Series is deprecated
    domain_GA = float(domains_GA.loc[domains_GA["name"] == dom_sym, "GA"].iloc[0])

    #Filter according to the gathering thresold
    domain_ga_filtered = domain_data[domain_data["BitScore"] >= domain_GA]
    domain_ga_filtered = domain_ga_filtered.reset_index(drop=True)

    #Filter the domains to have both start and end positions
    #The reference span is the smallest start and the largest end seen over all instances
    #of the domain (taken before the gathering-threshold filter).
    domain_beg = int(domain_data["HMMStart"].astype(int).min())
    domain_end = int(domain_data["HMMEnd"].astype(int).max())
    #Compare as integers whatever the stored column type is. Comparing an int column with a
    #str value yields an all-False mask instead of raising, so a try/except fallback on the
    #column type would silently drop every instance.
    spans_whole_domain = (
        (domain_ga_filtered["HMMStart"].astype(int) == domain_beg)
        & (domain_ga_filtered["HMMEnd"].astype(int) == domain_end)
    )
    domain_ga_len_filtered = domain_ga_filtered[spans_whole_domain]
    domain_ga_len_filtered = domain_ga_len_filtered.reset_index(drop=True)

    #Get the domain Pfam HMM probabilities
    domain_hmm_prob = domains_hmm_prob_dict[dom_sym]

    #Filter the domains to have the conserved states in the HMM profile, according to the specified threshold
    domain_ga_len_con_filtered = domain_conserved_states_filter(domain_ga_len_filtered, domain_hmm_prob, con_threshold)

    #Saving the domain data to file
    #The parenthesised print form gives the same output on Python 2 and Python 3
    if (domain_ga_len_con_filtered.shape[0] > 0):
        domain_ga_len_con_filtered.to_csv(curr_dir[0]+"/hmm_domains/pfam-v30/"+dom_sym+".csv", sep='\t')
        print("Saved domain: "+dom_sym)
    else:
        print("Excluded all instances in domain: "+dom_sym)

Finished domain: 1-cysPrx_C
Finished domain: 120_Rick_ant
Finished domain: 14-3-3
Finished domain: 2-Hacid_dh
Finished domain: 2-Hacid_dh_C
Finished domain: 2-oxoacid_dh
Finished domain: 2-oxogl_dehyd_N
Finished domain: 23ISL
Finished domain: 23S_rRNA_IVP
Finished domain: 2CSK_N
Finished domain: 2Fe-2S_thioredx
Finished domain: 2H-phosphodiest
Finished domain: 2HCT
Finished domain: 2OG-FeII_Oxy
Finished domain: 2OG-FeII_Oxy_2
Finished domain: 2OG-FeII_Oxy_3
Finished domain: 2OG-FeII_Oxy_4
Finished domain: 2OG-Fe_Oxy_2
Finished domain: 2TM
Finished domain: 2_5_RNA_ligase2
Finished domain: 3-HAO
Finished domain: 3-PAP
Finished domain: 3-alpha
Finished domain: 3Beta_HSD
Finished domain: 3D
Finished domain: 3H
Finished domain: 3HBOH
Finished domain: 3HCDH
Finished domain: 3HCDH_N
Finished domain: 40S_S4_C
Finished domain: 40S_SA_C
Finished domain: 4F5
Finished domain: 4HBT
Finished domain: 4HBT_2
Finished domain: 4HBT_3
Finished domain: 4HB_MCP_1
Finished domain: 4_1_CTD
Finished domain: 5

In [38]:
# Number of instances left in the filtered table from the most recent loop iteration
len(domain_ga_len_con_filtered)

1