# Parse the HMM results, filter by domain

Read and parse the HMMER results obtained from Shilpa. Then filter several domains and save them separately.

In [1]:
#Import packages
import pandas as pd
import numpy as np
import unicodedata
from IPython.core.display import HTML
from get_domain_func import process_hmmer_results
from collections import defaultdict
import pickle
import os.path
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [3]:
#Constants
#Working directory, kept as a one-element list so that "curr_dir[0]" keeps working in every cell.
#os.getcwd() replaces the IPython-only "!pwd" shell escape (also valid outside IPython and on non-POSIX systems).
curr_dir = [os.getcwd()]
pfam_version = "32"
input_path = curr_dir[0]+"/from_shilpa/"
out_path = curr_dir[0]+"/allhmm_parsed/"
filename = "allhmmresbyprot-v"+pfam_version+".tsv"
#Amino-acid letter matching each index of the HMM emission probability lists
pfam_aa_order = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Reading the HMMER results
#The parsed table is cached on disk: reuse it when present, otherwise build it from the raw file and save it
parsed_file = out_path+"allhmm_parsed-v"+pfam_version+".csv"
if os.path.exists(parsed_file):
    allhmm = pd.read_csv(parsed_file, sep='\t', index_col=0)
else:
    allhmm = pd.read_csv(input_path+filename, sep='\t', skiprows=[0,1], header=0)

    # A little more processing to the data to look better in the data-frame
    allhmm = process_hmmer_results(allhmm)

    #Filter pseudo-genes
    allhmm = allhmm[allhmm["gene_biotype"] != "polymorphic_pseudogene"]

    #Filter non-coding transcripts
    #A single mask built on the current frame; chaining masks computed on the unfiltered frame
    #forces pandas to reindex every boolean key (and to warn about it)
    non_coding_biotypes = ["nonsense_mediated_decay", "non_stop_decay", "polymorphic_pseudogene"]
    allhmm = allhmm[~allhmm["transcript_biotype"].isin(non_coding_biotypes)]

    #Saving the processed data-frame
    allhmm.to_csv(parsed_file, sep='\t')

#Directory holding the parsed Pfam tables for this Pfam version
pfam_dir = curr_dir[0]+"/../2.parse_Pfam/v"+pfam_version+"/"

#Reading the table of Pfam domains gathering threshold
domains_GA = pd.read_csv(pfam_dir+"domains_GA.csv", sep='\t', index_col=0)

#Reading the table of Pfam domains length
domains_len = pd.read_csv(pfam_dir+"domains_len.csv", sep='\t', index_col=0)

#Reading the dictionary of HMM probabilities
#NOTE: pickle.load can execute arbitrary code - only load files produced by a trusted pipeline
with open(pfam_dir+"domains_hmm_prob_dict.pik", 'rb') as handle:
    domains_hmm_prob_dict = pickle.load(handle)



In [4]:
def domain_conserved_states_filter(domain_data, domain_hmm_prob, con_threshold, aa_order=None):
    """
    Filter the given domain data to the domain instances that contain the major allele of the conserved states.
    Conserved states are determined as having Pfam hmm emission prob. above the "con_threshold" given.

    Parameters
    ----------
    domain_data : pandas.DataFrame
        One row per domain instance. Needs a "Target_Seq" column (aligned residues, indexable by
        position) and an "HMM_Pos" column (comma-separated HMM states; the i-th entry is the state
        aligned to the i-th residue of "Target_Seq").
    domain_hmm_prob : dict
        Maps each HMM state to its sequence of emission probabilities, indexed like "aa_order".
    con_threshold : float
        A state is conserved when one of its emission probabilities is >= this value.
    aa_order : sequence of str, optional
        Amino-acid letter for each probability index. Defaults to the module-level "pfam_aa_order".

    Returns
    -------
    pandas.DataFrame
        "domain_data" itself (not a copy) when no state is conserved. Otherwise a new frame, with the
        index reset to 0..k-1, holding only the rows whose residue at every conserved state equals
        that state's major allele. Rows whose alignment lacks a conserved state are dropped.
        Column dtypes of "domain_data" are preserved.
    """
    if aa_order is None:
        aa_order = pfam_aa_order

    #Find the conserved states and their major allele
    #(if several residues of one state pass the threshold, the last one in "aa_order" wins)
    con_states_dict = {}
    for state, prob_list in domain_hmm_prob.items():
        for i, p in enumerate(prob_list):
            if p >= con_threshold:
                con_states_dict[state] = aa_order[i]

    #Nothing is conserved: there is nothing to filter on
    if not con_states_dict:
        return domain_data

    #"HMM_Pos" holds the states as text, so compare against their string form (computed once)
    required = [(str(state), major_allele) for state, major_allele in con_states_dict.items()]

    #Collect the positions of the rows to keep and slice once,
    #instead of growing a new data frame one row at a time
    keep_positions = [
        pos
        for pos, (target_seq, hmm_pos) in enumerate(zip(domain_data["Target_Seq"], domain_data["HMM_Pos"]))
        if _has_conserved_major_alleles(target_seq, hmm_pos, required)
    ]
    return domain_data.iloc[keep_positions].reset_index(drop=True)


def _has_conserved_major_alleles(target_seq, hmm_pos, required):
    """Return True when the instance carries the required residue at every conserved HMM state."""
    aligned_states = hmm_pos.split(",")
    for state_str, major_allele in required:
        try:
            state_idx = aligned_states.index(state_str)
        #The conserved state is not even in the alignment
        except ValueError:
            return False
        #Compare the instance aa with the major allele
        if target_seq[state_idx] != major_allele:
            return False
    return True

In [5]:
#Every distinct domain name in the parsed HMMER table, in sorted order
all_domains = sorted(allhmm["domain_name"].unique().tolist())
con_threshold = 0.99 #Conservation threshold (after witnessing a small bump in the prob. distribution after 0.99)
saved_domains = []    #Domains with at least one instance left after all the filters (written to file)
excluded_domains = [] #Domains with no instance left after all the filters

#Filtering to the domains in the input list
for dom_sym in all_domains:
    #Filter the table to the domain
    domain_data = allhmm[allhmm["domain_name"] == dom_sym].reset_index(drop=True)

    #Get the domain gathering threshold
    #(.item() requires exactly one matching row; float() on a Series is deprecated in recent pandas)
    domain_GA = float(domains_GA.loc[domains_GA["name"] == dom_sym, "GA"].item())

    #Filter according to the gathering thresold
    domain_ga_filtered = domain_data[domain_data["BitScore"] >= domain_GA].reset_index(drop=True)

    #Get the domain length
    domain_len = int(domains_len.loc[domains_len["name"] == dom_sym, "length"].item())

    #Filter the domains to have both start and end positions
    #The columns may hold either strings or ints. Comparing an int column to a string does not raise
    #in current pandas (it is simply all False), so a try/except fallback cannot detect the dtype.
    #Coercing to numbers handles both cases; unparsable values become NaN and never match.
    hmm_start = pd.to_numeric(domain_ga_filtered["HMMStart"], errors="coerce")
    hmm_end = pd.to_numeric(domain_ga_filtered["HMMEnd"], errors="coerce")
    covers_whole_hmm = (hmm_start == 1) & (hmm_end == domain_len)
    domain_ga_len_filtered = domain_ga_filtered[covers_whole_hmm].reset_index(drop=True)

    #Get the domain Pfam HMM probabilities
    domain_hmm_prob = domains_hmm_prob_dict[dom_sym]

    #Filter the domains to have the conserved states in the HMM profile, according to the specified threshold
    domain_ga_len_con_filtered = domain_conserved_states_filter(domain_ga_len_filtered, domain_hmm_prob, con_threshold)

    #Saving the domain data to file (only if there is any)
    if (domain_ga_len_con_filtered.shape[0] > 0):
        domain_ga_len_con_filtered.to_csv(curr_dir[0]+"/hmm_domains/pfam-v"+pfam_version+"/"+dom_sym+".csv", sep='\t')
        saved_domains.append(dom_sym)
    else:
        excluded_domains.append(dom_sym)

### Debugging: how many domains does each filter remove individually?

In [6]:
#Number of distinct domain names found in the parsed HMMER table
#Pfam v31: 12,040
#Pfam v32: 13,066
len(all_domains)

13066

In [7]:
#Sanity check: the filtering loop puts every domain in exactly one of the two lists,
#so this sum should equal len(all_domains)
#Pfam v31: 12,040
#Pfam v32: 13,066
len(excluded_domains) + len(saved_domains)

13066

In [8]:
#Number of domains with no instance left after the GA, start/end and conserved-states filters
#Pfam v31: 6,011
#Pfam v32: 6,556
len(excluded_domains)

6556

In [69]:
#Number of domains with no instance left when only the conserved-states filter is applied
#(num_removed is computed by the loop in the next cell, In [68], which was executed before this one)
#Pfam v31: 19
num_removed

19

In [68]:
#Debugging: count how many domains keep (num_added) or lose (num_removed) all of their instances
#when the conserved-states filter is applied on its own, without the GA and start/end filters
num_added = 0
num_removed = 0
con_threshold = 0.99 #Conservation threshold

for dom_sym in all_domains:
    #All the instances of this domain, re-indexed from 0
    domain_data = allhmm[allhmm["domain_name"] == dom_sym].reset_index(drop=True)
    #No GA / start-end filtering here: the unfiltered table goes straight to the conserved-states filter
    domain_ga_len_filtered = domain_data

    #Pfam HMM emission probabilities of this domain
    domain_hmm_prob = domains_hmm_prob_dict[dom_sym]

    #Keep only the instances carrying the major allele at every conserved state
    domain_ga_len_con_filtered = domain_conserved_states_filter(domain_ga_len_filtered, domain_hmm_prob, con_threshold)

    #A domain counts as removed when the filter leaves it with no instance
    if len(domain_ga_len_con_filtered) == 0:
        num_removed += 1
    else:
        num_added += 1

In [None]:
#Passed gathering threshold = 6274
#Didn't pass GA = 5715

#no_start_end = 4340
#yes start end = 7649

#no_conserved = 19
#yes_conserved = 11970