# Parse the HMM results, filter by domain

Read and parse the HMMER results obtained from Shilpa. Then filter for several domains and save each domain's hits separately.

In [1]:
#Import packages
import pandas as pd
import numpy as np
import unicodedata
from IPython.core.display import HTML
from get_domain_func import process_hmmer_results
import pickle
# Widen the notebook container to the full browser width.
# The HTML object has to be the displayed result of the cell for the CSS to
# be injected into the page; a trailing semicolon suppresses that display,
# so the expression is deliberately left without one.
HTML("<style>.container { width:100% !important; }</style>")

In [2]:
#Constants
domains = ["zf-C2H2", "Homeobox", "WW", "PUF", "SH3_1"] #The domains to filter
# IPython shell escape: captures the output lines of `pwd`, so curr_dir[0]
# is the directory the notebook is running in.
curr_dir = !pwd
input_path = curr_dir[0]+"/from_shilpa/" #Directory holding the raw HMMER results
out_path = curr_dir[0]+"/allhmm_parsed/" #Directory where the processed tables are written
filename = "allhmmresbyprot-v29.tsv.gz"

# Reading the HMMER results (tab-separated; pandas infers the gzip
# compression from the ".gz" extension by default)
allhmm = pd.read_csv(input_path+filename, sep='\t')

# A little more processing to the data to look better in the data-frame
# NOTE(review): process_hmmer_results is imported from get_domain_func and is
# not visible here. The cells below read the "domain_name" and "BitScore"
# columns from its result - confirm the helper provides/keeps them.
allhmm = process_hmmer_results(allhmm)

#Saving the processed data-frame
# (tab-separated; with the default index=True the row index is written as
# the first, unnamed column)
allhmm.to_csv(out_path+"allhmm_parsed-v29.csv", sep='\t')

#Reading the table of Pfam domains gathering threshold
# The first column of the file is used as the index; the filtering cell
# looks rows up by the "name" column and reads the threshold from "GA".
domains_GA = pd.read_csv(curr_dir[0]+"/../2.parse_Pfam/domains_GA.csv", sep='\t', index_col=0)

In [3]:
#Filtering to the domains in the input list
# For every requested domain: keep its hits, drop the ones scoring below the
# domain's gathering threshold (GA), and write the survivors to their own file.
for dom_sym in domains:
    #Filter the table to the domain
    domain_data = allhmm[allhmm["domain_name"] == dom_sym].reset_index(drop=True)

    #Get the domain gathering threshold
    # Exactly one GA row is expected per domain. Check that explicitly and pull
    # the scalar out with .iloc[0]: calling float() on a Series is deprecated
    # in recent pandas and gives an unhelpful error when the match is empty
    # or ambiguous.
    ga_values = domains_GA.loc[domains_GA["name"] == dom_sym, "GA"]
    if len(ga_values) != 1:
        raise ValueError(
            "Expected exactly one GA entry for domain %r, found %d"
            % (dom_sym, len(ga_values))
        )
    domain_GA = float(ga_values.iloc[0])

    #Filter according to the gathering threshold
    domain_ga_filtered = domain_data[domain_data["BitScore"] >= domain_GA].reset_index(drop=True)

    #Saving the domain data to file
    domain_ga_filtered.to_csv(curr_dir[0]+"/hmm_domains/"+dom_sym+".csv", sep='\t')

### Debugging code to find missing data points between two hmmer results

In [None]:
# Load the previously parsed table and the v29 one, in that order, so the
# two HMMER results can be compared below.
allhmm, allhmm_v29 = (
    pd.read_csv(out_path+parsed_name, sep='\t')
    for parsed_name in ("allhmm_parsed.csv", "allhmm_parsed-v29.csv")
)

In [None]:
# Keep only the columns that identify a domain hit in the old results.
# Selecting the columns directly preserves each column's dtype and avoids
# building the frame row-wise from Series and transposing it.
old_table = allhmm[["#TargetID", "pfam_id", "TargetStart", "TargetEnd"]]

In [None]:
# Keep only the columns that identify a domain hit in the v29 results.
# Selecting the columns directly preserves each column's dtype and avoids
# building the frame row-wise from Series and transposing it.
new_table = allhmm_v29[["#TargetID", "pfam_id", "TargetStart", "TargetEnd"]]

In [None]:
# Turn every row of the old table into a plain tuple so the two results can
# be compared with set operations. itertuples(index=False, name=None) yields
# ordinary tuples of the row values without creating one Series per row,
# which is what makes iterrows slow on large tables.
old_tuples = set(old_table.itertuples(index=False, name=None))

In [None]:
# Turn every row of the new table into a plain tuple so the two results can
# be compared with set operations. itertuples(index=False, name=None) yields
# ordinary tuples of the row values without creating one Series per row,
# which is what makes iterrows slow on large tables.
new_tuples = set(new_table.itertuples(index=False, name=None))

In [None]:
# Persist the set of old-result tuples next to the parsed tables
old_pickle_path = out_path+'old_tuples.pik'
with open(old_pickle_path, 'wb') as out_file:
    pickle.dump(old_tuples, out_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the set of new-result tuples next to the parsed tables
new_pickle_path = out_path+'new_tuples.pik'
with open(new_pickle_path, 'wb') as out_file:
    pickle.dump(new_tuples, out_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Hits that appear in the old results but not in the new ones
a = old_tuples - new_tuples

In [None]:
# Number of hits present in the old results but missing from the new ones
len(a)

In [None]:
# Persist the hits that disappeared between the old and the new results
missing_pickle_path = out_path+'missing_tuples.pik'
with open(missing_pickle_path, 'wb') as out_file:
    pickle.dump(a, out_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Hits that appear in the new results but not in the old ones
b = new_tuples - old_tuples

In [None]:
# Number of hits present in the new results but absent from the old ones
len(b)

In [None]:
# Display the rows of the currently loaded table whose domain is SH3_1
allhmm.loc[allhmm["domain_name"] == "SH3_1"]