## Running hhalign from the HHsuite
Goal: create hhm files and run hhalign between them for all the relevant domains.

First, clone the HHsuite github for HHalign version 3.0.3 (14-07-2017) from: https://github.com/soedinglab/hh-suite according to their README

Then, install (make + sudo make install) into: /usr/local/hhsuite

After installation, create a "databases" directory inside the cloned GitHub directory, download "pfamA_31.0.tgz" from: http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/ and extract it there. This is the database of hhm (not hmm!) files for pfam-v31, which is the preferred format for using HHsuite tools.

In [8]:
import subprocess
import pandas as pd
import pickle
import os.path
from collections import defaultdict

In [13]:
curr_dir = !pwd
pfam_version = "31"

sim_pairs_df = pd.read_csv("pfam-v"+pfam_version+"/domains_pairs_for_hhalign_filtered10_noclan.csv", sep=',', index_col=0)
with open(curr_dir[0]+"/../../2.parse_Pfam/v"+pfam_version+"/domain_to_pfam_acc_dict.pik", 'rb') as handle:
    domain_to_pfam_acc_dict = pickle.load(handle)    
    
hhsuite_hhm_database_path = "../../../../HHsuite/databases/pfam-v"+pfam_version+"/"
hhsuite_hhm_database_filename = "pfam_hhm.ffdata"
hhsuite_hhm_database_idx = "pfam_hhm.ffindex"

In [5]:
def create_domain_hhm_file(domain_pfam_id, start_pos, length, database_file=None, out_dir=None):
    """Extract one domain's HHM record from the packed database into its own file.

    Copies `length` bytes, starting at byte offset `start_pos`, from the
    database file into "<out_dir>/<domain_pfam_id>.hhm". This is done with a
    single seek/read instead of `dd bs=1`, which copied one byte per call and
    whose failures were silently discarded.

    Args:
        domain_pfam_id: Pfam id, used as the base name of the output file.
        start_pos: byte offset of the record (int or numeric string).
        length: number of bytes to copy (int or numeric string).
        database_file: path of the .ffdata file. Defaults to
            hhsuite_hhm_database_path + hhsuite_hhm_database_filename, so the
            configured Pfam version is honoured instead of a hard-coded v31 path.
        out_dir: directory for the .hhm file. Defaults to
            "pfam-v<pfam_version>/hhm_files/"; created if it does not exist.

    Raises:
        ValueError: if the offset/length are negative or not numeric, or if the
            database holds fewer than `length` bytes at `start_pos`.
        IOError / OSError: if the database cannot be read or the output written.
    """
    if database_file is None:
        database_file = hhsuite_hhm_database_path + hhsuite_hhm_database_filename
    if out_dir is None:
        out_dir = "pfam-v"+pfam_version+"/hhm_files/"

    # The index values arrive as strings (split from a text line); accept ints too.
    offset = int(start_pos)
    n_bytes = int(length)
    if offset < 0 or n_bytes < 0:
        raise ValueError("start_pos and length must be non-negative, got %r and %r"
                         % (start_pos, length))

    with open(database_file, "rb") as database:
        database.seek(offset)
        record = database.read(n_bytes)
    if len(record) != n_bytes:
        # A short read means the index and data files do not belong together.
        raise ValueError("expected %d bytes at offset %d of %s, got %d"
                         % (n_bytes, offset, database_file, len(record)))

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    outfilename = os.path.join(out_dir, domain_pfam_id+".hhm")
    with open(outfilename, "wb") as out_handle:
        out_handle.write(record)

In [6]:
def _hhalign_field(line, key):
    """Return the whitespace-delimited token that follows `key` in `line`, or None if `key` is absent."""
    start = line.find(key)
    if start < 0:
        return None
    tokens = line[start+len(key):].split(None, 1)
    return tokens[0] if tokens else ""


def process_query(query, res_dict, direction):
    """Parse the statistics of an hhalign run and append them to `res_dict`.

    Scans the output lines for the "Probab=", "E-value=", "Score=",
    "Aligned_cols=" and "Identities=" fields and appends the value of each one
    (as a string) to res_dict["<name><direction>"], where <name> is prob, pval,
    score, aligned_cols or ident_perc.

    Exactly one entry per field is appended on every call: the first
    occurrence found in `query`, or None when the field is missing. This keeps
    all result lists aligned with the rows they are later attached to, even if
    a run produced no alignment or reported more than one.

    Args:
        query: iterable of hhalign output lines (strings).
        res_dict: dict of lists (e.g. defaultdict(list)) collecting the results.
        direction: label appended to each key (1 or 2 in this notebook).
    """
    fields = (
        ("Probab=", "prob"),                # probability that query and template are homologous
        ("E-value=", "pval"),               # E-value of the alignment (stored under the "pval" key)
        ("Score=", "score"),                # raw alignment score
        ("Aligned_cols=", "aligned_cols"),  # number of aligned columns
        ("Identities=", "ident_perc"),      # percent identical aligned residue pairs
    )

    found = {}
    for line in query:
        for key, name in fields:
            if name in found:
                continue
            value = _hhalign_field(line, key)
            if value is None:
                continue
            if name == "ident_perc":
                # Reported as e.g. "25%"; keep only the number.
                value = value.split("%")[0]
            found[name] = value

    for key, name in fields:
        res_dict[name+str(direction)].append(found.get(name))

In [9]:
%%time
res_dict = defaultdict(list)

header = "/usr/local/hhsuite/bin/hhalign -i "
ext = ".hhm"
flag = " -t "
hhm_path = "pfam-v"+pfam_version+"/hhm_files/"
flags = " -hide_cons -hide_pred -hide_dssp -o hhalign_out"

for index, pair in sim_pairs_df_filtered_no_clan.iterrows():
    #Get the domains names
    dom1 =  pair["sim_dom1"]
    dom2 =  pair["sim_dom2"]
    
    #Get the domains PFAM ids
    dom1_pfam_id = domain_to_pfam_acc_dict[dom1]
    dom2_pfam_id = domain_to_pfam_acc_dict[dom2]
    
    outfilename1 = "pfam-v"+pfam_version+"/hhm_files/"+dom1_pfam_id+".hhm"
    if (os.path.isfile(outfilename1) == False):
        #Get domain 1 indices for fast access
        dom1_idx_cmd = "less "+hhsuite_hhm_database_path+hhsuite_hhm_database_idx+" | grep "+dom1_pfam_id
        dom1_idx_line = !$dom1_idx_cmd
        dom1_idx_line = dom1_idx_line[0].split("\t")
        dom1_start_pos = dom1_idx_line[1]
        dom1_len = dom1_idx_line[2]
        #Create the .hhm (special HHsuite format for HMM profiles) for domain 1
        create_domain_hhm_file(dom1_pfam_id,dom1_start_pos,dom1_len)
    
    outfilename2 = "pfam-v"+pfam_version+"/hhm_files/"+dom2_pfam_id+".hhm"
    if (os.path.isfile(outfilename2) == False):
        #Get domain 2 indices for fast access
        dom2_idx_cmd = "less "+hhsuite_hhm_database_path+hhsuite_hhm_database_idx+" | grep "+dom2_pfam_id
        dom2_idx_line = !$dom2_idx_cmd
        dom2_idx_line = dom2_idx_line[0].split("\t")
        dom2_start_pos = dom2_idx_line[1]
        dom2_len = dom2_idx_line[2]
        #Create the .hhm (special HHsuite format for HMM profiles) for domain 2
        create_domain_hhm_file(dom2_pfam_id,dom2_start_pos,dom2_len)
    
    #Running hhalign direction #1
    script = header+hhm_path+dom1_pfam_id+ext+flag+hhm_path+dom2_pfam_id+ext+flags
    query = !$script
    process_query(query, res_dict, 1)
    #Running hhalign direction #2
    script = header+hhm_path+dom2_pfam_id+ext+flag+hhm_path+dom1_pfam_id+ext+flags
    query = !$script
    process_query(query, res_dict, 2)
    
    if (index % 100 == 0):
        print index

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
CPU times: user 6.21 s, sys: 19 s, total: 25.2 s
Wall time: 1h 18min 33s


In [11]:
# Attach the parsed hhalign statistics to the pairs table: one column per
# statistic and alignment direction (1 = dom1 as query, 2 = dom2 as query).
for direction in ("1", "2"):
    for stat in ("prob", "pval", "score", "aligned_cols", "ident_perc"):
        column = stat+direction
        sim_pairs_df_filtered_no_clan[column] = res_dict[column]

# Write the complete results table as a tab-separated file
scores_path = "pfam-v"+pfam_version+"/domains_hhlign_scores.csv"
sim_pairs_df_filtered_no_clan.to_csv(scores_path, sep='\t')

In [12]:
# Show the pairs table, now including the hhalign columns, as the cell output.
sim_pairs_df_filtered_no_clan

Unnamed: 0,index,sim_dom1,sim_dom2,prob1,pval1,score1,aligned_cols1,prob2,pval2,score2,ident_perc2,ident_perc1,aligned_cols2
0,22,F5_F8_type_C,Laminin_B,0.07,0.72,5.33,2,0.08,0.66,5.05,0,50,1
1,23,F5_F8_type_C,Calpain_III,0.06,0.74,5.33,5,0.09,0.57,5.56,0,0,1
2,25,F5_F8_type_C,Laminin_N,97.53,1.7e-10,47.53,67,96.43,1.4e-08,48.96,25,25,68
3,34,F5_F8_type_C,Ephrin_lbd,0.15,0.42,8.34,1,0.20,0.32,8.35,11,0,9
4,112,Laminin_N,Laminin_B,0.15,0.44,9.00,9,0.13,0.52,8.92,22,22,9
5,113,Laminin_N,Calpain_III,0.07,0.68,7.43,8,0.05,0.8,7.26,25,0,4
6,118,Laminin_N,Ephrin_lbd,0.14,0.46,10.69,6,0.07,0.73,8.75,40,17,5
7,141,Laminin_B,Calpain_III,0.39,0.15,10.05,1,0.34,0.17,9.73,0,0,1
8,144,Laminin_B,Ephrin_lbd,0.10,0.57,8.24,9,0.09,0.61,7.55,67,22,3
9,148,Calpain_III,Ephrin_lbd,0.05,0.79,7.01,1,0.08,0.65,7.37,0,0,1
