## Running hhalign from the HHsuite
Goal: create hhm files and run hhalign between them for all the relevant domains.

First, clone the HH-suite GitHub repository (HHalign version 3.0.3, 14-07-2017) from https://github.com/soedinglab/hh-suite, following the instructions in its README.

Then, install (make + sudo make install) into: /usr/local/hhsuite

After installation, create a "databases" directory inside the cloned repository, download "pfamA_31.0.tgz" from http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/ and extract it there. This is the database of hhm (not hmm!) files for pfam-v31, which is the preferred format for the HHsuite tools.

In [1]:
import subprocess
import pandas as pd
import pickle
import os.path

In [2]:
curr_dir = !pwd
pfam_version = "31"

sim_pairs_df = pd.read_csv("pfam-v"+pfam_version+"/domains_pairs_for_hhalign_filtered10.csv", sep=',', index_col=0)
with open(curr_dir[0]+"/../../2.parse_Pfam/v"+pfam_version+"/domain_to_pfam_acc_dict.pik", 'rb') as handle:
    domain_to_pfam_acc_dict = pickle.load(handle)    
    
hhsuite_hhm_database_path = "../../../../HHsuite/databases/pfam-v"+pfam_version+"/"
hhsuite_hhm_database_filename = "pfam_hhm.ffdata"
hhsuite_hhm_database_idx = "pfam_hhm.ffindex"

### Remove pairs in which both domains have no clan

In [3]:
# Load the pickled dictionary that is indexed by domain name to get its clan.
clan_path = curr_dir[0]+"/../../2.parse_Pfam/v"+pfam_version+"/"
with open(clan_path+"domain_to_clan_dict.pik", 'rb') as handle:
    domain_to_clan_dict = pickle.load(handle)

# Look up the clan of both domains for every pair (both lookups always run).
pair_clans = [
    (index, domain_to_clan_dict[row["sim_dom1"]], domain_to_clan_dict[row["sim_dom2"]])
    for index, row in sim_pairs_df.iterrows()
]

# A pair is dropped only when neither of its domains belongs to a clan.
drop_idx = [
    index
    for index, clan1, clan2 in pair_clans
    if clan1 == "No_clan" and clan2 == "No_clan"
]

# Drop by index label, then renumber the remaining rows from 0.
sim_pairs_df_filtered_no_clan = sim_pairs_df.drop(drop_idx).reset_index(drop=True)

In [4]:
# Show (rows, columns) of the table that is left after the filter above.
sim_pairs_df_filtered_no_clan.shape

(1886, 3)

In [5]:
def create_domain_hhm_file(domain_pfam_id, start_pos, length, database_file=None, out_dir=None):
    """Extract one domain's profile from the packed hhm database into its own .hhm file.

    Copies `length` bytes, starting at byte offset `start_pos` of the database
    file, into <out_dir>/<domain_pfam_id>.hhm (an existing file is overwritten).
    If the database holds fewer bytes than requested, only the available bytes
    are written.

    Parameters:
        domain_pfam_id: Pfam id of the domain; used as the output file's base name.
        start_pos: byte offset of the record in the database file (int or numeric string).
        length: number of bytes to copy (int or numeric string).
        database_file: path of the packed database file. Defaults to
            hhsuite_hhm_database_path + hhsuite_hhm_database_filename.
        out_dir: directory that receives the .hhm file. Defaults to
            "pfam-v"+pfam_version+"/hhm_files/".

    Raises:
        ValueError: if start_pos or length cannot be converted to an integer.
        IOError / OSError: if the database cannot be read or the output file
            cannot be written (e.g. the output directory does not exist).
    """
    # Fall back to the notebook-level configuration so that the database
    # location follows pfam_version instead of a hard-coded "pfam-v31" path.
    if database_file is None:
        database_file = hhsuite_hhm_database_path + hhsuite_hhm_database_filename
    if out_dir is None:
        out_dir = "pfam-v" + pfam_version + "/hhm_files/"
    outfilename = os.path.join(out_dir, domain_pfam_id + ".hhm")

    # Read the record in one seek + read rather than spawning a byte-by-byte
    # shell copy; failures surface as exceptions instead of being discarded.
    with open(database_file, 'rb') as database:
        database.seek(int(start_pos))
        record = database.read(int(length))

    with open(outfilename, 'wb') as hhm_file:
        hhm_file.write(record)

In [None]:
%%time
scores = []
pvals = []
aligned_cols = []

header = "/usr/local/hhsuite/bin/hhalign -i "
ext = ".hhm"
flag = " -t "
hhm_path = "pfam-v"+pfam_version+"/hhm_files/"
flags = " -hide_cons -hide_pred -hide_dssp -o hhalign_out"

for index, pair in sim_pairs_df_filtered_no_clan.iterrows():
    #Get the domains names
    dom1 =  pair["sim_dom1"]
    dom2 =  pair["sim_dom2"]
    
    #Get the domains PFAM ids
    dom1_pfam_id = domain_to_pfam_acc_dict[dom1]
    dom2_pfam_id = domain_to_pfam_acc_dict[dom2]
    
    outfilename1 = "pfam-v"+pfam_version+"/hhm_files/"+dom1_pfam_id+".hhm"
    if (os.path.isfile(outfilename1) == False):
        #Get domain 1 indices for fast access
        dom1_idx_cmd = "less "+hhsuite_hhm_database_path+hhsuite_hhm_database_idx+" | grep "+dom1_pfam_id
        dom1_idx_line = !$dom1_idx_cmd
        dom1_idx_line = dom1_idx_line[0].split("\t")
        dom1_start_pos = dom1_idx_line[1]
        dom1_len = dom1_idx_line[2]
        #Create the .hhm (special HHsuite format for HMM profiles) for domain 1
        create_domain_hhm_file(dom1_pfam_id,dom1_start_pos,dom1_len)
    
    outfilename2 = "pfam-v"+pfam_version+"/hhm_files/"+dom2_pfam_id+".hhm"
    if (os.path.isfile(outfilename2) == False):
        #Get domain 2 indices for fast access
        dom2_idx_cmd = "less "+hhsuite_hhm_database_path+hhsuite_hhm_database_idx+" | grep "+dom2_pfam_id
        dom2_idx_line = !$dom2_idx_cmd
        dom2_idx_line = dom2_idx_line[0].split("\t")
        dom2_start_pos = dom2_idx_line[1]
        dom2_len = dom2_idx_line[2]
        #Create the .hhm (special HHsuite format for HMM profiles) for domain 2
        create_domain_hhm_file(dom2_pfam_id,dom2_start_pos,dom2_len)
    
    #Running hhalign
    script = header+hhm_path+dom1_pfam_id+ext+flag+hhm_path+dom2_pfam_id+ext+flags
    query = !$script
    
    for line in query:
    #Get the alignment score
        if (line.find("Score=") >= 0):
            end_of_line = line[line.find("Score=")+6:]
            score = end_of_line[:end_of_line.find(" ")]
        #Get the alignment p-value
        if (line.find("E-value=") >= 0):
            end_of_line = line[line.find("E-value=")+8:]
            pval = end_of_line[:end_of_line.find(" ")]
        if (line.find("Aligned_cols=") >= 0):
            end_of_line = line[line.find("Aligned_cols=")+13:]
            aligned = end_of_line[:end_of_line.find(" ")]
    try:
        float(score)
    except: 
        print index
        print "score isn't a number"
    scores.append(score)
    pvals.append(pval)
    aligned_cols.append(aligned)
    
    #Printing and saving intermidiate files
    if (index % 100 == 0):
        print index
    if (index % 1000 == 0):
        #Saving tmp results
        scores_pvals_alone = pd.DataFrame({"scores": scores, "pvals": pvals, "aligned": aligned_cols})
        scores_pvals_alone.to_csv("pfam-v"+pfam_version+"/scores_pvals_alone_"+str(index)+".csv", sep='\t')

    
sim_pairs_df_filtered_no_clan["scores"] = scores
sim_pairs_df_filtered_no_clan["p-values"] = pvals
sim_pairs_df_filtered_no_clan["aligned-cols"] = aligned_cols

#Save the file with all the results
sim_pairs_df_filtered_no_clan.to_csv("pfam-v"+pfam_version+"/domains_hhlign_scores.csv", sep='\t')

In [9]:
# Display the final table, including the hhalign columns added above.
sim_pairs_df_filtered_no_clan

Unnamed: 0,index,sim_dom1,sim_dom2,scores,p-values,aligned-cols
0,22,F5_F8_type_C,Laminin_B,5.33,0.72,2
1,23,F5_F8_type_C,Calpain_III,5.33,0.74,5
2,25,F5_F8_type_C,Laminin_N,47.53,1.7e-10,67
3,34,F5_F8_type_C,Ephrin_lbd,8.34,0.42,1
4,112,Laminin_N,Laminin_B,9.00,0.44,9
5,113,Laminin_N,Calpain_III,7.43,0.68,8
6,118,Laminin_N,Ephrin_lbd,10.69,0.46,6
7,141,Laminin_B,Calpain_III,10.05,0.15,1
8,144,Laminin_B,Ephrin_lbd,8.24,0.57,9
9,148,Calpain_III,Ephrin_lbd,7.01,0.79,1
