In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats, integrate
import seaborn as sns
import pickle

In [2]:
curr_dir = !pwd
pfam_version = "31"
domains_th = "10"

hhalign_scores = pd.read_csv(curr_dir[0]+"/processed_domains_hhlign_scores.csv", sep='\t', index_col=0)

### Adding the raw score divided by the number of aligned columns
From the HH-suite user guide (page 23): https://github.com/soedinglab/hh-suite/blob/master/hhsuite-userguide.pdf

How can I build a phylogenetic tree for HMMs? I would use a similarity measure like
the raw score per alignment length. You might also add the secondary structure score to the
raw score with some weight. Whereas probabilities, E-values, and P-values are useful for deciding
whether a match is a reliable homolog or not, they are not suitable for measuring similarities
because they strongly depend on the length of the alignment

In [3]:
# Per-column similarity score for each alignment direction: the raw hhalign
# score divided by the number of aligned columns.
#
# Computed as whole-column arithmetic instead of a Python-level iterrows loop,
# so no intermediate lists are needed and the result stays aligned with the
# frame's index.
# NOTE: a row with zero aligned columns yields inf/NaN here rather than
# raising ZeroDivisionError.
hhalign_scores["score_norm1"] = (
    hhalign_scores["score1"] / hhalign_scores["aligned_cols1"].astype(float)
)
hhalign_scores["score_norm2"] = (
    hhalign_scores["score2"] / hhalign_scores["aligned_cols2"].astype(float)
)

### Domain similarity filters

A p-value of 0.001 is the default threshold for Viterbi:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3197634/

In [4]:
# Keep only pairs whose alignment is meaningful in BOTH directions
# (p-value <= 0.001, percent identity >= 20, per-column score >= 0.6).
#
# At every stage the two direction masks are built on the frame being filtered
# and combined with `&`. Chaining `df[mask_a][mask_b]` would apply a mask
# computed on the unfiltered frame to the already-filtered one, which makes
# pandas reindex the mask and emit a warning.

# Stage 1: p-value threshold in both directions.
pval_mask = (hhalign_scores["pval1"] <= 0.001) & (hhalign_scores["pval2"] <= 0.001)
significantly_pvals = hhalign_scores[pval_mask]

# Stage 2: percent identity. According to the HHsuite userguide, this is the %
# of aligned columns that match.
ident_mask = (
    (significantly_pvals["ident_perc1"] >= 20)
    & (significantly_pvals["ident_perc2"] >= 20)
)
significantly_ident_perc = significantly_pvals[ident_mask]

# Stage 3: normalised score. By the HHsuite userguide (page 30): a unit of
# column score corresponds approximately to 0.6 bits.
score_mask = (
    (significantly_ident_perc["score_norm1"] >= 0.6)
    & (significantly_ident_perc["score_norm2"] >= 0.6)
)
significantly_score = significantly_ident_perc[score_mask]

# Final table of significantly similar pairs (same object as the stage-3 result).
significantly_similar_pairs = significantly_score



In [5]:
# Dimensions (rows, columns) of the table of pairs that passed every filter.
significantly_similar_pairs.shape

(1745, 14)

In [22]:
# Renumber the surviving rows 0..n-1 (the old index is discarded), then write
# the table as a tab-separated file. The path is relative, so the file is
# created in the kernel's current working directory.
significantly_similar_pairs = significantly_similar_pairs.reset_index(drop=True)
sig_pairs_csv = "processed_domains_sig_pairs_pvals0.001_ident20_score0.6_2directions.csv"
significantly_similar_pairs.to_csv(sig_pairs_csv, sep='\t')

### Filter to processed domains not in the significantly similar pairs

In [8]:
# Load the test domains that weren't used in the pipeline and sort them in place.
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# pickle files that come from a trusted source.
not_in_pipeline_path = curr_dir[0] + "/processed_domains_with_labels_not_in_pipeline.pik"
with open(not_in_pipeline_path, 'rb') as pik_file:
    processed_domains_list = pickle.load(pik_file)
processed_domains_list.sort()

In [9]:
# Domains with no significant similarity at all: processed domains that appear
# on neither side (sim_dom1 / sim_dom2) of any significantly similar pair.
#
# The names found in the pairs table are collected into a set once, instead of
# converting both columns to lists and scanning them for every domain.
similar_domain_names = (
    set(significantly_similar_pairs["sim_dom1"])
    | set(significantly_similar_pairs["sim_dom2"])
)

# The order of processed_domains_list is preserved.
non_similar_domains = [
    domain_name
    for domain_name in processed_domains_list
    if domain_name not in similar_domain_names
]

In [27]:
# Persist the list of non-similar domains as a pickle next to the input files,
# using the highest pickle protocol available.
non_similar_path = curr_dir[0] + "/processed_domains_non_similar_at_all.pik"
with open(non_similar_path, 'wb') as pik_file:
    pickle.dump(non_similar_domains, pik_file, protocol=pickle.HIGHEST_PROTOCOL)

### Of the other domains - find the ones that form clusters just amongst the processed domains

In [30]:
# Processed domains that take part in at least one significantly similar pair,
# i.e. everything in processed_domains_list that is not in non_similar_domains.
# sorted() gives a deterministic order: iterating a set of strings directly can
# produce a different order on every kernel restart (hash randomisation).
might_be_similar_domains = sorted(set(processed_domains_list) - set(non_similar_domains))

In [45]:
# Every sim_dom1 entry of the significant pairs that is NOT one of the
# processed domains (one entry per matching row, so names can repeat).
# Membership is tested against a set built once rather than scanning the list
# for each row.
# NOTE(review): only the sim_dom1 column is scanned; a non-processed domain that
# occurs solely in sim_dom2 would not be collected here - confirm that the
# pairs table lists every pair in both orders.
processed_domain_set = set(processed_domains_list)
pipeline_verify = [
    domain
    for domain in significantly_similar_pairs["sim_dom1"]
    if domain not in processed_domain_set
]

In [49]:
# Keep a candidate domain only if none of its significant partners (looked up
# on either side of the pair) is listed in pipeline_verify.
# pipeline_verify is turned into a set once, so each partner check is a hash
# lookup instead of a scan over the whole list.
pipeline_domain_set = set(pipeline_verify)

domains_for_clusters_analysis = []
for domain in might_be_similar_domains:
    # Partners of `domain` when it is the first member of the pair...
    partners1 = significantly_similar_pairs.loc[
        significantly_similar_pairs["sim_dom1"] == domain, "sim_dom2"
    ]
    # ...and when it is the second member.
    partners2 = significantly_similar_pairs.loc[
        significantly_similar_pairs["sim_dom2"] == domain, "sim_dom1"
    ]

    # isdisjoint is True when no partner name is in pipeline_domain_set.
    if pipeline_domain_set.isdisjoint(partners1) and pipeline_domain_set.isdisjoint(partners2):
        domains_for_clusters_analysis.append(domain)

In [56]:
# Number of domains kept for the cluster analysis.
len(domains_for_clusters_analysis)

157

In [51]:
# Domains similar to other processed domains: keep the pairs in which BOTH
# members belong to domains_for_clusters_analysis.
#
# Rows are selected with a boolean mask aligned on the frame's own index.
# Collecting index labels and passing them to the positional `.iloc` would only
# be correct while the index happens to be exactly 0..n-1.
both_in_clusters = (
    significantly_similar_pairs["sim_dom1"].isin(domains_for_clusters_analysis)
    & significantly_similar_pairs["sim_dom2"].isin(domains_for_clusters_analysis)
)

# Index labels of the selected rows.
processed_sim_clusters_idx = significantly_similar_pairs.index[both_in_clusters.values].tolist()

# Save pairs of processed domains similarity clusters, renumbered from 0.
signsignificantly_similar_pairs_clusters = (
    significantly_similar_pairs.loc[both_in_clusters].reset_index(drop=True)
)

In [53]:
# Write the cluster pairs as a tab-separated file. The path is relative, so the
# file is created in the kernel's current working directory.
clusters_csv = "processed_domains_sig_pairs_clusters.csv"
signsignificantly_similar_pairs_clusters.to_csv(clusters_csv, sep='\t')

In [54]:
# Dimensions (rows, columns) of the table of pairs whose two members are both
# in domains_for_clusters_analysis.
signsignificantly_similar_pairs_clusters.shape

(89, 14)

In [55]:
# Number of processed domains that appear in no significantly similar pair.
len(non_similar_domains)

775