In [61]:
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from scipy.stats import kendalltau, rankdata

In [26]:
# Working directory of the notebook, kept as a one-element list because the
# paths below (and the prediction-loading cell) index it as curr_dir[0].
# os.getcwd() replaces the `!pwd` shell escape so this cell is plain Python
# and does not depend on a POSIX shell being available.
curr_dir = [os.getcwd()]

# Ligand names and classifier names; together they select which prediction
# tables are read and which binding-score columns are compared.
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite", "all_ligands"]
models = ["RF", "KNN", "SVM", "ADA", "Logistic"]

# Features table: tab-separated file whose first column is used as the index.
input_path = curr_dir[0]+"/../10.Prediction/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)

# Expose the index as a regular column so it can serve as a merge key.
features_all["stridx"] = features_all.index

In [88]:
# Kendall tau between predicted probability and binding score, stored as
# tau_corr_dict[ligand][classifier] = {"tau": <float>, "pval": <float>}.
tau_corr_dict = defaultdict(dict)

# Directory holding one "<ligand>_<classifier>_features_pred.csv" per pair
# (loop-invariant, so it is built once).
pred_tables_dir = curr_dir[0]+"/../10.Prediction/pred_AUC_AUPRC/mediode_NegAbs_NoFilter/NoDown/01.25.2018/features_pred_tables/"

for ligand in ligands:

    #Get the binding scores (independent of the classifier, so selected once per ligand)
    # "all_ligands" is compared against the "max_binding_score" column rather
    # than a "<ligand>_binding_score" column.
    if ligand == "all_ligands":
        ligand_bind_str = "max_binding_score"
    else:
        ligand_bind_str = ligand+"_binding_score"
    bind_score_df = features_all[["stridx", ligand_bind_str]]

    for classifier in models:

        #Get the prediction results
        ligand_model_pred_df = pd.read_csv(pred_tables_dir+ligand+"_"+classifier+"_features_pred.csv", sep='\t', index_col=0)
        ligand_model_pred_df['stridx'] = ligand_model_pred_df.index
        pred_df = ligand_model_pred_df[["stridx", "prob"]]

        #match the binding scores
        # Join on the shared "stridx" column only. Passing left_index=True
        # together with left_on specifies the left key twice and is rejected
        # by current pandas.
        pred_bind_merge_df = pred_df.merge(bind_score_df, on="stridx")
        # Index the merged rows by their position id so that sorting by index
        # orders rows by position.
        pred_bind_merge_df.index = pred_bind_merge_df["stridx"].values

        #compute ranking for all positions according to binding score and according to predictions
        # Sort once and reuse, so both rankings are taken over the same row order.
        sorted_merge_df = pred_bind_merge_df.sort_index()
        sorted_positions_list = sorted_merge_df.index.tolist()
        pred_ranking = rankdata(sorted_merge_df["prob"].tolist())
        bind_score_ranking = rankdata(sorted_merge_df[ligand_bind_str].tolist())

        tau, p_value = kendalltau(pred_ranking, bind_score_ranking)
        tau_corr_dict[ligand][classifier] = {"tau": tau, "pval": p_value}

    # Single-argument call form prints the same text on Python 2 and Python 3.
    print("Finished ligand "+ligand)
        

Finished ligand dna
Finished ligand dnabase
Finished ligand dnabackbone
Finished ligand rna
Finished ligand rnabase
Finished ligand rnabackbone
Finished ligand peptide
Finished ligand ion
Finished ligand metabolite
Finished ligand all_ligands


In [124]:
# Show the stored Kendall tau and p-value of every classifier for the "all_ligands" entry.
tau_corr_dict["all_ligands"]

{'ADA': {'pval': 0.0, 'tau': 0.22427336766144995},
 'KNN': {'pval': 6.6417707935478493e-297, 'tau': 0.16740680886371895},
 'Logistic': {'pval': 1.0697554418908385e-21, 'tau': 0.042332874024985791},
 'RF': {'pval': 0.0, 'tau': 0.22727592028307894},
 'SVM': {'pval': 0.0, 'tau': 0.17618208629102949}}