In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from copy import deepcopy

from IPython.core.display import HTML
# Inject CSS that stretches the notebook's ".container" element to 100% width.
HTML("<style>.container { width:100% !important; }</style>")

### Reading the data

In [2]:
curr_dir = !pwd
tuning_type = "global_auprc"
input_path = curr_dir[0]+"/../../10.Prediction/phase2_pred_res/08.06.18_5f_comb_dna0.5_rna0.5_ion0.75/"+tuning_type+"/"
ligands = ["dna", "rna", "ion", "peptide", "sm"]
classifiers = ["XGB", "RF", "SVM","Logistic","NN"]
folds_num = 5

pred_tables_all = defaultdict(dict)
domain_auc_all = defaultdict(dict)
domain_auprc_all = defaultdict(dict)

pred_tables = dict()
for ligand in ligands:
    pred_tables[ligand] = pd.read_csv(input_path+ligand+"_"+str(folds_num)+"w.csv", sep='\t', index_col=0)
    pred_tables[ligand]["domain"] = [x[:x.rfind("_")] for x in pred_tables[ligand]["idx"]]
pred_tables_all["tuned_not_stacked"] = deepcopy(pred_tables)

#Save domain AUC tables
domain_auc_tables = dict()
for ligand in ligands:
    domain_auc_tables[ligand] = pd.read_csv(input_path+ligand+"_"+str(folds_num)+"w_d_auc.csv", sep='\t', index_col=0)
domain_auc_all["tuned_not_stacked"] = deepcopy(domain_auc_tables)

#Save domain AUPRC tables
domain_auprc_tables = dict()
for ligand in ligands:
    domain_auprc_tables[ligand] = pd.read_csv(input_path+ligand+"_"+str(folds_num)+"w_d_auprc.csv", sep='\t', index_col=0)
domain_auprc_all["tuned_not_stacked"] = deepcopy(domain_auprc_tables)

In [3]:
#Reading data of stacked models
tuning_type = "global_auprc"
# stacking_types are the on-disk directory names; stacking_names are the keys
# under which each result set is stored. The two lists are paired by position.
stacking_types = ["ligand_features_probs", "model_features", "all_features_probs", "just_probs"]
stacking_names = ["ens-LIGAND", "ens-MODEL", "ens-ALL", "ens-PROB"]

for features_type, stacking_name in zip(stacking_types, stacking_names):
    stacked_path = curr_dir[0]+"/../../10.Prediction/stacking/2nd_level_pred/08.06.18_dna0.5_rna0.5_ion0.75/"+tuning_type+"/"+features_type+"/"

    # Read, per ligand, the prediction table and the per-domain AUC / AUPRC tables.
    pred_tables = dict()
    domain_auc_tables = dict()
    domain_auprc_tables = dict()
    for ligand in ligands:
        file_prefix = stacked_path+ligand+"_"+str(folds_num)+"w"

        pred_tables[ligand] = pd.read_csv(file_prefix+".csv", sep='\t', index_col=0)
        # The domain is everything before the last "_" of idx (e.g. "S4_21" -> "S4").
        # rsplit leaves an idx without any "_" untouched, whereas slicing with
        # rfind() == -1 would silently drop its last character.
        pred_tables[ligand]["domain"] = [x.rsplit("_", 1)[0] for x in pred_tables[ligand]["idx"]]

        domain_auc_tables[ligand] = pd.read_csv(file_prefix+"_d_auc.csv", sep='\t', index_col=0)
        domain_auprc_tables[ligand] = pd.read_csv(file_prefix+"_d_auprc.csv", sep='\t', index_col=0)

    # Store independent copies under this stacking variant's display name.
    pred_tables_all[stacking_name] = deepcopy(pred_tables)
    domain_auc_all[stacking_name] = deepcopy(domain_auc_tables)
    domain_auprc_all[stacking_name] = deepcopy(domain_auprc_tables)

### Creating the data tables

In [12]:
# Build, for one model of one result set, precision/recall tables at a series of
# probability thresholds: one table pooled over all positions and one per domain.
# Both tables are kept in percision_prob_tables_dict and written as tab-separated files.
percision_prob_tables_dict = defaultdict(dict)
table_type = "ens-ALL"

# NOTE: this rebinds the notebook-level `ligands` list set by the loading cell.
ligands = ["sm"]

model = "XGB"
# Probability thresholds to evaluate for each ligand, in descending order.
prob_intervals = {"ion": [0.99, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0],
                  "dna": [0.98,0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0],
                  "rna": [0.1, 0.075, 0.05, 0.025, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0],
                  "rnabase": [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0],
                  "rnabackbone": [0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0],
                  "peptide": [0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0],
                  "sm": [0.95, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0]
                 }

# Column names of the per-domain table, in the order the values are collected.
# ("domain_precison" is kept as spelled: it is the column name written to disk.)
domain_columns = ("domain_precison", "domain_recall", "domain", "prob",
                  "pos_num", "domain_len", "pos_frac", "fold_imp")

for ligand in ligands:

    prob_plot_dict = defaultdict(list)
    domain_prob_dict = defaultdict(list)

    ligand_table = pred_tables_all[table_type][ligand]
    curr_table = ligand_table[ligand_table["model"] == model]
    curr_auprc_table = domain_auprc_all[table_type][ligand]
    all_pos = np.count_nonzero(curr_table["obs"] == 1)

    #Get the list of domains for this ligand
    domains_list = ligand_table["domain"].unique().tolist()

    # Split the model's rows by domain once, instead of re-filtering the whole
    # table for every (threshold, domain) pair. Domains missing from the AUPRC
    # table are left out (the original comment says these have no positives).
    auprc_domains = set(curr_auprc_table["domain"])
    domain_stats = {}
    for domain_name, domain_table in curr_table.groupby("domain", sort=False):
        if domain_name not in auprc_domains:
            continue
        domain_stats[domain_name] = {
            "table": domain_table,
            "total_pos": np.count_nonzero(domain_table["obs"] == 1),
            "total_neg": np.count_nonzero(domain_table["obs"] == 0),
        }

    for prob in prob_intervals[ligand]:

        #precision at prob
        results_at_prob = curr_table[curr_table["prob"] >= prob]
        total_at_prob = results_at_prob.shape[0]
        # Nothing reaches this threshold: precision is undefined, so skip it.
        if total_at_prob == 0:
            continue
        pos_at_prob = np.count_nonzero(results_at_prob["obs"] == 1)
        precision = pos_at_prob/float(total_at_prob)

        # Recall is undefined when the table holds no positives at all.
        if all_pos:
            recall = pos_at_prob/float(all_pos)
        else:
            recall = float("nan")

        prob_plot_dict["prob"].append(prob)
        prob_plot_dict["precision"].append(precision)
        prob_plot_dict["recall"].append(recall)
        prob_plot_dict["positives"].append(pos_at_prob)
        prob_plot_dict["total"].append(total_at_prob)
        prob_plot_dict["frac_text"].append(str(pos_at_prob)+"/"+str(total_at_prob))

        # Per-domain precision/recall at this threshold; one tuple per domain,
        # with values ordered as in domain_columns.
        domain_rows = []
        for domain_name in domains_list:
            stats = domain_stats.get(domain_name)
            if stats is None:
                continue
            domain_table = stats["table"]

            #If there are not positions that passed this prob. skipping
            domain_at_prob = domain_table[domain_table["prob"] >= prob]
            if (domain_at_prob.shape[0] == 0):
                continue

            #Calc precision
            pos_num = np.count_nonzero(domain_at_prob["obs"] == 1)
            neg_num = np.count_nonzero(domain_at_prob["obs"] == 0)
            domain_precision = pos_num/float(pos_num+neg_num)

            #Calc recall
            total_pos = stats["total_pos"]
            total_neg = stats["total_neg"]
            domain_recall = pos_num/float(total_pos)

            #total domain pos fraction
            pos_frac = total_pos/float(total_neg+total_pos)

            #fold improvment
            # NOTE(review): the baseline used here is the positive-to-negative
            # ratio (total_pos/total_neg), not the positive fraction stored in
            # pos_frac - confirm this is the intended definition.
            fold_imp = domain_precision/float(total_pos/float(total_neg))

            domain_rows.append((
                domain_precision,
                domain_recall,
                domain_name,
                prob,
                pos_num + neg_num,      # number of labelled positions >= prob
                domain_table.shape[0],  # domain length
                pos_frac,
                fold_imp,
            ))

        for column_idx, column in enumerate(domain_columns):
            domain_prob_dict[column].extend(row[column_idx] for row in domain_rows)

        print("Finished "+str(prob))

    prob_plot_df = pd.DataFrame.from_dict(prob_plot_dict)
    domain_prob_df = pd.DataFrame.from_dict(domain_prob_dict)
    percision_prob_tables_dict[ligand]["prob"] = prob_plot_df
    percision_prob_tables_dict[ligand]["domain"] = domain_prob_df

    prob_plot_df.to_csv(curr_dir[0]+"/domains_prec_tables/"+ligand+"_"+model+"_"+table_type+".csv", sep="\t")
    domain_prob_df.to_csv(curr_dir[0]+"/domains_prec_tables/"+ligand+"_domains_"+model+"_"+table_type+"_prob.csv", sep="\t")
    print("Finished "+ligand)

Finished 0.95
Finished 0.8
Finished 0.7
Finished 0.6
Finished 0.5
Finished 0.4
Finished 0.3
Finished 0.2
Finished 0.1
Finished 0.05
Finished 0.01
Finished 0.005
Finished 0.001
Finished 0
Finished sm


In [11]:
# Display the pooled per-threshold precision/recall table built by the table-creation cell.
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# that builds prob_plot_df (12), so the saved output may come from an earlier run.
prob_plot_df

Unnamed: 0,frac_text,positives,precision,prob,recall,total
0,1/1,1,1.0,0.95,0.001212,1
1,4/5,4,0.8,0.9,0.004848,5
2,16/19,16,0.842105,0.8,0.019394,19
3,30/38,30,0.789474,0.7,0.036364,38
4,42/61,42,0.688525,0.6,0.050909,61
5,60/94,60,0.638298,0.5,0.072727,94
6,80/140,80,0.571429,0.4,0.09697,140
7,120/236,120,0.508475,0.3,0.145455,236
8,168/393,168,0.427481,0.2,0.203636,393
9,287/1616,287,0.177599,0.1,0.347879,1616


In [164]:
# Inspect the XGB predictions for RNA, highest predicted probability first.
# NOTE(review): no visible cell stores a "rnas_0.5th_comb" entry in
# pred_tables_all - presumably it was loaded by a cell not shown here; confirm.
table_type = "rnas_0.5th_comb"
ligand = "rna"
rna_pred_table = pred_tables_all[table_type][ligand]
rna_pred_table[rna_pred_table["model"] == "XGB"].sort_values(by="prob", ascending=False)

Unnamed: 0,fold,idx,model,obs,prob,domain
26032,4,S4_21,XGB,1,7.827294e-01,S4
29831,4,S4_14,XGB,1,6.980863e-01,S4
26529,4,S4_9,XGB,1,6.529258e-01,S4
26034,4,S4_27,XGB,1,6.245754e-01,S4
30103,4,S4_32,XGB,1,5.794351e-01,S4
30104,4,S4_33,XGB,1,4.820638e-01,S4
26035,4,S4_26,XGB,1,4.432023e-01,S4
26036,4,S4_25,XGB,1,3.850451e-01,S4
30908,4,Spin-Ssty_39,XGB,0,3.837524e-01,Spin-Ssty
29836,4,S4_11,XGB,0,3.145718e-01,S4
