## Combining predicted probabilities from several classifiers, for each ligand
The combined tables are saved in: "pred_AUC_AUPRC/negatives_filtering/downsampling/date/probs_tables/"

In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ

In [4]:
# IPython shell capture (not plain Python): runs `pwd` and stores its output.
# The cells below use curr_dir[0] as the prefix of every input/output path.
curr_dir = !pwd
# Classifier names: they appear in the input file names and become the prefix
# of each "<model>_prob" column in the combined tables.
models = ["XGB", "RF", "ADA", "KNN", "SVM", "Logistic"]
# Ligand names: one "dataset" and one "between" combined table is written per ligand.
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]

In [62]:
# pred_prob_dict[ligand][model]["dataset" | "between"] -> DataFrame with the
# columns ["fold", "<model>_prob"], indexed by the first column of the csv.
pred_prob_dict = defaultdict(dict)

# Path prefixes shared by every table that is read or written below.
pred_tables_dir = curr_dir[0]+"/../10.Prediction/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/NoDown/01.25.2018/"
probs_tables_dir = curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/NoDown/01.25.2018/probs_tables/"


def _read_prob_table(table_path, model):
    """Read one tab-separated prediction table, keeping "fold" and "prob".

    The first csv column becomes the index and "prob" is renamed to
    "<model>_prob" so that the columns of different models can sit side by
    side in one combined table.
    """
    prob_df = pd.read_csv(table_path, sep='\t', index_col=0)[["fold", "prob"]]
    prob_df.columns = ["fold", model+"_prob"]
    return prob_df


for ligand in ligands:
    #Read the data tables
    for model in models:
        table_name = ligand+"_"+model+"_features_pred.csv"
        pred_prob_dict[ligand][model] = dict()
        pred_prob_dict[ligand][model]["dataset"] = _read_prob_table(pred_tables_dir+"features_pred_tables/"+table_name, model)
        pred_prob_dict[ligand][model]["between"] = _read_prob_table(pred_tables_dir+"between_pred_tables/"+table_name, model)

    #Combine them into ligand probs table
    probs_merged_df = pd.DataFrame()
    between_merged_df = pd.DataFrame()

    for model in models:
        prob_col = model+"_prob"
        dataset_df = pred_prob_dict[ligand][model]["dataset"]
        between_df = pred_prob_dict[ligand][model]["between"]

        #The regular dataset
        if probs_merged_df.empty:
            # Nothing accumulated yet: start from this model's table, which
            # also supplies the single "fold" column of the combined table.
            probs_merged_df = dataset_df
        else:
            # Inner join on the row index, adding only the new probability
            # column. The previous call passed on="fold" together with
            # left_index/right_index to merge(), a combination that current
            # pandas rejects with a MergeError; the index join is what that
            # call effectively performed on older pandas.
            probs_merged_df = probs_merged_df.join(dataset_df[[prob_col]], how="inner")

        #The positions between labels
        if between_merged_df.empty:
            between_merged_df = between_df
        else:
            # Column-wise concat aligns on the row index (outer alignment,
            # unlike the inner join used for the regular dataset above).
            between_merged_df = pd.concat([between_merged_df, between_df[[prob_col]]], axis=1)

    #Saving to .csv
    probs_merged_df.to_csv(probs_tables_dir+ligand+"_dataset_probs.csv", sep='\t')
    between_merged_df.to_csv(probs_tables_dir+ligand+"_between_probs.csv", sep='\t')
    # Single-argument call form prints identically on Python 2 and Python 3.
    print("Finished "+ligand)

Finished dna
Finished dnabase
Finished dnabackbone
Finished rna
Finished rnabase
Finished rnabackbone
Finished peptide
Finished ion
Finished metabolite
