## Combining predicted probabilities from several classifiers, for each ligand

In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ

In [2]:
curr_dir = !pwd
models = ["XGB", "RF", "ADA", "KNN", "SVM", "Logistic"]
ligands = ["dna", "rna", "ion", "peptide", "sm"]
datafile_date = "08.06.18"
folds = 5
dirname = "comb_dna0.5_rna0.25_ion0.75"

oof_path = curr_dir[0]+"/1st_level_pred/"+datafile_date+"_"+dirname+"/oof/"
heldout_fold_path = curr_dir[0]+"/../pred_AUC_AUPRC/mediode_NegLigand_NoFilter/NoDown/"+datafile_date+"_"+str(folds)+"f_"+dirname+"/per_fold/"
missing_path = curr_dir[0]+"/missing_predictions/"+datafile_date+"_"+dirname+"/per_fold/"
out_path = curr_dir[0]+"/1st_level_pred/"+datafile_date+"_"+dirname+"/all_combined/"

In [3]:
#%%time
def _read_prob_table(csv_path, prob_column):
    """Load the "idx" and "prob" columns of a tab-separated prediction file.

    The first column of the file is used as the DataFrame index. The "prob"
    column is renamed to ``prob_column`` so that tables coming from different
    models can later be merged side by side on "idx".
    """
    table = pd.read_csv(csv_path, sep='\t', index_col=0)[["idx", "prob"]]
    table.columns = ["idx", prob_column]
    return table


# Layout: pred_prob_dict[ligand][model][heldout_fold] is a dict with the keys
# "missing", "oof-train" and "heldout", each holding a two-column table
# ("idx", "<model>_<ligand>_prob").
pred_prob_dict = defaultdict(dict)

for ligand in ligands:
    # Read the data tables
    for model in models:
        prob_column = model+"_"+ligand+"_prob"
        pred_prob_dict[ligand][model] = dict()

        for heldout_fold in range(1, folds+1):
            fold_tables = dict()
            pred_prob_dict[ligand][model][heldout_fold] = fold_tables
            fold_file = ligand+"_"+model+"_fold"+str(heldout_fold)+"_5w.csv"

            # Reading the ligand-model missing labels positions
            fold_tables["missing"] = _read_prob_table(missing_path+fold_file, prob_column)

            # Reading the ligand-model oof training predictions: one file per
            # test fold, skipping the held-out fold itself.
            oof_tables = [
                _read_prob_table(
                    oof_path+str(heldout_fold)+"/"+ligand+"_"+model+"_test_fold"+str(test_fold)+"_5w.csv",
                    prob_column)
                for test_fold in range(1, folds+1)
                if test_fold != heldout_fold
            ]
            # Concatenate once instead of growing the table inside the loop.
            # pd.concat rejects an empty list, so fall back to an empty table
            # when there is no other fold (folds == 1).
            fold_tables["oof-train"] = pd.concat(oof_tables) if oof_tables else pd.DataFrame()

            # Reading the ligand-model heldout fold predictions
            fold_tables["heldout"] = _read_prob_table(heldout_fold_path+fold_file, prob_column)

        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Finished reading "+ligand+" "+model)

    print("Finished reading data "+ligand)

    # Combine them into ligand-folds tables
    for heldout_fold in range(1, folds+1):
        # None marks "no model added yet". Testing for zero rows instead would
        # let an empty first table or an empty merge result be silently
        # replaced by the next model's table, losing earlier columns.
        ligand_fold_combined_table = None

        for model in models:
            fold_tables = pred_prob_dict[ligand][model][heldout_fold]
            model_combined_table = pd.concat([fold_tables["heldout"], fold_tables["oof-train"], fold_tables["missing"]])

            if ligand_fold_combined_table is None:
                ligand_fold_combined_table = model_combined_table
            else:
                # pd.merge defaults to an inner join: only "idx" values present
                # in every model's table are kept.
                ligand_fold_combined_table = pd.merge(ligand_fold_combined_table, model_combined_table, on="idx")

        # Saving to .csv, ordered by "idx" with a fresh 0..n-1 index.
        # NOTE(review): the per-fold output directory is assumed to exist already.
        ligand_fold_combined_table = ligand_fold_combined_table.sort_values(by="idx").reset_index(drop=True)
        ligand_fold_combined_table.to_csv(out_path+str(heldout_fold)+"/"+ligand+"_stacking1_probs.csv", sep='\t')

    print("Finished writing "+ligand)

Finished reading dna XGB
Finished reading dna RF
Finished reading dna ADA
Finished reading dna KNN
Finished reading dna SVM
Finished reading dna Logistic
Finished reading data dna
Finished writing dna
Finished reading rna XGB
Finished reading rna RF
Finished reading rna ADA
Finished reading rna KNN
Finished reading rna SVM
Finished reading rna Logistic
Finished reading data rna
Finished writing rna
Finished reading ion XGB
Finished reading ion RF
Finished reading ion ADA
Finished reading ion KNN
Finished reading ion SVM
Finished reading ion Logistic
Finished reading data ion
Finished writing ion
Finished reading peptide XGB
Finished reading peptide RF
Finished reading peptide ADA
Finished reading peptide KNN
Finished reading peptide SVM
Finished reading peptide Logistic
Finished reading data peptide
Finished writing peptide
Finished reading sm XGB
Finished reading sm RF
Finished reading sm ADA
Finished reading sm KNN
Finished reading sm SVM
Finished reading sm Logistic
Finished reading