In [1]:
#%matplotlib inline
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Add <cwd>/utils to the module search path, then import the helper functions
curr_dir = getcwd()
sys.path.append(curr_dir+"/utils")
from prop_threshold_funcs import (create_positives_datasets_combined,
                                  create_negatives_datasets_combined)
from prediction_general_funcs import (ligands, score_cols_suffix, get_features_cols,
                                      remove_unimportant_features, test_model_iterative_fixed)

#Choose which generate_models_dict implementation to import, by tuning type
tuning_type = "global_auprc"
#tuning_type = "domain_auc"
if tuning_type != "global_auprc":
    #Any value other than "global_auprc" selects the domain-AUC module
    from generate_models_dict_domain_auc import generate_models_dict
else:
    from generate_models_dict_global_auprc import generate_models_dict

#Inject CSS that stretches the notebook container to the full page width
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
#Identifiers used to build the input file name and the paths below
pfam_version = "31"
datafile_date = "08.06.18"
input_path = curr_dir+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"

out_dirname = "comb_dna0.5_rna0.5_ion0.75"

#Flags for creating negatives (all_ligands is also passed when creating positives)
zero_prop = True
no_prop = True
all_ligands = False
#Number of folds; used in the CV splits file name and in the output file names
folds_num = 5
#NOTE: this assignment rebinds the `ligands` name imported from prediction_general_funcs
ligands = ["dna", "rna", "ion", "peptide", "sm"]

#Features table: tab-separated file, first column used as the index
features_table_path = input_path+filename
features_all = pd.read_csv(features_table_path, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)

print("all samples positions #: "+str(features_all.shape[0]))

#CV splits dictionary
#NOTE: pickle.load can execute arbitrary code; only load trusted files here
splits_path = curr_dir+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_combined_dna0.5_rna0.5_ion0.75_prec_dict.pik"
with open(splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 44872


#### Remove unimportant features

In [3]:
#Report the number of feature columns before and after the removal call.
#NOTE(review): the return value of remove_unimportant_features is discarded, so
#the second count only differs if features_cols is modified in place - TODO confirm
features_num_before = len(features_cols)
print("# of features before removal: "+str(features_num_before))

remove_unimportant_features(features_all, features_cols)

features_num_after = len(features_cols)
print("# of features after removal: "+str(features_num_after))

# of features before removal: 761
# of features after removal: 761


#### Dataset of negative examples

In [4]:
#Negative examples, built from the features table according to the
#zero_prop / no_prop / all_ligands flags; indexed by ligand name further down
ligands_negatives_df = create_negatives_datasets_combined(
    zero_prop,
    no_prop,
    features_all,
    features_cols,
    all_ligands
)

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697


#### Datasets of positive examples by ligand

In [5]:
#Positive examples, built from the features table; indexed by ligand name further down
ligands_positives_df = create_positives_datasets_combined(
    features_all,
    features_cols,
    all_ligands
)

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 206
rnabase #: 118
rnabackbone #: 136
rna combined #: 247
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




### Reading env input for ligand, fold and classifier

In [6]:
#Each setting is read from an environment variable and falls back to a default
#when the variable is not set. environ.get is used instead of a bare
#try/except so that only a missing variable triggers the fallback and no
#unrelated exception is silently swallowed.

#Reading the ligand input
ligand = environ.get('ligand', "dna")
print("ligand = "+ligand)

#Reading the fold input (kept as a string; converted with int() where a number is needed)
fold = environ.get('fold', "1")
print("fold = "+fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

ligand = dna
fold = 1
classifier_method = XGB


In [7]:
#Build the models dictionary; it is later indexed as
#models_dict[classifier_method][ligand][fold number]
no_features = len(features_cols)
models_dict = generate_models_dict(
    ligand, classifier_method, ligands,
    ligands_positives_df, ligands_negatives_df,
    folds_num, no_features
)

### Predict for each ligand separately

In [8]:
#%%time

#Set to True to write the result tables below to tab-separated files
save_to_file = False

#Single-iteration loop over the fold selected above (fold is a string)
for fold in [fold]:
    #Initialize the result dictionaries handed to test_model_iterative_fixed
    #(presumably filled in place by that call - the tables below are built from them)
    pred_dict = defaultdict(list)
    domain_pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    domain_auc_mean_dict = defaultdict(list)
    domain_auprc_mean_dict = defaultdict(list)
    domain_auprc_ratio_mean_dict = defaultdict(list)
    domain_auc_dict = defaultdict(list)
    domain_auprc_dict = defaultdict(list)
    domain_auprc_ratio_dict = defaultdict(list)
    prec_prob_dict = defaultdict(list)

    #The models dictionary is keyed by the integer fold number
    model = models_dict[classifier_method][ligand][int(fold)]
    #The call returns a pair; its second element replaces the model looked up above
    (ordered_features, model) = test_model_iterative_fixed(pred_dict, domain_pred_dict, auc_dict, auprc_dict, domain_auc_mean_dict, domain_auprc_mean_dict, domain_auprc_ratio_mean_dict,
                                                            domain_auc_dict, domain_auprc_dict, domain_auprc_ratio_dict, prec_prob_dict, ligands_positives_df[ligand],
                                                            ligands_negatives_df[ligand], ligand, model, classifier_method, fold)

    pred_df = pd.DataFrame.from_dict(pred_dict)
    domain_pred_df = pd.DataFrame.from_dict(domain_pred_dict)
    #global metrics dfs
    auc_df = pd.DataFrame.from_dict(auc_dict)
    auprc_df = pd.DataFrame.from_dict(auprc_dict)
    #per domain mean dfs
    domain_auc_mean_df = pd.DataFrame.from_dict(domain_auc_mean_dict)
    domain_auprc_mean_df = pd.DataFrame.from_dict(domain_auprc_mean_dict)
    domain_auprc_ratio_mean_df = pd.DataFrame.from_dict(domain_auprc_ratio_mean_dict)
    #per domain dfs
    domain_auc_df = pd.DataFrame.from_dict(domain_auc_dict)
    domain_auprc_df = pd.DataFrame.from_dict(domain_auprc_dict)
    domain_auprc_ratio_df = pd.DataFrame.from_dict(domain_auprc_ratio_dict)

    #Save to file
    if save_to_file:
        #Path prefix shared by every output file of this ligand/classifier/fold
        out_prefix = (curr_dir+"/phase2_pred_res/"+datafile_date+"_"+str(folds_num)+"f_"+out_dirname
                      +"/"+tuning_type+"/per_fold/"+ligand+"_"+classifier_method+"_fold"+fold+"_"+str(folds_num)+"w")
        #(table, file name suffix) pairs; each table is written exactly once.
        #The previous version repeated the whole group of to_csv calls, writing every file twice.
        out_tables = [
            (pred_df, ".csv"),
            (domain_pred_df, "_d.csv"),
            (auc_df, "_auc.csv"),
            (auprc_df, "_auprc.csv"),
            (domain_auc_mean_df, "_dm_auc.csv"),
            (domain_auprc_mean_df, "_dm_auprc.csv"),
            (domain_auprc_ratio_mean_df, "_dm_auprc_ratio.csv"),
            (domain_auc_df, "_d_auc.csv"),
            (domain_auprc_df, "_d_auprc.csv"),
            (domain_auprc_ratio_df, "_d_auprc_ratio.csv"),
        ]
        for out_df, out_suffix in out_tables:
            out_df.to_csv(out_prefix+out_suffix, sep='\t')

print("Finished ligand "+ligand)

fold #: 1
AUC = 0.8783010306018075
AUPRC = 0.36636682847178764
AU prec prob = 0.3380073005202657
domain AUC mean = 0.8425429917549917
domain AUPRC mean = 0.5575022764102965
domain AUPRC ratio mean = 2.8433861616824867
Finished dna XGB fold: 1
Finished ligand dna


In [9]:
#imp_list4 = model.feature_importances_

#importance_dict4 = {}
#for i in range(len(features_cols)):
#    importance_dict4[features_cols[i]] = imp_list4[i]
#sorted(importance_dict4.items(), key=lambda x: x[1])