In [1]:
#Basic imports
import pickle
import random
import sys
import time
from collections import defaultdict
from os import environ, getcwd

import numpy as np
import pandas as pd

#Import utils functions
#The project helper modules are not importable by default: two directories,
#given relative to the current working directory, are appended to sys.path and
#the imports that follow each append are expected to resolve from there.
#NOTE: curr_dir is whatever directory the kernel was started in, so every
#relative path in this notebook depends on launching it from its own folder.
curr_dir = getcwd()

#Project-wide utilities (two levels up from the working directory)
sys.path.append(curr_dir+"/../../utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
from tuning_helper_functions import test_model_on_validation, test_model_on_heldout
#Stacking utilities (one level up from the working directory)
sys.path.append(curr_dir+"/../utils")
from stacking_hyperparameter_trials import generate_trials_Log, generate_trials_XGB
from stacking_funcs import create_stacked_dataset

#Inject a CSS rule that sets elements of class "container" to 100% width.
#HTML is imported from the public IPython.display module; importing it from
#IPython.core.display is deprecated in recent IPython releases.
from IPython.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
#Run configuration: everything below is read by the later cells.

#Metric family used in the input/output directory names
#(alternative value kept for reference: "domain_auc")
#tuning_type = "domain_auc"
tuning_type = "global_auprc"
#Date tag that identifies the features table file and the result directories
datafile_date = "08.06.18"
input_path = curr_dir+"/../../domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"
#Models passed to create_stacked_dataset when building the stacked dataset
all_models_list = ["XGB", "RF", "SVM", "Logistic", "NN"]

#flags for creating negatives (passed to create_negatives_datasets_combined)
zero_prop = True
no_prop = True
all_ligands = False
#Precision-threshold tag used in the results directory name
prec_th_str = "dna0.5_rna0.5_ion0.75"
#Number of folds; only used in the output file name in this notebook
folds_num = 5

#Features table (tab-separated, first column is the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
#The return value is ignored, so this call presumably updates features_cols
#(and possibly features_all) in place - TODO confirm against the helper.
remove_unimportant_features(features_all, features_cols, update_features_cols=True)

#Parenthesised single-argument print works on both Python 2 and Python 3
print("all samples positions #: "+str(features_all.shape[0]))

all samples positions #: 44872


#### Dataset of negative examples

In [3]:
#Build the negative (non-binding) examples from the features table, using the
#zero_prop / no_prop / all_ligands flags set in the configuration cell.
#The result is indexed by ligand name later on (ligands_negatives_df[ligand]).
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697


#### Datasets of positive examples by ligand

In [4]:
#Build the positive (binding) examples from the features table.
#The result is indexed by ligand name later on (ligands_positives_df[ligand]).
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 206
rnabase #: 118
rnabackbone #: 136
rna combined #: 247
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




### Reading env input for ligand, fold, classifier and trial index

In [5]:
#Run settings come from environment variables so that the same notebook can be
#launched for many (ligand, fold, classifier, trial) combinations. Every
#setting has a default that is used when running interactively.
#
#Only KeyError (variable not set) and ValueError (value cannot be converted)
#trigger a fallback; any other exception is a real problem and is not hidden.

def read_env_value(name, default, cast=str, env=environ):
    """Return cast(env[name]), or default when the variable is unset or malformed.

    name    -- environment variable name
    default -- value returned on KeyError / ValueError
    cast    -- conversion applied to the raw string (str, int, float)
    env     -- mapping to read from (os.environ unless overridden)
    """
    try:
        return cast(env[name])
    except (KeyError, ValueError):
        return default


def read_primary_range(name, cast, env=environ):
    """Return (lower, upper) read from the variables <name>_lb and <name>_ub.

    Raises KeyError when a bound is not set and ValueError when it cannot be
    converted with cast; the caller decides how to fall back.
    """
    return cast(env[name+"_lb"]), cast(env[name+"_ub"])


def read_secondary_range(name, weight_name, cast, primary_range, env=environ):
    """Return (sec_lower, sec_upper, weight_1, weight_2) for one hyperparameter.

    Reads sec_<name>_lb, sec_<name>_ub, <weight_name>_1 and <weight_name>_2.
    If any of the four is unset or malformed, all four are discarded: the
    secondary range repeats primary_range and both weights are 1.
    """
    try:
        sec_lb = cast(env["sec_"+name+"_lb"])
        sec_ub = cast(env["sec_"+name+"_ub"])
        weight_1 = float(env[weight_name+"_1"])
        weight_2 = float(env[weight_name+"_2"])
    except (KeyError, ValueError):
        return primary_range[0], primary_range[1], 1, 1
    return sec_lb, sec_ub, weight_1, weight_2


#Reading the ligand input
ligand = read_env_value("ligand", "rna")
print("ligand = "+ligand)

#Reading the fold input (kept as a string: it is used in paths and file names)
fold = read_env_value("fold", "1")
print("fold = "+fold)

#Reading the classifier input
classifier_method = read_env_value("classifier", "XGB")
print("classifier_method = "+classifier_method)

# Reading the index to generate model
trial_idx = read_env_value("trial", 44, int)
print("trial idx = "+ str(trial_idx))


if classifier_method == "XGB":
    try:
        #All five primary ranges must be present; one missing or malformed
        #bound sends the whole configuration to the defaults below.
        max_depth_lb, max_depth_ub = read_primary_range("max_depth", int)
        min_child_weight_lb, min_child_weight_ub = read_primary_range("min_child_weight", float)
        colsample_bytree_lb, colsample_bytree_ub = read_primary_range("colsample_bytree", float)
        gamma_lb, gamma_ub = read_primary_range("gamma", float)
        learning_rate_lb, learning_rate_ub = read_primary_range("learning_rate", float)
    except (KeyError, ValueError):
        print("Error: goto XGB exception")
        max_depth_ub = 100
        max_depth_lb = 1
        min_child_weight_ub = 20
        min_child_weight_lb = 0
        colsample_bytree_ub = 1
        colsample_bytree_lb = 0.1
        gamma_ub = 1
        gamma_lb = -3
        learning_rate_ub = -0.5
        learning_rate_lb = -4

        sec_max_depth_ub = 100
        sec_max_depth_lb = 1
        sec_min_child_weight_ub = 20
        sec_min_child_weight_lb = 0
        sec_colsample_bytree_ub = 1
        sec_colsample_bytree_lb = 0.1
        sec_gamma_ub = 1
        sec_gamma_lb = -3
        sec_learning_rate_ub = -0.5
        sec_learning_rate_lb = -4

        #NOTE(review): the all-defaults branch uses weights 0.5/0.5 while the
        #per-parameter fallback uses 1/1. Kept as is - confirm that
        #generate_trials_XGB treats the weights as relative.
        max_depth_weight_1 = 0.5
        max_depth_weight_2 = 0.5
        min_child_weight_weight_1 = 0.5
        min_child_weight_weight_2 = 0.5
        colsample_bytree_weight_1 = 0.5
        colsample_bytree_weight_2 = 0.5
        gamma_weight_1 = 0.5
        gamma_weight_2 = 0.5
        lr_weight_1 = 0.5
        lr_weight_2 = 0.5
    else:
        #Secondary ranges are optional and fall back independently of each other
        (sec_max_depth_lb, sec_max_depth_ub,
         max_depth_weight_1, max_depth_weight_2) = read_secondary_range(
            "max_depth", "max_depth_weight", int, (max_depth_lb, max_depth_ub))
        (sec_min_child_weight_lb, sec_min_child_weight_ub,
         min_child_weight_weight_1, min_child_weight_weight_2) = read_secondary_range(
            "min_child_weight", "min_child_weight_weight", float,
            (min_child_weight_lb, min_child_weight_ub))
        (sec_colsample_bytree_lb, sec_colsample_bytree_ub,
         colsample_bytree_weight_1, colsample_bytree_weight_2) = read_secondary_range(
            "colsample_bytree", "colsample_bytree_weight", float,
            (colsample_bytree_lb, colsample_bytree_ub))
        (sec_gamma_lb, sec_gamma_ub,
         gamma_weight_1, gamma_weight_2) = read_secondary_range(
            "gamma", "gamma_weight", float, (gamma_lb, gamma_ub))
        (sec_learning_rate_lb, sec_learning_rate_ub,
         lr_weight_1, lr_weight_2) = read_secondary_range(
            "learning_rate", "lr_weight", float, (learning_rate_lb, learning_rate_ub))

elif classifier_method == "Logistic":
    try:
        C_lb, C_ub = read_primary_range("C", int)
    except (KeyError, ValueError):
        print("Error: goto Logistic exception")
        C_ub = 0
        C_lb = -3

        sec_C_ub = 0
        sec_C_lb = -3

        C_weight_1 = 1
        C_weight_2 = 1
    else:
        sec_C_lb, sec_C_ub, C_weight_1, C_weight_2 = read_secondary_range(
            "C", "C_weight", int, (C_lb, C_ub))

ligand = rna
fold = 1
classifier_method = XGB
trial idx = 44
Error: goto XGB exception


### Generate hyperparameter trials

Choose the hyperparameters to tune and generate trial values for them through random search within the chosen ranges, as explained in this video: https://www.youtube.com/watch?v=WrICwRrvuIc&index=66&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Use a logarithmic scale when searching over the learning rate and weight decay for NN, as explained in this video: https://www.youtube.com/watch?v=VUbrW8OK3uo&index=67&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize nested cross validation to choose between models, as described here: https://stats.stackexchange.com/questions/266225/step-by-step-explanation-of-k-fold-cross-validation-with-grid-search-to-optimise/266229

In [6]:
#Number of hyperparameter combinations to generate; trial_idx selects one of them
no_trials = 100

#Each hyperparameter is described by two [lower, upper] ranges (primary and
#secondary) plus one weight per range, all read in the env-input cell above.
if classifier_method == "XGB":
    max_depth_list = [[max_depth_lb, max_depth_ub], [sec_max_depth_lb, sec_max_depth_ub]]
    max_depth_list_weights = [max_depth_weight_1, max_depth_weight_2]

    min_child_weight_list = [[min_child_weight_lb, min_child_weight_ub],
                             [sec_min_child_weight_lb, sec_min_child_weight_ub]]
    min_child_weight_list_weights = [min_child_weight_weight_1, min_child_weight_weight_2]

    colsample_bytree_list = [[colsample_bytree_lb, colsample_bytree_ub],
                             [sec_colsample_bytree_lb, sec_colsample_bytree_ub]]
    colsample_bytree_list_weights = [colsample_bytree_weight_1, colsample_bytree_weight_2]

    gamma_list = [[gamma_lb, gamma_ub], [sec_gamma_lb, sec_gamma_ub]]
    gamma_list_weights = [gamma_weight_1, gamma_weight_2]

    lr_list = [[learning_rate_lb, learning_rate_ub], [sec_learning_rate_lb, sec_learning_rate_ub]]
    lr_list_weights = [lr_weight_1, lr_weight_2]

    hyperparameter_trials = generate_trials_XGB(no_trials, max_depth_list, max_depth_list_weights, min_child_weight_list,
                                                min_child_weight_list_weights, colsample_bytree_list, colsample_bytree_list_weights,
                                                gamma_list, gamma_list_weights, lr_list, lr_list_weights)
elif classifier_method == "Logistic":
    C_list = [[C_lb, C_ub], [sec_C_lb, sec_C_ub]]
    C_list_weights = [C_weight_1, C_weight_2]

    hyperparameter_trials = generate_trials_Log(no_trials, C_list, C_list_weights)
else:
    #Without this branch hyperparameter_trials stays undefined and the next
    #cell fails with an unrelated-looking NameError.
    raise ValueError("No hyperparameter trial generator for classifier_method = "+classifier_method)

In [7]:
#Pick the hyperparameter combination for this run.
#NOTE: this is the dict stored inside hyperparameter_trials, not a copy, so the
#two keys added below are also visible through hyperparameter_trials[trial_idx].
hyperparameters = hyperparameter_trials[trial_idx]

#Use all the models in all_models_list, but only the ligand of this run
hyperparameters["models"] = all_models_list
hyperparameters["ligands"] = [ligand]
print(hyperparameters)

{'models': ['XGB', 'RF', 'SVM', 'Logistic', 'NN'], 'colsample_bytree': 0.5392506525840591, 'gamma': 0.02645153937935788, 'scale_pos_weight': 0.1, 'learning_rate': 0.1956643967991207, 'max_depth': 35, 'min_child_weight': 2.8969551868675447, 'ligands': ['rna']}


### Create ligand positive and negative tables with stacking1 probs

In [9]:
#Directory holding the "stacking1" (1st-level) probabilities for this fold.
#The precision-threshold tag comes from prec_th_str (configuration cell) rather
#than being repeated here as a literal, so the input directory and the output
#directory used when saving results cannot drift apart.
stacking1_probs_path = (curr_dir+"/../1st_level_pred/"+datafile_date+"_comb_"+prec_th_str+"/"
                        +tuning_type+"/all_combined/"+fold+"/")
#Build the positive/negative tables for the current ligand from its
#ligands_positives_df / ligands_negatives_df entries and the probabilities path.
(positives_df_stacking_ligands, negatives_df_stacking_ligands) = create_stacked_dataset(stacking1_probs_path, hyperparameters["ligands"], hyperparameters["models"],
                                                                   ligands_positives_df[ligand], ligands_negatives_df[ligand], all_models_list)

#(features) = 766


### Models tested (and their hyper-parameters)

#### Predict for each ligand separately

In [10]:
#Time the whole run for this (ligand, classifier, fold, trial) combination.
#`time` is imported with the other modules in the first cell.
start_time = time.time()

#NOTE: this is the dict stored in hyperparameter_trials[trial_idx]; keys
#added to it in earlier cells are still present and it is modified again below.
hyperparameters = hyperparameter_trials[trial_idx]
#Collects the results; missing keys read as an empty list
hyperparameters_output_dict = defaultdict(list)

#Get validation params
test_model_on_validation(hyperparameters, hyperparameters_output_dict, positives_df_stacking_ligands, negatives_df_stacking_ligands, ligand, classifier_method, fold, trial_idx)

#Get test fold performance
#For NN / XGB the epoch count recorded during validation is handed on to the held-out run
if classifier_method in ("NN", "XGB"):
    hyperparameters["mean_epoch_count"] = hyperparameters_output_dict["mean_epoch_count"]
#NOTE(review): hyperparameters is passed twice, and the ligand tables from
#ligands_positives_df / ligands_negatives_df are passed instead of the stacked
#tables used for validation - confirm both against test_model_on_heldout.
test_model_on_heldout(hyperparameters, hyperparameters_output_dict, hyperparameters, ligands_positives_df[ligand], ligands_negatives_df[ligand], ligand, classifier_method, fold)

hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_output_dict)

#Save to file (the directory must already exist)
trial_output_dir = curr_dir+"/"+datafile_date+"_"+prec_th_str+"/"+tuning_type+"/ligand_features_probs/per_trial/"
trial_output_filename = ligand+"_"+classifier_method+"_fold"+fold+"_trial"+str(trial_idx)+"_"+str(folds_num)+"w_hyperparameters.csv"
hyperparameters_df.to_csv(trial_output_dir+trial_output_filename, sep='\t')

print("Finished ligand "+ligand)
print("time elapsed = "+str(time.time()-start_time))

fold #: 1
model.best_iteration = 39
AUPRC = 0.13666095714420293
AUC = 0.9726365488259172
domain AUC mean = 0.7523409290402377
domain AUPRC mean = 0.642271290638133
domain AUPRC ratio mean = 2.318048550582659
model.best_iteration = 15
AUPRC = 0.360051227191605
AUC = 0.9385990719568602
domain AUC mean = 0.5602406150128493
domain AUPRC mean = 0.47771436760407343
domain AUPRC ratio mean = 1.1939902269447724
model.best_iteration = 37
AUPRC = 0.06229091523538079
AUC = 0.8825666749832574
domain AUC mean = 0.8354495124530075
domain AUPRC mean = 0.6916961811188428
domain AUPRC ratio mean = 2.8344118866336685
model.best_iteration = 187
AUPRC = 0.028994163483018608
AUC = 0.7783815283122595
domain AUC mean = 0.7639145141539778
domain AUPRC mean = 0.5048260616543885
domain AUPRC ratio mean = 5.939933119290614
Finished rna XGB fold: 1 trial: 44
fold #: 1
test AUC = 0.9051327582728677
test AUPRC = 0.4994124084421825
test domain AUC mean = 0.6411999458874459
test domain AUPRC mean = 0.53058341444902
t