In [1]:
#Basic imports
import pandas as pd
import numpy as np
import random
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Import utils functions
# The project helpers live in a "utils" folder under the current working directory,
# so the notebook has to be launched from the directory that contains it.
curr_dir = getcwd()

# Make the helper modules importable before the project imports below.
sys.path.append(curr_dir+"/utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined
from prediction_general_funcs import ligands, get_features_cols, remove_unimportant_features
# NOTE(review): the star import supplies the generate_trials_* functions called in the
# "Generate hyperparameter trials" cell; any other names it brings in are not visible here.
from generate_hyperparameter_trials import *
from tuning_helper_functions import test_model_on_validation, test_model_on_heldout

# Inject CSS so the notebook container uses the full browser width.
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

cpu
cpu


### Reading the input dataset

In [None]:
# Location of the features table; the file name embeds the data snapshot date.
datafile_date = "08.06.18"
input_path = "%s/domains_similarity/filtered_features_table/" % curr_dir
filename = "windowed_positions_features_mediode_filter_%s.csv" % datafile_date

#flags for creating negatives
zero_prop, no_prop, all_ligands = True, True, False
# prec_th_str and folds_num are only used to build the output file path in the last cell.
prec_th_str = "dna0.5_rna0.5_ion0.75"
folds_num = 5

#Features table (tab-separated, first column is the index)
features_all = pd.read_csv(input_path + filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
# NOTE(review): the return value is ignored, so this presumably prunes
# features_all / features_cols in place -- confirm against the helper.
remove_unimportant_features(features_all, features_cols)

print("all samples positions #: %d" % features_all.shape[0])

#### Dataset of negative examples

In [None]:
# Build the negative examples; the result is indexed by ligand name in the final cell.
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

#### Datasets of positive examples by ligand

In [None]:
# Build the positive examples; the result is indexed by ligand name in the final cell.
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

### Reading env input for ligand, fold, classifier, trial index and hyperparameter search ranges

In [None]:
# Job configuration comes from environment variables so the notebook can be run
# once per (ligand, fold, classifier, trial) by a batch scheduler. Every setting
# falls back to a hard-coded default when the variable is unset or malformed.

def read_env_setting(name, default, cast=str, env=environ):
    """Return ``cast(env[name])``, or ``default`` when the variable is missing or invalid.

    Only KeyError (variable not set) and ValueError (value cannot be converted)
    trigger the fallback; anything else, e.g. a keyboard interrupt, propagates.
    """
    try:
        return cast(env[name])
    except (KeyError, ValueError):
        return default


#Reading the ligand input
ligand = read_env_setting('ligand', "ion")
print("ligand = "+ligand)

#Reading the fold input (kept as a string: it is concatenated into the output file name)
fold = read_env_setting('fold', "4")
print("fold = "+fold)

#Reading the classifier input
classifier_method = read_env_setting('classifier', "NN")
print("classifier_method = "+classifier_method)

# Reading the index to generate model
trial_idx = read_env_setting("trial", 12, int)
print("trial idx = "+ str(trial_idx))


# Search-space description per classifier. Each entry is
#   (parameter, cast, (lb, ub), (sec_lb, sec_ub), (weight_1, weight_2))
# where `cast` is the type applied to the environment value and the three tuples
# are the defaults used when the primary bounds cannot be read from the environment.
SEARCH_SPACE_SPECS = {
    "NN": [
        ("learning_rate",  int,   (-5, -3),     (-5, -4),     (1, 1)),
        ("batch_size",     int,   (30, 300),    (30, 300),    (1, 1)),
        ("weight_decay",   int,   (-25, -5),    (-25, -5),    (1, 1)),
        ("beta",           float, (0.85, 0.95), (0.85, 0.95), (1, 1)),
        ("hidden_units_1", int,   (200, 1000),  (200, 1000),  (1, 1)),
        ("hidden_units_2", int,   (350, 1000),  (350, 1000),  (1, 1)),
    ],
    "XGB": [
        ("max_depth",        int,   (1, 100),   (1, 100),   (0.5, 0.5)),
        ("min_child_weight", float, (0, 5),     (0, 5),     (0.5, 0.5)),
        ("colsample_bytree", float, (0.25, 1),  (0.25, 1),  (0.5, 0.5)),
        ("gamma",            float, (-3, 0),    (-3, 0),    (0.5, 0.5)),
        ("learning_rate",    float, (-4, -0.5), (-4, -0.5), (0.5, 0.5)),
    ],
    "RF": [
        ("n_estimators",      int, (10, 1500), (10, 1500), (1, 1)),
        ("max_depth",         int, (2, 100),   (2, 100),   (1, 1)),
        ("min_samples_leaf",  int, (1, 50),    (1, 50),    (1, 1)),
        ("min_samples_split", int, (2, 50),    (2, 50),    (1, 1)),
    ],
    "Logistic": [
        ("C", int, (-3, 1), (-3, 1), (1, 1)),
    ],
    "KNN": [
        ("n_neighbors", int, (150, 300), (450, 1000), (0.25, 0.75)),
    ],
    "ADA": [
        ("n_estimators",  int,   (100, 1500), (100, 1500), (1, 1)),
        ("learning_rate", float, (-4, -0.5),  (-4, -0.5),  (1, 1)),
    ],
    "SVM": [
        ("C",     int, (-4, 2),  (-4, 2),  (1, 1)),
        ("gamma", int, (-5, -1), (-5, -1), (1, 1)),
    ],
}

# The weight variables use "<parameter>_weight_N", except the learning rate which
# is abbreviated to "lr_weight_N".
WEIGHT_PREFIXES = {"learning_rate": "lr"}


def load_search_space(classifier, env=environ):
    """Build the {variable name: value} search-space settings for ``classifier``.

    Rules (identical for every classifier):
      * All primary ``<param>_ub`` / ``<param>_lb`` values must be readable from
        ``env``. If any one is missing or malformed, "Error: goto <classifier>
        exception" is printed and the complete hard-coded default set is returned;
        nothing read from ``env`` is kept.
      * Otherwise, for each parameter, the secondary range and the two sampling
        weights (``sec_<param>_ub``, ``sec_<param>_lb``, ``<prefix>_weight_1``,
        ``<prefix>_weight_2``) are read as a group. If any of the four is missing
        or malformed, the secondary range copies the primary one and both weights
        are set to 1.

    An unknown classifier yields an empty dict.
    """
    specs = SEARCH_SPACE_SPECS.get(classifier, [])
    space = {}

    try:
        for param, cast, _primary, _secondary, _weights in specs:
            for bound in ("ub", "lb"):
                key = "%s_%s" % (param, bound)
                space[key] = cast(env[key])
    except (KeyError, ValueError):
        print("Error: goto "+classifier+" exception")
        defaults = {}
        for param, _cast, primary, secondary, weights in specs:
            prefix = WEIGHT_PREFIXES.get(param, param)
            defaults[param + "_lb"], defaults[param + "_ub"] = primary
            defaults["sec_" + param + "_lb"], defaults["sec_" + param + "_ub"] = secondary
            defaults[prefix + "_weight_1"], defaults[prefix + "_weight_2"] = weights
        return defaults

    for param, cast, _primary, _secondary, _weights in specs:
        prefix = WEIGHT_PREFIXES.get(param, param)
        sec_ub, sec_lb = "sec_" + param + "_ub", "sec_" + param + "_lb"
        weight_1, weight_2 = prefix + "_weight_1", prefix + "_weight_2"
        try:
            group = {
                sec_ub: cast(env[sec_ub]),
                sec_lb: cast(env[sec_lb]),
                weight_1: float(env[weight_1]),
                weight_2: float(env[weight_2]),
            }
        except (KeyError, ValueError):
            group = {
                sec_ub: space[param + "_ub"],
                sec_lb: space[param + "_lb"],
                weight_1: 1,
                weight_2: 1,
            }
        space.update(group)

    return space


search_space = load_search_space(classifier_method)
# The trial-generation cell reads these settings as plain notebook-level variables
# (learning_rate_lb, sec_learning_rate_ub, lr_weight_1, ...), so publish them by name.
globals().update(search_space)
        

### Generate hyperparameter trials

Choose the hyperparameters to tune and generate their values through random search in a grid, as explained by this video: https://www.youtube.com/watch?v=WrICwRrvuIc&index=66&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Use a logarithmic scale when searching over the learning rate and weight decay for NN, as explained by this video: https://www.youtube.com/watch?v=VUbrW8OK3uo&index=67&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize nested cross validation to choose between models, as described here: https://stats.stackexchange.com/questions/266225/step-by-step-explanation-of-k-fold-cross-validation-with-grid-search-to-optimise/266229

In [None]:
# Number of random hyperparameter configurations to generate; trial_idx selects one of them.
no_trials = 100

# For every tuned hyperparameter build
#   <name>_list         = [[primary lb, primary ub], [secondary lb, secondary ub]]
#   <name>_list_weights = [weight_1, weight_2]
# from the settings read in the environment cell, then hand them to the matching
# generate_trials_* helper.
# NOTE(review): how the two ranges and weights are combined is defined inside
# generate_hyperparameter_trials, which is not visible here.
if classifier_method == "NN":
    lr_list = [[learning_rate_lb, learning_rate_ub], [sec_learning_rate_lb, sec_learning_rate_ub]]
    lr_list_weights = [lr_weight_1, lr_weight_2]

    batch_size_list = [[batch_size_lb, batch_size_ub], [sec_batch_size_lb, sec_batch_size_ub]]
    batch_size_list_weights = [batch_size_weight_1, batch_size_weight_2]

    weight_decay_list = [[weight_decay_lb, weight_decay_ub], [sec_weight_decay_lb, sec_weight_decay_ub]]
    weight_decay_list_weights = [weight_decay_weight_1, weight_decay_weight_2]

    beta_list = [[beta_lb, beta_ub], [sec_beta_lb, sec_beta_ub]]
    beta_list_weights = [beta_weight_1, beta_weight_2]

    hidden_units_1_list = [[hidden_units_1_lb, hidden_units_1_ub], [sec_hidden_units_1_lb, sec_hidden_units_1_ub]]
    hidden_units_1_list_weights = [hidden_units_1_weight_1, hidden_units_1_weight_2]

    hidden_units_2_list = [[hidden_units_2_lb, hidden_units_2_ub], [sec_hidden_units_2_lb, sec_hidden_units_2_ub]]
    hidden_units_2_list_weights = [hidden_units_2_weight_1, hidden_units_2_weight_2]

    hyperparameter_trials = generate_trials_NN(no_trials, lr_list, lr_list_weights, batch_size_list,
                                               batch_size_list_weights, weight_decay_list, weight_decay_list_weights,
                                               beta_list, beta_list_weights, hidden_units_1_list, hidden_units_1_list_weights,
                                               hidden_units_2_list, hidden_units_2_list_weights)

elif classifier_method == "XGB":
    max_depth_list = [[max_depth_lb, max_depth_ub], [sec_max_depth_lb, sec_max_depth_ub]]
    max_depth_list_weights = [max_depth_weight_1, max_depth_weight_2]

    min_child_weight_list = [[min_child_weight_lb, min_child_weight_ub], [sec_min_child_weight_lb, sec_min_child_weight_ub]]
    min_child_weight_list_weights = [min_child_weight_weight_1, min_child_weight_weight_2]

    colsample_bytree_list = [[colsample_bytree_lb, colsample_bytree_ub], [sec_colsample_bytree_lb, sec_colsample_bytree_ub]]
    colsample_bytree_list_weights = [colsample_bytree_weight_1, colsample_bytree_weight_2]

    gamma_list = [[gamma_lb, gamma_ub], [sec_gamma_lb, sec_gamma_ub]]
    gamma_list_weights = [gamma_weight_1, gamma_weight_2]

    lr_list = [[learning_rate_lb, learning_rate_ub], [sec_learning_rate_lb, sec_learning_rate_ub]]
    lr_list_weights = [lr_weight_1, lr_weight_2]

    hyperparameter_trials = generate_trials_XGB(no_trials, max_depth_list, max_depth_list_weights, min_child_weight_list,
                                                min_child_weight_list_weights, colsample_bytree_list, colsample_bytree_list_weights,
                                                gamma_list, gamma_list_weights, lr_list, lr_list_weights)

elif classifier_method == "RF":
    n_estimators_list = [[n_estimators_lb, n_estimators_ub], [sec_n_estimators_lb, sec_n_estimators_ub]]
    n_estimators_list_weights = [n_estimators_weight_1, n_estimators_weight_2]

    max_depth_list = [[max_depth_lb, max_depth_ub], [sec_max_depth_lb, sec_max_depth_ub]]
    max_depth_list_weights = [max_depth_weight_1, max_depth_weight_2]

    min_samples_leaf_list = [[min_samples_leaf_lb, min_samples_leaf_ub], [sec_min_samples_leaf_lb, sec_min_samples_leaf_ub]]
    min_samples_leaf_list_weights = [min_samples_leaf_weight_1, min_samples_leaf_weight_2]

    min_samples_split_list = [[min_samples_split_lb, min_samples_split_ub],
                              [sec_min_samples_split_lb, sec_min_samples_split_ub]]
    min_samples_split_list_weights = [min_samples_split_weight_1, min_samples_split_weight_2]

    hyperparameter_trials = generate_trials_RF(no_trials, n_estimators_list, n_estimators_list_weights, max_depth_list,
                                               max_depth_list_weights, min_samples_leaf_list, min_samples_leaf_list_weights,
                                               min_samples_split_list, min_samples_split_list_weights)

elif classifier_method == "Logistic":
    C_list = [[C_lb, C_ub], [sec_C_lb, sec_C_ub]]
    C_list_weights = [C_weight_1, C_weight_2]

    hyperparameter_trials = generate_trials_Log(no_trials, C_list, C_list_weights)

elif classifier_method == "KNN":
    n_neighbors_list = [[n_neighbors_lb, n_neighbors_ub], [sec_n_neighbors_lb, sec_n_neighbors_ub]]
    n_neighbors_list_weights = [n_neighbors_weight_1, n_neighbors_weight_2]

    hyperparameter_trials = generate_trials_KNN(no_trials, n_neighbors_list, n_neighbors_list_weights)

elif classifier_method == "ADA":
    n_estimators_list = [[n_estimators_lb, n_estimators_ub], [sec_n_estimators_lb, sec_n_estimators_ub]]
    n_estimators_list_weights = [n_estimators_weight_1, n_estimators_weight_2]

    lr_list = [[learning_rate_lb, learning_rate_ub], [sec_learning_rate_lb, sec_learning_rate_ub]]
    lr_list_weights = [lr_weight_1, lr_weight_2]

    hyperparameter_trials = generate_trials_ADA(no_trials, n_estimators_list, n_estimators_list_weights, lr_list,
                                                lr_list_weights)

elif classifier_method == "SVM":
    C_list = [[C_lb, C_ub], [sec_C_lb, sec_C_ub]]
    C_list_weights = [C_weight_1, C_weight_2]

    gamma_list = [[gamma_lb, gamma_ub], [sec_gamma_lb, sec_gamma_ub]]
    gamma_list_weights = [gamma_weight_1, gamma_weight_2]

    hyperparameter_trials = generate_trials_SVM(no_trials, C_list, C_list_weights, gamma_list, gamma_list_weights)

else:
    # Without this branch an unsupported classifier leaves hyperparameter_trials
    # undefined (or stale from an earlier run in a live kernel) and the failure
    # only shows up in a later cell.
    raise ValueError("Unknown classifier_method: %r (expected one of NN, XGB, RF, Logistic, KNN, ADA, SVM)"
                     % (classifier_method,))

In [None]:
# Show the configuration this job will evaluate (the entry selected by trial_idx).
hyperparameters = hyperparameter_trials[trial_idx]
print(hyperparameters)

### Models tested (and their hyper-parameters)

#### Predict for each ligand separately

In [None]:
import time
start_time = time.time()

# The configuration under test and the collector that both helpers below receive.
hyperparameters = hyperparameter_trials[trial_idx]
hyperparameters_output_dict = defaultdict(list)

# Positive / negative examples for the ligand handled by this job.
positives = ligands_positives_df[ligand]
negatives = ligands_negatives_df[ligand]

#Get validation params
test_model_on_validation(hyperparameters, hyperparameters_output_dict, positives, negatives,
                         ligand, classifier_method, fold, trial_idx,
                         xgb_early_stopping_rounds=500, xgb_increase_rounds_limit=500)

#Get test fold performance
# For NN and XGB the "mean_epoch_count" entry collected above is copied into the
# hyperparameters before the held-out evaluation.
if classifier_method in ("NN", "XGB"):
    hyperparameters["mean_epoch_count"] = hyperparameters_output_dict["mean_epoch_count"]
# NOTE(review): hyperparameters is passed as both the first and third argument -- confirm
# against the signature of test_model_on_heldout that this is intended.
test_model_on_heldout(hyperparameters, hyperparameters_output_dict, hyperparameters, positives, negatives,
                      ligand, classifier_method, fold)
hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_output_dict)

#Save to file
output_dir = "%s/hyperparam_tuning/phase1_initial_run/%s_%s/per_trial/" % (curr_dir, datafile_date, prec_th_str)
output_name = "%s_%s_fold%s_trial%s_%sw_hyperparameters.csv" % (ligand, classifier_method, fold,
                                                                 str(trial_idx), str(folds_num))
hyperparameters_df.to_csv(output_dir + output_name, sep='\t')

print("Finished ligand "+ligand)
print("time elapsed = "+str(time.time()-start_time))