In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import getcwd, environ
import sys

#Import utils functions
# The utils path is built from the current working directory (getcwd), not from
# the notebook file's own location, so it only resolves when the kernel's
# working directory has a sibling "10.Prediction/utils" folder one level up.
curr_dir = getcwd()
sys.path.append(curr_dir+"/../10.Prediction/utils")
# NOTE(review): presumably both modules below live in the folder appended above — confirm.
from generate_hyperparameter_trials import generate_trials_XGB
from tuning_helper_functions import test_model_on_validation, test_model_on_heldout

### Read data

In [2]:
# Date stamp embedded in the labels file name.
date = "08.06.18"

#Read input and sort by domain
# Both tables use their first column as the index; sorting both indexes lets
# the verification below compare them position by position.
features_table = pd.read_csv(curr_dir+"/domain_features.csv", sep="\t", index_col=0).sort_index()
labels = pd.read_csv(curr_dir+"/train_domain_labels_"+date+".csv", sep="\t", index_col=0).sort_index()

#Verify input
# A row-count difference is reported explicitly: indexing labels by the feature
# row number would raise IndexError when labels is shorter and would silently
# ignore the extra rows when labels is longer.
if features_table.shape[0] != labels.shape[0]:
    print("Error: Domains do not match (%d feature rows vs. %d label rows)"
          % (features_table.shape[0], labels.shape[0]))

# Pairwise comparison over the rows both tables have in common.
for features_domain, labels_domain in zip(features_table.index, labels.index):
    if features_domain != labels_domain:
        print(features_domain)
        print(labels_domain)
        print("Error: Domains do not match")

### Read hyperparams input

In [3]:
def read_env_group(spec):
    """Read a group of environment variables as an all-or-nothing unit.

    Parameters:
        spec: sequence of (variable name, converter) pairs,
              e.g. [("gamma_ub", float), ("gamma_lb", float)].

    Returns:
        A list with the converted values, in the order given by spec, or None
        when any variable of the group is unset or cannot be converted.
    """
    try:
        return [convert(environ[name]) for name, convert in spec]
    except (KeyError, ValueError):
        # KeyError: the variable is not set; ValueError: not a valid int/float.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None


#Reading the ligand input
ligand = environ.get('ligand', "dna")
print("ligand = "+ligand)

#Reading the downsampler input
fold = environ.get('fold', "1")
print("fold = "+fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

# Reading the index to generate model
try:
    trial_idx = int(environ["trial"])
except (KeyError, ValueError):
    trial_idx = 0
print("trial idx = "+ str(trial_idx))

if classifier_method == "XGB":

    # Primary sampling ranges: all ten bounds have to be supplied together,
    # otherwise the built-in defaults below are used for everything.
    primary_bounds = read_env_group([
        ("max_depth_ub", int), ("max_depth_lb", int),
        ("min_child_weight_ub", float), ("min_child_weight_lb", float),
        ("colsample_bytree_ub", float), ("colsample_bytree_lb", float),
        ("gamma_ub", float), ("gamma_lb", float),
        ("learning_rate_ub", float), ("learning_rate_lb", float),
    ])

    if primary_bounds is None:
        print("Error: goto XGB exception")
        # NOTE(review): the negative gamma / learning-rate bounds are presumably
        # log10 exponents — confirm against generate_trials_XGB.
        max_depth_ub = 10
        max_depth_lb = 1
        min_child_weight_ub = 5
        min_child_weight_lb = 0
        colsample_bytree_ub = 1
        colsample_bytree_lb = 0.5
        gamma_ub = -1
        gamma_lb = -3
        learning_rate_ub = -0.3
        learning_rate_lb = -1

        # The secondary ranges mirror the primary defaults.
        sec_max_depth_ub = 10
        sec_max_depth_lb = 1
        sec_min_child_weight_ub = 5
        sec_min_child_weight_lb = 0
        sec_colsample_bytree_ub = 1
        sec_colsample_bytree_lb = 0.5
        sec_gamma_ub = -1
        sec_gamma_lb = -3
        sec_learning_rate_ub = -0.3
        sec_learning_rate_lb = -1

        # Equal weight for the primary and the secondary range.
        max_depth_weight_1 = 0.5
        max_depth_weight_2 = 0.5
        min_child_weight_weight_1 = 0.5
        min_child_weight_weight_2 = 0.5
        colsample_bytree_weight_1 = 0.5
        colsample_bytree_weight_2 = 0.5
        gamma_weight_1 = 0.5
        gamma_weight_2 = 0.5
        lr_weight_1 = 0.5
        lr_weight_2 = 0.5

    else:
        (max_depth_ub, max_depth_lb,
         min_child_weight_ub, min_child_weight_lb,
         colsample_bytree_ub, colsample_bytree_lb,
         gamma_ub, gamma_lb,
         learning_rate_ub, learning_rate_lb) = primary_bounds

        # Each hyperparameter may define a secondary range plus two weights.
        # If that group is incomplete, the secondary range repeats the primary
        # one and both weights are 1. A successfully read group is a non-empty
        # list and therefore truthy, so "or" only triggers on None.
        (sec_max_depth_ub, sec_max_depth_lb,
         max_depth_weight_1, max_depth_weight_2) = read_env_group([
            ("sec_max_depth_ub", int), ("sec_max_depth_lb", int),
            ("max_depth_weight_1", float), ("max_depth_weight_2", float),
        ]) or [max_depth_ub, max_depth_lb, 1, 1]

        (sec_min_child_weight_ub, sec_min_child_weight_lb,
         min_child_weight_weight_1, min_child_weight_weight_2) = read_env_group([
            ("sec_min_child_weight_ub", float), ("sec_min_child_weight_lb", float),
            ("min_child_weight_weight_1", float), ("min_child_weight_weight_2", float),
        ]) or [min_child_weight_ub, min_child_weight_lb, 1, 1]

        (sec_colsample_bytree_ub, sec_colsample_bytree_lb,
         colsample_bytree_weight_1, colsample_bytree_weight_2) = read_env_group([
            ("sec_colsample_bytree_ub", float), ("sec_colsample_bytree_lb", float),
            ("colsample_bytree_weight_1", float), ("colsample_bytree_weight_2", float),
        ]) or [colsample_bytree_ub, colsample_bytree_lb, 1, 1]

        (sec_gamma_ub, sec_gamma_lb,
         gamma_weight_1, gamma_weight_2) = read_env_group([
            ("sec_gamma_ub", float), ("sec_gamma_lb", float),
            ("gamma_weight_1", float), ("gamma_weight_2", float),
        ]) or [gamma_ub, gamma_lb, 1, 1]

        (sec_learning_rate_ub, sec_learning_rate_lb,
         lr_weight_1, lr_weight_2) = read_env_group([
            ("sec_learning_rate_ub", float), ("sec_learning_rate_lb", float),
            ("lr_weight_1", float), ("lr_weight_2", float),
        ]) or [learning_rate_ub, learning_rate_lb, 1, 1]
        

ligand = dna
fold = 1
classifier_method = XGB
trial idx = 0
Error: goto XGB exception


### Sample trials

In [4]:
# Number of hyperparameter sets requested from the sampler.
no_trials = 100

if classifier_method == "XGB":
    # For every hyperparameter: a list of two [lower, upper] ranges (primary
    # first, secondary second) ...
    max_depth_list = [
        [max_depth_lb, max_depth_ub],
        [sec_max_depth_lb, sec_max_depth_ub],
    ]
    min_child_weight_list = [
        [min_child_weight_lb, min_child_weight_ub],
        [sec_min_child_weight_lb, sec_min_child_weight_ub],
    ]
    colsample_bytree_list = [
        [colsample_bytree_lb, colsample_bytree_ub],
        [sec_colsample_bytree_lb, sec_colsample_bytree_ub],
    ]
    gamma_list = [
        [gamma_lb, gamma_ub],
        [sec_gamma_lb, sec_gamma_ub],
    ]
    lr_list = [
        [learning_rate_lb, learning_rate_ub],
        [sec_learning_rate_lb, sec_learning_rate_ub],
    ]

    # ... and a matching pair of weights, one per range.
    max_depth_list_weights = [max_depth_weight_1, max_depth_weight_2]
    min_child_weight_list_weights = [min_child_weight_weight_1, min_child_weight_weight_2]
    colsample_bytree_list_weights = [colsample_bytree_weight_1, colsample_bytree_weight_2]
    gamma_list_weights = [gamma_weight_1, gamma_weight_2]
    lr_list_weights = [lr_weight_1, lr_weight_2]

    # Arguments are passed positionally: (ranges, weights) per hyperparameter.
    hyperparameter_trials = generate_trials_XGB(
        no_trials,
        max_depth_list, max_depth_list_weights,
        min_child_weight_list, min_child_weight_list_weights,
        colsample_bytree_list, colsample_bytree_list_weights,
        gamma_list, gamma_list_weights,
        lr_list, lr_list_weights)

In [5]:
#Pick the hyperparameter set that belongs to this trial index
hyperparameters = hyperparameter_trials[trial_idx]
if (hyperparameters["scale_pos_weight"] == 0.1):
    hyperparameters["scale_pos_weight"] = 1 #dataset is not highly imbalanced anymore, 0.1 is not useful here.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(hyperparameters)

{'colsample_bytree': 0.9289728088113784, 'scale_pos_weight': 1, 'learning_rate': 0.273208739594035, 'min_child_weight': 4.2213287429050865, 'max_depth': 6, 'gamma': 0.04948840736375751}


### Arrange ligand data for prediction

In [6]:
# Column of the labels table that holds this ligand's 0/1 label.
ligand_str = ligand+"_label"
ligand_label_column = labels[ligand_str]

# Domain names (index values of the labels table) per class.
is_positive = ligand_label_column == 1
is_negative = ligand_label_column == 0
domains_positives = labels.loc[is_positive].index
domains_negatvies = labels.loc[is_negative].index

# Feature rows of the positive and of the negative domains, all columns kept.
ligand_features_positives = features_table.loc[domains_positives, :]
ligand_features_negatives = features_table.loc[domains_negatvies, :]

### Predict and save trial performance

In [7]:
#Run the trial and predict
# Shared result container: both helper calls below receive it, and it is turned
# into the saved table at the end.
hyperparameters_output_dict = defaultdict(list)

#Get validation params
test_model_on_validation(
    hyperparameters, hyperparameters_output_dict,
    ligand_features_positives, ligand_features_negatives,
    ligand, classifier_method, fold, trial_idx,
    features=[],
    xgb_early_stopping_rounds=50, xgb_increase_rounds_limit=0,
    final_model=False, whole_domain=True)

#Get test fold performance
# For NN and XGB the epoch count recorded in the result container is copied
# into the hyperparameters before the held-out run.
if classifier_method in ("NN", "XGB"):
    hyperparameters["mean_epoch_count"] = hyperparameters_output_dict["mean_epoch_count"]

test_model_on_heldout(
    hyperparameters_output_dict, hyperparameters,
    ligand_features_positives, ligand_features_negatives,
    ligand, classifier_method, fold,
    features=[], final_model=False,
    test_positives=None, test_negatives=None,
    rseed=0, whole_domain=True)
hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_output_dict)

#Save to file
# One tab-separated file per ligand / classifier / fold / trial.
trial_output_path = (curr_dir+"/hyperparam_tuning/per_trial/"
                     +ligand+"_"+classifier_method
                     +"_fold"+fold
                     +"_trial"+str(trial_idx)
                     +"_5w_hyperparameters.csv")
hyperparameters_df.to_csv(trial_output_path, sep='\t')


fold #: 1
model.best_iteration = 8
AUPRC = 0.7844453165881737
AUC = 0.9311224489795918
model.best_iteration = 2
AUPRC = 0.6431216931216931
AUC = 0.8742424242424243
model.best_iteration = 19
AUPRC = 0.5492329196255641
AUC = 0.7264150943396227
model.best_iteration = 2
AUPRC = 0.3816643882433356
AUC = 0.7577639751552796
Finished dna XGB fold: 1 trial: 0
fold #: 1
test AUC = 0.9371584699453551
test AUPRC = 0.7677179058758006
Finished dna XGB fold: 1
