In [1]:
#Basic imports
import pandas as pd
import numpy as np
import random
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Neural Net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Import utils functions
curr_dir = !pwd

sys.path.append(curr_dir[0]+"/utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
from generate_hyperparameter_trials import *


from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
curr_dir = !pwd
# --- Dataset identification and locations ---
pfam_version = "31"
datafile_date = "06.20.18"
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"

# --- Flags controlling how the negatives are created ---
zero_prop = True
no_prop = True
all_ligands = False
prec_th_str = "dna0.5_rna0.25_ion0.75"
folds_num = 5

# --- Features table (tab-separated, first column is the index) ---
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
# Names of the feature columns only, i.e. without the labels (the binding scores)
features_cols = get_features_cols(features_all)
remove_unimportant_features(features_all, features_cols)

print("all samples positions #: "+str(features_all.shape[0]))

# --- CV splits dictionary ---
# NOTE: pickle.load executes arbitrary code from the file; only load split
# files that were produced by this project.
splits_path = (curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)
               +"_folds_combined_"+prec_th_str+"_prec_dict.pik")
with open(splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 42535


#### Remove unimportant features

In [None]:
# Report the number of feature columns around the removal step.
# NOTE(review): the data-loading cell already calls remove_unimportant_features
# on the same objects, so the "before" count printed here may already reflect
# the removal - confirm whether both calls are needed.
print("# of features before removal: {}".format(len(features_cols)))
remove_unimportant_features(features_all, features_cols)
print("# of features after removal: {}".format(len(features_cols)))

#### Dataset of negative examples

In [3]:
# Build the negative (non-binding) examples from the full features table using
# the zero_prop / no_prop / all_ligands flags set in the loading cell.
# The result is later indexed by ligand name (ligands_negatives_df[ligand]).
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:41680
dnabase non-binding #:42089
dnabackbone non-binding #:41689
dna combined non binding #: 41555
rna non-binding #:41613
rnabase non-binding #:41828
rnabackbone non-binding #:41619
rna combined non binding #: 41401
peptide non-binding #:38794
ion non-binding #:37525
metabolite non-binding #:37463
sm non-binding #:30978


#### Datasets of positive examples by ligand

In [4]:
# Build the positive (binding) examples from the full features table.
# The result is later indexed by ligand name (ligands_positives_df[ligand]).
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 239
dnabase #: 170
dnabackbone #: 244
dna combined #: 353
rna #: 360
rnabase #: 246
rnabackbone #: 346
rna combined #: 468
peptide #: 462
ion #: 350
metabolite #: 504
sm #: 708




### Reading env input for ligand, fold, classifier and trial index

In [5]:
# Run configuration is taken from environment variables so the notebook can be
# launched as a batch job; each value falls back to a default when the
# variable is not set (e.g. when running interactively).

#Reading the ligand input
ligand = environ.get('ligand', "dna")
print("ligand = "+ligand)

#Reading the fold input (kept as a string; converted with int() where needed)
fold = environ.get('fold', "1")
print("fold = "+fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

# Reading the index of the hyperparameter trial to run.
# Only a missing or non-integer value falls back to the default; other
# exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
try:
    trial_idx = int(environ["trial"])
except (KeyError, ValueError):
    trial_idx = 0
print("trial idx = "+ str(trial_idx))

# Search-space bounds (upper "_ub" / lower "_lb") for the selected classifier.
# For each classifier the whole group is read from the environment; if any
# variable of the group is missing or cannot be parsed, the entire group falls
# back to the defaults below.
# Several bounds (learning rate, weight decay, gamma, C) are negative and are
# presumably exponents for a logarithmic search - confirm in
# generate_hyperparameter_trials.
if classifier_method == "NN":
    try:
        learning_rate_ub = int(environ['learning_rate_ub'])
        learning_rate_lb = int(environ['learning_rate_lb'])
        batch_size_ub = int(environ['batch_size_ub'])
        batch_size_lb = int(environ['batch_size_lb'])
        weight_decay_ub = int(environ['weight_decay_ub'])
        weight_decay_lb = int(environ['weight_decay_lb'])
        beta_ub = float(environ['beta_ub'])
        beta_lb = float(environ['beta_lb'])
        hidden_units_1_ub = int(environ['hidden_units_1_ub'])
        hidden_units_1_lb = int(environ['hidden_units_1_lb'])
        hidden_units_2_ub = int(environ['hidden_units_2_ub'])
        hidden_units_2_lb = int(environ['hidden_units_2_lb'])

    except (KeyError, ValueError):
        learning_rate_ub = -2
        learning_rate_lb = -3
        batch_size_ub = 150
        batch_size_lb = 30
        weight_decay_ub = -7
        weight_decay_lb = -17
        beta_ub = 0.95
        beta_lb = 0.85
        hidden_units_1_ub = 300
        hidden_units_1_lb = 50
        hidden_units_2_ub = 800
        hidden_units_2_lb = 350


elif classifier_method == "XGB":
    try:
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_child_weight_ub = int(environ["min_child_weight_ub"])
        min_child_weight_lb = int(environ["min_child_weight_lb"])
        colsample_bytree_ub = float(environ["colsample_bytree_ub"])
        colsample_bytree_lb = float(environ["colsample_bytree_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
        # Parsed as float: the default upper bound (-0.5) is not an integer,
        # so int() would reject it and silently discard every XGB bound
        # supplied through the environment.
        learning_rate_ub = float(environ["learning_rate_ub"])
        learning_rate_lb = float(environ["learning_rate_lb"])
    except (KeyError, ValueError):
        max_depth_ub = 100
        max_depth_lb = 1
        min_child_weight_ub = 2
        min_child_weight_lb = 0
        colsample_bytree_ub = 1
        colsample_bytree_lb = 0.25
        gamma_ub = 0
        gamma_lb = -3
        learning_rate_ub = -0.5
        learning_rate_lb = -3

elif classifier_method == "RF":
    try:
        n_estimators_ub = int(environ["n_estimators_ub"])
        n_estimators_lb = int(environ["n_estimators_lb"])
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_samples_leaf_ub = int(environ["min_samples_leaf_ub"])
        min_samples_leaf_lb = int(environ["min_samples_leaf_lb"])
        min_samples_split_ub = int(environ["min_samples_split_ub"])
        min_samples_split_lb = int(environ["min_samples_split_lb"])
    except (KeyError, ValueError):
        n_estimators_ub = 1500
        n_estimators_lb = 100
        max_depth_ub = 20
        max_depth_lb = 2
        min_samples_leaf_ub = 50
        min_samples_leaf_lb = 1
        min_samples_split_ub = 50
        min_samples_split_lb = 1

elif classifier_method == "Logistic":
    try:
        C_ub = int(environ["C_ub"])
        C_lb = int(environ["C_lb"])
    except (KeyError, ValueError):
        C_ub = 0
        C_lb = -3

elif classifier_method == "KNN":
    try:
        n_neighbors_ub = int(environ["n_neighbors_ub"])
        n_neighbors_lb = int(environ["n_neighbors_lb"])

    except (KeyError, ValueError):
        n_neighbors_ub = 100
        n_neighbors_lb = 5

elif classifier_method == "ADA":
    try:
        n_estimators_ub = int(environ["n_estimators_ub"])
        n_estimators_lb = int(environ["n_estimators_lb"])
        learning_rate_ub = int(environ["learning_rate_ub"])
        learning_rate_lb = int(environ["learning_rate_lb"])
    except (KeyError, ValueError):
        n_estimators_ub = 1500
        n_estimators_lb = 100
        learning_rate_ub = 0
        learning_rate_lb = -3

elif classifier_method == "SVM":
    try:
        C_ub = int(environ["C_ub"])
        C_lb = int(environ["C_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
    except (KeyError, ValueError):
        C_ub = 2
        C_lb = -4
        gamma_ub = 1
        gamma_lb = -5

ligand = dna
fold = 1
classifier_method = XGB
trial idx = 0


### Generate hyperparameter trials

Choose hyperparameters and generate hyperparameters through random search in a grid, as explained by this video: https://www.youtube.com/watch?v=WrICwRrvuIc&index=66&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Use a logarithmic scale when searching over the learning rate and weight decay for the NN, as explained by this video: https://www.youtube.com/watch?v=VUbrW8OK3uo&index=67&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize nested cross validation to choose between models, as described here: https://stats.stackexchange.com/questions/266225/step-by-step-explanation-of-k-fold-cross-validation-with-grid-search-to-optimise/266229

In [6]:
no_trials = 100

# One trial generator per supported classifier. Each entry is a zero-argument
# callable so that only the bounds of the selected classifier are looked up
# (the bounds of the other classifiers are never defined).
# NOTE(review): the XGB and ADA generators receive the learning-rate bounds as
# (lb, ub) whereas every other pair is passed as (ub, lb) - confirm against the
# signatures in generate_hyperparameter_trials.
trial_generators = {
    "NN": lambda: generate_trials_NN(
        no_trials, learning_rate_ub, learning_rate_lb, batch_size_ub, batch_size_lb,
        weight_decay_ub, weight_decay_lb, beta_ub, beta_lb,
        hidden_units_1_ub, hidden_units_1_lb, hidden_units_2_ub, hidden_units_2_lb),
    "XGB": lambda: generate_trials_XGB(
        no_trials, max_depth_ub, max_depth_lb, min_child_weight_ub, min_child_weight_lb,
        colsample_bytree_ub, colsample_bytree_lb, gamma_ub, gamma_lb,
        learning_rate_lb, learning_rate_ub),
    "RF": lambda: generate_trials_RF(
        no_trials, n_estimators_ub, n_estimators_lb, max_depth_ub, max_depth_lb,
        min_samples_leaf_ub, min_samples_leaf_lb, min_samples_split_ub, min_samples_split_lb),
    "Logistic": lambda: generate_trials_Log(no_trials, C_ub, C_lb),
    "KNN": lambda: generate_trials_KNN(no_trials, n_neighbors_ub, n_neighbors_lb),
    "ADA": lambda: generate_trials_ADA(
        no_trials, n_estimators_ub, n_estimators_lb, learning_rate_lb, learning_rate_ub),
    "SVM": lambda: generate_trials_SVM(no_trials, C_ub, C_lb, gamma_ub, gamma_lb),
}

if classifier_method in trial_generators:
    hyperparameter_trials = trial_generators[classifier_method]()

#print hyperparameter_trials

In [7]:
# Pick the hyperparameter combination this run evaluates and show it.
hyperparameters = hyperparameter_trials[trial_idx]
print(hyperparameters)

{'colsample_bytree': 0.883199311435763, 'scale_pos_weight': 0.1, 'learning_rate': 0.1312590772846183, 'min_child_weight': 1.1856892364500367, 'max_depth': 45, 'gamma': 0.37483216628479255}


### Define the Network

Tutorial for Neural Net Architecture: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

Utilize batch normalization, as explained here: https://www.youtube.com/watch?v=fv1Luwd-LOI&index=69&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

In [8]:
# define the network with batch normalization
class Net(nn.Module):
    """Fully connected two-class network with batch normalization.

    Layers: input -> hidden1 -> hidden2 -> hidden3 -> output (2 logits), with
    batch normalization and ReLU after each hidden layer.

    `hyperparameters` must provide the keys "hidden_units_1",
    "hidden_units_2", "batch_size", "learning_rate", "beta" and
    "weight_decay". The input width is taken from the module-level
    `features_cols`.
    """

    def __init__(self, hyperparameters):
        super(Net, self).__init__()
        hidden_units_1 = hyperparameters["hidden_units_1"]
        hidden_units_2 = hyperparameters["hidden_units_2"]
        # Input size is the number of feature columns of the data table
        self.input = nn.Linear(len(features_cols), hidden_units_1)
        self.hidden1 = nn.Linear(hidden_units_1, hidden_units_2)
        self.hidden1_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden2 = nn.Linear(hidden_units_2, hidden_units_2)
        self.hidden2_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden3 = nn.Linear(hidden_units_2, hidden_units_1)
        self.hidden3_bn = nn.BatchNorm1d(hidden_units_1)
        self.output = nn.Linear(hidden_units_1, 2)
        # Training settings used by fit()
        self.batch_size = hyperparameters["batch_size"]
        self.learning_rate = hyperparameters["learning_rate"]
        self.beta = hyperparameters["beta"]
        self.weight_decay = hyperparameters["weight_decay"]

    def forward(self, x):
        """Return the raw (un-normalized) two-class scores for a batch `x`."""
        x = F.relu(self.input(x))
        x = F.relu(self.hidden1_bn(self.hidden1(x)))
        x = F.relu(self.hidden2_bn(self.hidden2(x)))
        x = F.relu(self.hidden3_bn(self.hidden3(x)))
        x = self.output(x)
        return x

    def fit(self, X_train, y_train_label, X_valid, y_valid, weight):
        """Train with Adam on stratified mini-batches, with early stopping.

        X_train / X_valid are DataFrames of features, y_train_label / y_valid
        the matching label Series, and `weight` the per-class weight tensor
        given to the cross-entropy loss.

        Returns (max_auprc, ideal_epoch_count): the best validation AUPRC seen
        and the epoch counter at which it was reached.
        """
        # set random seed for weights and biases
        torch.manual_seed(0)

        # shuffle features and labels together, then split them again
        dataset = pd.concat([X_train, y_train_label], axis=1)
        dataset = shuffle(dataset, random_state=0)
        X_train = dataset.iloc[:, :-1]
        y_train_label = dataset.iloc[:, -1]

        # create loss function
        loss = nn.CrossEntropyLoss(weight=weight)
        # mini-batching
        batch_size = self.batch_size

        # create adam optimizer for Phase 1
        optimizer_1 = optim.Adam(self.parameters(), lr=self.learning_rate, betas=(self.beta, 0.999),
                                 weight_decay=self.weight_decay)
        # Explicit floor division: n_splits must be an integer regardless of
        # the Python version the notebook runs under.
        no_batch_minus_1 = int(X_train.shape[0] // batch_size)

        # Repeated Stratified K Fold to ensure positives are evenly distributed across batches;
        # the "test" part of every split is used as one mini-batch.
        skf_1 = RepeatedStratifiedKFold(n_splits=no_batch_minus_1, n_repeats=301, random_state=0)

        count = 0
        epoch_count = 0
        max_auprc = 0
        ideal_epoch_count = 0
        patience = 100
        patience_j = 0

        for train, test in skf_1.split(X_train, y_train_label):
            data = X_train.iloc[test, :]
            data = torch.Tensor(data.values.astype(np.float32))
            # forward pass
            output = self.forward(data)

            # `test` holds positions, so the labels are selected positionally,
            # exactly like the features above (the index was shuffled, so a
            # label-based lookup could pick different rows or fail).
            labels = y_train_label.iloc[test]
            labels = torch.Tensor(labels.values.astype(np.float32))
            labels = torch.autograd.Variable(labels).long()

            # zero the gradient buffers
            optimizer_1.zero_grad()
            # compute loss and gradients
            loss_output = loss(output, labels)
            loss_output.backward()
            # Does the update
            optimizer_1.step()

            count = count + 1

            # Early Stopping: validate once every (no_batch_minus_1 + 1) batches
            if count == no_batch_minus_1 + 1:
                count = 0
                epoch_count = epoch_count + 1
                probs = self.predict_proba(X_valid)
                precision, recall, _ = precision_recall_curve(y_valid, probs)
                auprc = auc(recall, precision)
                if auprc > max_auprc:
                    max_auprc = auprc
                    ideal_epoch_count = epoch_count
                    # each improvement extends the patience budget
                    patience = patience + epoch_count
                    patience_j = 0
                else:
                    patience_j = patience_j + 1
                    if patience_j == patience: break

                # predict_proba switched to eval mode; go back to training mode
                self.train()

        return max_auprc, ideal_epoch_count

    #prediction probabilities array
    def predict_proba(self, X_test):
        """Return the probability of class 1 for every row of DataFrame `X_test`.

        Puts the model in evaluation mode and leaves it there.
        """
        self.eval()
        #forward pass
        test = torch.Tensor(X_test.values.astype(np.float32))
        output = self.forward(test)
        # Normalize over the class dimension explicitly
        sf = nn.Softmax(dim=1)
        probs = sf(output.data)
        return probs[:, 1]

### Models tested (and their hyper-parameters)

In [9]:
def generate_model(classifier_method, hyperparameters, no_pos=1, no_neg=1):
    """Build an untrained model for `classifier_method` from `hyperparameters`.

    classifier_method: one of "XGB", "RF", "Logistic", "KNN", "ADA", "SVM", "NN".
    hyperparameters:   dict with the keys required by the chosen classifier.
    no_pos, no_neg:    number of positive / negative training samples; only
                       used by XGB when scale_pos_weight is "balanced".

    Returns the model instance. Raises ValueError for an unknown
    classifier_method (previously this surfaced as an UnboundLocalError on
    the return statement).
    """
    # Upper bound on the number of XGB trees
    xgb_trees_limit = 5000

    if classifier_method == "XGB":
        if hyperparameters["scale_pos_weight"] == "balanced":
            # weight positives by the negative/positive ratio
            scale_weight = no_neg/float(no_pos)
        else:
            scale_weight = hyperparameters["scale_pos_weight"]
        return XGBClassifier(n_estimators=xgb_trees_limit, n_jobs=-1, random_state=0,
                             max_depth=hyperparameters["max_depth"],
                             min_child_weight=hyperparameters["min_child_weight"],
                             colsample_bytree=hyperparameters["colsample_bytree"],
                             gamma=hyperparameters["gamma"],
                             learning_rate=hyperparameters["learning_rate"],
                             scale_pos_weight=scale_weight)

    if classifier_method == "RF":
        return RandomForestClassifier(n_estimators=hyperparameters["n_estimators"], n_jobs=-1, random_state=0,
                                      max_depth=hyperparameters["max_depth"],
                                      min_samples_leaf=hyperparameters["min_samples_leaf"],
                                      min_samples_split=hyperparameters["min_samples_split"],
                                      class_weight=hyperparameters["class_weight"])

    if classifier_method == "Logistic":
        return LogisticRegression(C=hyperparameters["C"], random_state=0, n_jobs=-1,
                                  class_weight=hyperparameters["class_weight"])

    if classifier_method == "KNN":
        return KNeighborsClassifier(n_neighbors=hyperparameters["n_neighbors"], n_jobs=-1,
                                    weights=hyperparameters["weights"])

    if classifier_method == "ADA":
        return AdaBoostClassifier(n_estimators=hyperparameters["n_estimators"], random_state=0,
                                  learning_rate=hyperparameters["learning_rate"])

    if classifier_method == "SVM":
        return SVC(C=hyperparameters["C"], gamma=hyperparameters["gamma"], kernel=hyperparameters["kernel"],
                   probability=True, random_state=0, cache_size=400,
                   class_weight=hyperparameters["class_weight"])

    if classifier_method == "NN":
        torch.manual_seed(0)
        model = Net(hyperparameters)
        # sets model in training mode because batch normalization behavior in training and testing modes are different
        model.train()
        return model

    raise ValueError("Unknown classifier_method: "+str(classifier_method))

#### Dealing with model imbalance
Weight Vector: https://towardsdatascience.com/dealing-with-imbalanced-classes-in-machine-learning-d43d6fa19d2 (look at section on "Cost-sensitive Learning")

Implementing Early Stopping for XGBoost: https://cambridgespark.com/content/tutorials/hyperparameter-tuning-in-xgboost/index.html

In [16]:
def test_model_iterative_fixed(hyperparameters_dict,ligand_bind_features, ligand_negatives_features, ligand_name, features=[]):

    """
    Evaluate the current hyperparameter trial with inner cross-validation.

    The outer fold is selected by the module-level `fold`. Each of the other
    (folds_num - 1) folds is used once as the validation set, while the model
    is trained on the outer training set minus that validation fold.

    Parameters:
        hyperparameters_dict: dict-like that receives the results. Hyperparameter
            values are appended under their own keys, so missing keys must
            default to a list (e.g. defaultdict(list)).
        ligand_bind_features: DataFrame of positive examples.
        ligand_negatives_features: DataFrame of negative examples.
        ligand_name: ligand label used in the progress message.
        features: optional column selector applied with .iloc; empty means all columns.

    Reads the module-level `classifier_method`, `hyperparameters`, `fold`,
    `folds_num`, `splits_dict` and `trial_idx`.

    Writes mean/variance of AUPRC and AUC (plus the mean epoch count for NN and
    XGB) into `hyperparameters_dict`. Returns None.
    """

    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    models_req_scaling = ["SVM", "KNN", "Logistic", "NN"]
    classifier = classifier_method

    #Create X and y with included features (positives first, then negatives)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])
    y_df = pd.DataFrame({"label": y}, index=X.index)

    #Get the fold indices
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    k = (int(fold)-1)

    print("fold #: "+str(k+1))
    full_train_index = cv_idx[k]["train"]

    # phase 1: testing on validation set, hyperparameter tuning
    inner_folds = folds_num-1
    trial_auprc_results = np.zeros(inner_folds)
    trial_auc_results = np.zeros(inner_folds)
    epoch_counts = np.zeros(inner_folds, dtype = "int")
    for i in range(inner_folds):
        valid_k = (k + 1 + i) % folds_num
        valid_index = cv_idx[valid_k]["test"]

        # set membership keeps this linear in the number of samples
        valid_lookup = set(valid_index)
        train_index = [index for index in full_train_index if index not in valid_lookup]
        X_train, X_valid = X.loc[train_index,:], X.loc[valid_index,:]
        y_train, y_valid = y_df.loc[train_index,:], y_df.loc[valid_index,:]

        if (classifier in models_req_scaling):
            cols = X_train.columns

            # phase 1 scaling with just training data
            scaler_1 = StandardScaler()
            scaler_1.fit(X_train)
            X_train = pd.DataFrame(scaler_1.transform(X_train))
            # apply same transformation to validation data
            X_valid = pd.DataFrame(scaler_1.transform(X_valid))

            #Restoring indices after scaling
            X_train.index = train_index
            X_valid.index = valid_index

            #Restoring features names
            X_train.columns = cols
            X_valid.columns = cols

        #No down-sampling
        X_train_sampled = X_train
        y_train_sampled = y_train

        #pos and neg numbers in the training
        no_pos = np.count_nonzero(y_train_sampled["label"] == 1)
        no_neg = np.count_nonzero(y_train_sampled["label"] == 0)

        #fit to training data
        if (classifier == "NN"):
            weight_mode = hyperparameters["weight"]
            if weight_mode == "balanced":
                #weight vector
                neg_weight = float(no_pos) / float(no_neg + no_pos)
                pos_weight = 1 - neg_weight
            elif weight_mode == "0.1":
                neg_weight = 10
                pos_weight = 1
            elif weight_mode == "None":
                neg_weight = 1
                pos_weight = 1
            else:
                raise ValueError("Unknown NN weight setting: "+str(weight_mode))

            weight = torch.Tensor([neg_weight, pos_weight])
            model = generate_model(classifier_method, hyperparameters)
            auprc_score,epoch_count = model.fit(X_train_sampled, y_train_sampled["label"],X_valid, y_valid["label"], weight)
            # fit() only reports the best AUPRC; the AUC is computed from the
            # network as it is when training stops.
            auc_score = roc_auc_score(y_valid["label"], np.asarray(model.predict_proba(X_valid)))

        elif (classifier == "XGB"):
            num_early_stopping_rounds = 750
            model = generate_model(classifier_method, hyperparameters, no_pos = no_pos, no_neg = no_neg)
            model.fit(X_train_sampled, y_train_sampled["label"], eval_set = [(X_valid,y_valid["label"])], eval_metric = "map",
                      verbose=False, early_stopping_rounds = num_early_stopping_rounds)
            # probability of the positive class, using only the trees up to the best iteration
            probs_pos = model.predict_proba(X_valid, ntree_limit=model.best_ntree_limit)[:, 1]
            precision, recall, _ = precision_recall_curve(y_valid["label"], probs_pos)
            auprc_score = auc(recall, precision)
            auc_score = roc_auc_score(y_valid["label"], probs_pos)
            print("model.best_iteration = "+str(model.best_iteration))
            epoch_count = model.best_ntree_limit

        else:
            model = generate_model(classifier_method, hyperparameters)
            model.fit(X_train_sampled, y_train_sampled["label"])
            # probability of the positive class
            probs_pos = model.predict_proba(X_valid)[:, 1]
            precision, recall, _ = precision_recall_curve(y_valid["label"], probs_pos)
            auprc_score = auc(recall, precision)
            auc_score = roc_auc_score(y_valid["label"], probs_pos)

        print("AUPRC = "+str(auprc_score))
        print("AUC = "+str(auc_score))
        trial_auprc_results[i] = auprc_score
        trial_auc_results[i] = auc_score
        if classifier == "NN" or classifier == "XGB": epoch_counts[i] = epoch_count

    # Aggregate the inner-fold results
    if classifier == "NN" or classifier == "XGB":
        hyperparameters_dict["mean_epoch_count"] = int(np.mean(epoch_counts))

    hyperparameters_dict["mean_AUPRC"] = np.mean(trial_auprc_results)
    hyperparameters_dict["mean_AUC"] = np.mean(trial_auc_results)
    hyperparameters_dict["var_AUPRC"] = np.var(trial_auprc_results)
    hyperparameters_dict["var_AUC"] = np.var(trial_auc_results)
    hyperparameters_dict["trial_idx"] = trial_idx

    # Update dictionary with all hyperparameters
    for key in hyperparameters.keys():
        hyperparameters_dict[key].append(hyperparameters[key])

    print("Finished "+ligand_name+" "+classifier+" fold: "+fold+" trial: "+str(trial_idx))

#### Predict for each ligand separately

In [17]:
%%time

# Collect the results of this trial; missing keys start as empty lists
hyperparameters_dict = defaultdict(list)

test_model_iterative_fixed(
    hyperparameters_dict,
    ligands_positives_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
)

hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_dict)

# Save to file: one tab-separated results file per ligand / classifier / fold / trial
results_dir = curr_dir[0]+"/hyperparam_tuning/phase1_initial_run/"+datafile_date+"_"+prec_th_str+"/per_trial/"
results_file = ligand+"_"+classifier_method+"_fold"+fold+"_trial"+str(trial_idx)+"_"+str(folds_num)+"w_hyperparameters.csv"
hyperparameters_df.to_csv(results_dir+results_file, sep='\t')

print("Finished ligand "+ligand)

fold #: 1
[0]	validation_0-map:0.015775
Will train until validation_0-map hasn't improved in 750 rounds.
[1]	validation_0-map:0.015775
[2]	validation_0-map:0.015775
[3]	validation_0-map:0.015775
[4]	validation_0-map:0.015775
[5]	validation_0-map:0.015775
[6]	validation_0-map:0.015484
[7]	validation_0-map:0.015484
[8]	validation_0-map:0.010538
[9]	validation_0-map:0.010538
[10]	validation_0-map:0.020102
[11]	validation_0-map:0.034602
[12]	validation_0-map:0.016986
[13]	validation_0-map:0.016133
[14]	validation_0-map:0.015405
[15]	validation_0-map:0.015425
[16]	validation_0-map:0.016414
[17]	validation_0-map:0.018584
[18]	validation_0-map:0.019671
[19]	validation_0-map:0.01965
[20]	validation_0-map:0.01881
[21]	validation_0-map:0.019923
[22]	validation_0-map:0.018862
[23]	validation_0-map:0.019418
[24]	validation_0-map:0.018061
[25]	validation_0-map:0.018026
[26]	validation_0-map:0.017042
[27]	validation_0-map:0.036777
[28]	validation_0-map:0.041221
[29]	validation_0-map:0.037838
[30]	va

[258]	validation_0-map:0.123674
[259]	validation_0-map:0.123674
[260]	validation_0-map:0.123674
[261]	validation_0-map:0.123674
[262]	validation_0-map:0.123674
[263]	validation_0-map:0.123674
[264]	validation_0-map:0.123674
[265]	validation_0-map:0.123674
[266]	validation_0-map:0.123674
[267]	validation_0-map:0.123674
[268]	validation_0-map:0.123674
[269]	validation_0-map:0.123674
[270]	validation_0-map:0.123674
[271]	validation_0-map:0.123674
[272]	validation_0-map:0.123674
[273]	validation_0-map:0.123674
[274]	validation_0-map:0.123674
[275]	validation_0-map:0.123674
[276]	validation_0-map:0.123674
[277]	validation_0-map:0.123674
[278]	validation_0-map:0.123674
[279]	validation_0-map:0.123674
[280]	validation_0-map:0.123674
[281]	validation_0-map:0.123674
[282]	validation_0-map:0.121529
[283]	validation_0-map:0.121529
[284]	validation_0-map:0.121529
[285]	validation_0-map:0.121529
[286]	validation_0-map:0.121529
[287]	validation_0-map:0.121529
[288]	validation_0-map:0.121529
[289]	va

[515]	validation_0-map:0.120079
[516]	validation_0-map:0.120079
[517]	validation_0-map:0.120079
[518]	validation_0-map:0.120079
[519]	validation_0-map:0.120079
[520]	validation_0-map:0.120079
[521]	validation_0-map:0.120079
[522]	validation_0-map:0.120079
[523]	validation_0-map:0.120079
[524]	validation_0-map:0.120079
[525]	validation_0-map:0.120079
[526]	validation_0-map:0.120079
[527]	validation_0-map:0.120079
[528]	validation_0-map:0.120079
[529]	validation_0-map:0.120079
[530]	validation_0-map:0.120079
[531]	validation_0-map:0.120079
[532]	validation_0-map:0.120079
[533]	validation_0-map:0.120079
[534]	validation_0-map:0.120079
[535]	validation_0-map:0.120079
[536]	validation_0-map:0.120079
[537]	validation_0-map:0.120079
[538]	validation_0-map:0.120079
[539]	validation_0-map:0.120079
[540]	validation_0-map:0.120079
[541]	validation_0-map:0.120079
[542]	validation_0-map:0.120079
[543]	validation_0-map:0.120079
[544]	validation_0-map:0.120079
[545]	validation_0-map:0.120079
[546]	va

[772]	validation_0-map:0.120079
[773]	validation_0-map:0.120079
[774]	validation_0-map:0.120079
[775]	validation_0-map:0.120079
[776]	validation_0-map:0.120079
[777]	validation_0-map:0.120079
[778]	validation_0-map:0.120079
[779]	validation_0-map:0.120079
[780]	validation_0-map:0.120079
[781]	validation_0-map:0.120079
[782]	validation_0-map:0.120079
[783]	validation_0-map:0.120079
[784]	validation_0-map:0.120079
[785]	validation_0-map:0.120079
[786]	validation_0-map:0.120079
[787]	validation_0-map:0.120079
[788]	validation_0-map:0.120079
[789]	validation_0-map:0.120079
[790]	validation_0-map:0.120079
[791]	validation_0-map:0.120079
[792]	validation_0-map:0.120079
[793]	validation_0-map:0.120079
[794]	validation_0-map:0.120079
[795]	validation_0-map:0.120079
[796]	validation_0-map:0.120079
[797]	validation_0-map:0.120079
[798]	validation_0-map:0.120079
[799]	validation_0-map:0.120079
[800]	validation_0-map:0.120079
[801]	validation_0-map:0.120079
[802]	validation_0-map:0.120079
[803]	va

[197]	validation_0-map:0.125344
[198]	validation_0-map:0.125344
[199]	validation_0-map:0.125344
[200]	validation_0-map:0.125344
[201]	validation_0-map:0.125344
[202]	validation_0-map:0.125344
[203]	validation_0-map:0.125344
[204]	validation_0-map:0.125344
[205]	validation_0-map:0.125344
[206]	validation_0-map:0.125344
[207]	validation_0-map:0.125344
[208]	validation_0-map:0.125344
[209]	validation_0-map:0.125344
[210]	validation_0-map:0.125344
[211]	validation_0-map:0.125344
[212]	validation_0-map:0.125344
[213]	validation_0-map:0.125344
[214]	validation_0-map:0.125344
[215]	validation_0-map:0.125344
[216]	validation_0-map:0.125344
[217]	validation_0-map:0.125344
[218]	validation_0-map:0.125344
[219]	validation_0-map:0.125344
[220]	validation_0-map:0.125344
[221]	validation_0-map:0.125344
[222]	validation_0-map:0.127289
[223]	validation_0-map:0.127289
[224]	validation_0-map:0.126771
[225]	validation_0-map:0.126771
[226]	validation_0-map:0.126771
[227]	validation_0-map:0.126771
[228]	va

[454]	validation_0-map:0.125221
[455]	validation_0-map:0.125221
[456]	validation_0-map:0.125221
[457]	validation_0-map:0.125221
[458]	validation_0-map:0.125221
[459]	validation_0-map:0.125221
[460]	validation_0-map:0.125221
[461]	validation_0-map:0.125221
[462]	validation_0-map:0.125221
[463]	validation_0-map:0.125221
[464]	validation_0-map:0.125221
[465]	validation_0-map:0.125221
[466]	validation_0-map:0.125221
[467]	validation_0-map:0.125221
[468]	validation_0-map:0.125221
[469]	validation_0-map:0.125221
[470]	validation_0-map:0.125221
[471]	validation_0-map:0.125221
[472]	validation_0-map:0.125221
[473]	validation_0-map:0.125221
[474]	validation_0-map:0.125221
[475]	validation_0-map:0.125221
[476]	validation_0-map:0.125221
[477]	validation_0-map:0.125221
[478]	validation_0-map:0.125221
[479]	validation_0-map:0.125221
[480]	validation_0-map:0.125221
[481]	validation_0-map:0.125221
[482]	validation_0-map:0.125221
[483]	validation_0-map:0.125221
[484]	validation_0-map:0.125221
[485]	va

[711]	validation_0-map:0.125689
[712]	validation_0-map:0.125689
[713]	validation_0-map:0.125689
[714]	validation_0-map:0.125689
[715]	validation_0-map:0.125689
[716]	validation_0-map:0.125689
[717]	validation_0-map:0.125689
[718]	validation_0-map:0.125689
[719]	validation_0-map:0.125689
[720]	validation_0-map:0.125689
[721]	validation_0-map:0.125689
[722]	validation_0-map:0.125689
[723]	validation_0-map:0.125689
[724]	validation_0-map:0.125689
[725]	validation_0-map:0.125689
[726]	validation_0-map:0.125689
[727]	validation_0-map:0.125689
[728]	validation_0-map:0.125689
[729]	validation_0-map:0.125689
[730]	validation_0-map:0.125689
[731]	validation_0-map:0.125689
[732]	validation_0-map:0.125689
[733]	validation_0-map:0.125689
[734]	validation_0-map:0.125689
[735]	validation_0-map:0.125689
[736]	validation_0-map:0.125689
[737]	validation_0-map:0.125689
[738]	validation_0-map:0.125689
[739]	validation_0-map:0.125689
[740]	validation_0-map:0.125689
[741]	validation_0-map:0.125689
[742]	va

[102]	validation_0-map:0.283281
[103]	validation_0-map:0.289118
[104]	validation_0-map:0.287352
[105]	validation_0-map:0.292185
[106]	validation_0-map:0.288608
[107]	validation_0-map:0.294743
[108]	validation_0-map:0.297231
[109]	validation_0-map:0.298794
[110]	validation_0-map:0.297281
[111]	validation_0-map:0.295201
[112]	validation_0-map:0.29646
[113]	validation_0-map:0.296183
[114]	validation_0-map:0.30044
[115]	validation_0-map:0.294943
[116]	validation_0-map:0.296327
[117]	validation_0-map:0.299505
[118]	validation_0-map:0.301095
[119]	validation_0-map:0.301353
[120]	validation_0-map:0.304905
[121]	validation_0-map:0.305987
[122]	validation_0-map:0.300344
[123]	validation_0-map:0.304845
[124]	validation_0-map:0.303968
[125]	validation_0-map:0.304691
[126]	validation_0-map:0.306998
[127]	validation_0-map:0.306814
[128]	validation_0-map:0.306894
[129]	validation_0-map:0.307734
[130]	validation_0-map:0.312333
[131]	validation_0-map:0.316593
[132]	validation_0-map:0.316593
[133]	vali

[359]	validation_0-map:0.331468
[360]	validation_0-map:0.331034
[361]	validation_0-map:0.330528
[362]	validation_0-map:0.330528
[363]	validation_0-map:0.330528
[364]	validation_0-map:0.330528
[365]	validation_0-map:0.330528
[366]	validation_0-map:0.330528
[367]	validation_0-map:0.330528
[368]	validation_0-map:0.330528
[369]	validation_0-map:0.330528
[370]	validation_0-map:0.330528
[371]	validation_0-map:0.330528
[372]	validation_0-map:0.330528
[373]	validation_0-map:0.332101
[374]	validation_0-map:0.332101
[375]	validation_0-map:0.332101
[376]	validation_0-map:0.332101
[377]	validation_0-map:0.332101
[378]	validation_0-map:0.332101
[379]	validation_0-map:0.332101
[380]	validation_0-map:0.332101
[381]	validation_0-map:0.332101
[382]	validation_0-map:0.332101
[383]	validation_0-map:0.332101
[384]	validation_0-map:0.332101
[385]	validation_0-map:0.332101
[386]	validation_0-map:0.333373
[387]	validation_0-map:0.333373
[388]	validation_0-map:0.333373
[389]	validation_0-map:0.331913
[390]	va

[616]	validation_0-map:0.331614
[617]	validation_0-map:0.331614
[618]	validation_0-map:0.331614
[619]	validation_0-map:0.331614
[620]	validation_0-map:0.331614
[621]	validation_0-map:0.331614
[622]	validation_0-map:0.331614
[623]	validation_0-map:0.331614
[624]	validation_0-map:0.331614
[625]	validation_0-map:0.331614
[626]	validation_0-map:0.331614
[627]	validation_0-map:0.331614
[628]	validation_0-map:0.331614
[629]	validation_0-map:0.331614
[630]	validation_0-map:0.331614
[631]	validation_0-map:0.331614
[632]	validation_0-map:0.331614
[633]	validation_0-map:0.331614
[634]	validation_0-map:0.331614
[635]	validation_0-map:0.331614
[636]	validation_0-map:0.331614
[637]	validation_0-map:0.331614
[638]	validation_0-map:0.331614
[639]	validation_0-map:0.331614
[640]	validation_0-map:0.331614
[641]	validation_0-map:0.331614
[642]	validation_0-map:0.331614
[643]	validation_0-map:0.331614
[644]	validation_0-map:0.331614
[645]	validation_0-map:0.331614
[646]	validation_0-map:0.331614
[647]	va

[873]	validation_0-map:0.331614
[874]	validation_0-map:0.331614
[875]	validation_0-map:0.331614
[876]	validation_0-map:0.331614
[877]	validation_0-map:0.331614
[878]	validation_0-map:0.331614
[879]	validation_0-map:0.331614
[880]	validation_0-map:0.331614
[881]	validation_0-map:0.331614
[882]	validation_0-map:0.331614
[883]	validation_0-map:0.331614
[884]	validation_0-map:0.331614
[885]	validation_0-map:0.331614
[886]	validation_0-map:0.331614
[887]	validation_0-map:0.331614
[888]	validation_0-map:0.331614
[889]	validation_0-map:0.331614
[890]	validation_0-map:0.331614
[891]	validation_0-map:0.331614
[892]	validation_0-map:0.331614
[893]	validation_0-map:0.331614
[894]	validation_0-map:0.331614
[895]	validation_0-map:0.331614
[896]	validation_0-map:0.331614
[897]	validation_0-map:0.331614
[898]	validation_0-map:0.331614
[899]	validation_0-map:0.331614
[900]	validation_0-map:0.331614
[901]	validation_0-map:0.331614
[902]	validation_0-map:0.331614
[903]	validation_0-map:0.331614
[904]	va

[1126]	validation_0-map:0.331614
[1127]	validation_0-map:0.331614
[1128]	validation_0-map:0.331614
[1129]	validation_0-map:0.331614
[1130]	validation_0-map:0.331614
[1131]	validation_0-map:0.331614
[1132]	validation_0-map:0.331614
[1133]	validation_0-map:0.331614
[1134]	validation_0-map:0.331614
[1135]	validation_0-map:0.331614
[1136]	validation_0-map:0.331614
[1137]	validation_0-map:0.331614
[1138]	validation_0-map:0.331614
[1139]	validation_0-map:0.331614
[1140]	validation_0-map:0.331614
[1141]	validation_0-map:0.331614
Stopping. Best iteration:
[391]	validation_0-map:0.333706

model.best_iteration = 391
AUPRC = 0.3296335401276381
AUC = 0.9127792032326838
[0]	validation_0-map:0.015566
Will train until validation_0-map hasn't improved in 750 rounds.
[1]	validation_0-map:0.015566
[2]	validation_0-map:0.015566
[3]	validation_0-map:0.015566
[4]	validation_0-map:0.015566
[5]	validation_0-map:0.015566
[6]	validation_0-map:0.015566
[7]	validation_0-map:0.015333
[8]	validation_0-map:0.015096

[238]	validation_0-map:0.479996
[239]	validation_0-map:0.479996
[240]	validation_0-map:0.479996
[241]	validation_0-map:0.479996
[242]	validation_0-map:0.479996
[243]	validation_0-map:0.479996
[244]	validation_0-map:0.479996
[245]	validation_0-map:0.479996
[246]	validation_0-map:0.482767
[247]	validation_0-map:0.482767
[248]	validation_0-map:0.482767
[249]	validation_0-map:0.482767
[250]	validation_0-map:0.482767
[251]	validation_0-map:0.482767
[252]	validation_0-map:0.482767
[253]	validation_0-map:0.482767
[254]	validation_0-map:0.482767
[255]	validation_0-map:0.482767
[256]	validation_0-map:0.486584
[257]	validation_0-map:0.486584
[258]	validation_0-map:0.486584
[259]	validation_0-map:0.486584
[260]	validation_0-map:0.486584
[261]	validation_0-map:0.486584
[262]	validation_0-map:0.486584
[263]	validation_0-map:0.486584
[264]	validation_0-map:0.486584
[265]	validation_0-map:0.486584
[266]	validation_0-map:0.486584
[267]	validation_0-map:0.486584
[268]	validation_0-map:0.486584
[269]	va

[495]	validation_0-map:0.482562
[496]	validation_0-map:0.482562
[497]	validation_0-map:0.482562
[498]	validation_0-map:0.482562
[499]	validation_0-map:0.482562
[500]	validation_0-map:0.482562
[501]	validation_0-map:0.482562
[502]	validation_0-map:0.482562
[503]	validation_0-map:0.482562
[504]	validation_0-map:0.482562
[505]	validation_0-map:0.482562
[506]	validation_0-map:0.482562
[507]	validation_0-map:0.482562
[508]	validation_0-map:0.482562
[509]	validation_0-map:0.482562
[510]	validation_0-map:0.482562
[511]	validation_0-map:0.482562
[512]	validation_0-map:0.482562
[513]	validation_0-map:0.482562
[514]	validation_0-map:0.482562
[515]	validation_0-map:0.482562
[516]	validation_0-map:0.482562
[517]	validation_0-map:0.482562
[518]	validation_0-map:0.482562
[519]	validation_0-map:0.482562
[520]	validation_0-map:0.482562
[521]	validation_0-map:0.482562
[522]	validation_0-map:0.482562
[523]	validation_0-map:0.482562
[524]	validation_0-map:0.482562
[525]	validation_0-map:0.482562
[526]	va

[752]	validation_0-map:0.482562
[753]	validation_0-map:0.482562
[754]	validation_0-map:0.482562
[755]	validation_0-map:0.482562
[756]	validation_0-map:0.482562
[757]	validation_0-map:0.482562
[758]	validation_0-map:0.482562
[759]	validation_0-map:0.482562
[760]	validation_0-map:0.482562
[761]	validation_0-map:0.482562
[762]	validation_0-map:0.482562
[763]	validation_0-map:0.482562
[764]	validation_0-map:0.482562
[765]	validation_0-map:0.482562
[766]	validation_0-map:0.482562
[767]	validation_0-map:0.482562
[768]	validation_0-map:0.482562
[769]	validation_0-map:0.482562
[770]	validation_0-map:0.482562
[771]	validation_0-map:0.482562
[772]	validation_0-map:0.482562
[773]	validation_0-map:0.482562
[774]	validation_0-map:0.482562
[775]	validation_0-map:0.482562
[776]	validation_0-map:0.482562
[777]	validation_0-map:0.482562
[778]	validation_0-map:0.482562
[779]	validation_0-map:0.482562
[780]	validation_0-map:0.482562
[781]	validation_0-map:0.482562
[782]	validation_0-map:0.482562
[783]	va