In [43]:
#Basic imports
import pandas as pd
import numpy as np
import random
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Neural Net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Import utils functions
curr_dir = !pwd

sys.path.append(curr_dir[0]+"/utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
from generate_hyperparameter_trials import *


from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [44]:
curr_dir = !pwd
pfam_version = "31"
datafile_date = "06.20.18"
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"

#flags for creating negatives
zero_prop = True
no_prop = True
all_ligands = False
prec_th_str = "dna0.5_rna0.25_ion0.75"
folds_num = 5

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
remove_unimportant_features(features_all, features_cols)

print "all samples positions #: "+str(features_all.shape[0])

#CV splits dictionary
# with open(curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_"+str(prec_th)+"_prec_dict.pik", 'rb') as handle:
#         splits_dict = pickle.load(handle)
with open(curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_combined_"+prec_th_str+"_prec_dict.pik", 'rb') as handle:
        splits_dict = pickle.load(handle)

all samples positions #: 42535


#### Dataset of negative examples

In [45]:
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:41680
dnabase non-binding #:42089
dnabackbone non-binding #:41689
dna combined non binding #: 41555
rna non-binding #:41613
rnabase non-binding #:41828
rnabackbone non-binding #:41619
rna combined non binding #: 41401
peptide non-binding #:38794
ion non-binding #:37525
metabolite non-binding #:37463
sm non-binding #:30978


#### Datasets of positive examples by ligand

In [46]:
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 239
dnabase #: 170
dnabackbone #: 244
dna combined #: 353
rna #: 360
rnabase #: 246
rnabackbone #: 346
rna combined #: 468
peptide #: 462
ion #: 350
metabolite #: 504
sm #: 708


### Reading env input for downsampler technique, ligand and classifier

In [47]:
#Reading the ligand input
try:
    ligand = environ['ligand']
except:
    ligand = "dna"
print "ligand = "+ligand
    
#Reading the downsampler input
try: 
    fold = environ['fold']
except:
    fold = "1"
print "fold = "+fold

#Reading the classifier input
try: 
    classifier_method = environ['classifier']
except:
    classifier_method = "XGB"
print "classifier_method = "+classifier_method

# Reading the index to generate model
try:
    trial_idx = int(environ["trial"])
except:
    trial_idx = 0
print "trial idx = "+ str(trial_idx)

if classifier_method == "NN":
    try:
        learning_rate_ub = int(environ['learning_rate_ub'])
        learning_rate_lb = int(environ['learning_rate_lb'])
        batch_size_ub = int(environ['batch_size_ub'])
        batch_size_lb = int(environ['batch_size_lb'])
        weight_decay_ub = int(environ['weight_decay_ub'])
        weight_decay_lb = int(environ['weight_decay_lb'])
        beta_ub = float(environ['beta_ub'])
        beta_lb = float(environ['beta_lb'])
        hidden_units_1_ub = int(environ['hidden_units_1_ub'])
        hidden_units_1_lb = int(environ['hidden_units_1_lb'])
        hidden_units_2_ub = int(environ['hidden_units_2_ub'])
        hidden_units_2_lb = int(environ['hidden_units_2_lb'])

    except:
        learning_rate_ub = -2
        learning_rate_lb = -3
        batch_size_ub = 150
        batch_size_lb = 30
        weight_decay_ub = -7
        weight_decay_lb = -17
        beta_ub = 0.95
        beta_lb = 0.85
        hidden_units_1_ub = 300
        hidden_units_1_lb = 50
        hidden_units_2_ub = 800
        hidden_units_2_lb = 350
    

elif classifier_method == "XGB":
    try:
        n_estimators_ub =int(environ["n_estimators_ub"])
        n_estimators_lb =int(environ["n_estimators_lb"])
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_child_weight_ub = int(environ["min_child_weight_ub"])
        min_child_weight_lb = int(environ["min_child_weight_lb"])
        colsample_bytree_ub = float(environ["colsample_bytree_ub"])
        colsample_bytree_lb = float(environ["colsample_bytree_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
        learning_rate_ub = int(environ["learning_rate_ub"])
        learning_rate_lb = int(environ["learning_rate_lb"])
        
    
    except:
        n_estimators_ub = 1500
        n_estimators_lb = 100
        max_depth_ub = 1500
        max_depth_lb = 100
        min_child_weight_ub = 2
        min_child_weight_lb = 0
        colsample_bytree_ub = 1
        colsample_bytree_lb = 0.25
        gamma_ub = 0
        gamma_lb = -3
        learning_rate_ub = 0
        learning_rate_lb = -1
        
elif classifier_method == "RF":
    try:
        n_estimators_ub = int(environ["n_estimators_ub"])
        n_estimators_lb = int(environ["n_estimators_lb"])
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_samples_leaf_ub = int(environ["min_samples_leaf_ub"])
        min_samples_leaf_lb = int(environ["min_samples_leaf_lb"])
        min_samples_split_ub = int(environ["min_samples_split_ub"])
        min_samples_split_lb = int(environ["min_samples_split_lb"])
    except:
        n_estimators_ub = 5
        n_estimators_lb = 2
        max_depth_ub = 20
        max_depth_lb = 2
        min_samples_leaf_ub = 50
        min_samples_leaf_lb = 1
        min_samples_split_ub = 50
        min_samples_split_lb = 1

elif classifier_method == "Logistic":
    try:
        C_ub = int(environ["C_ub"])
        C_lb = int(environ["C_lb"])
    except:
        C_ub = 3
        C_lb = 1

elif classifier_method == "KNN":
    try:
        n_neighbors_ub = int(environ["n_neighbors_ub"])
        n_neighbors_lb = int(environ["n_neighbors_lb"])

    except:
        n_neighbors_ub = 100
        n_neighbors_lb = 5
        
elif classifier_method == "ADA":
    try:
        n_estimators_ub = int(environ["n_estimators_ub"])
        n_estimators_lb = int(environ["n_estimators_lb"])
        learning_rate_ub = int(environ["learning_rate_ub"])
        learning_rate_lb = int(environ["learning_rate_lb"])
    except:
        n_estimators_ub = 6
        n_estimators_lb = 3
        learning_rate_ub = 0
        learning_rate_lb = -4
        
elif classifier_method == "SVM":
    try:
        C_ub = int(environ["C_ub"])
        C_lb = int(environ["C_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
    except:
        C_ub = 4
        C_lb = 2
        gamma_ub = -4
        gamma_lb = -6

ligand = dna
fold = 1
classifier_method = XGB
trial idx = 0


### Generate hyperparameter trials

Choose hyperparameters and generate hyperparameters through random search in a grid, as explained by this video: https://www.youtube.com/watch?v=WrICwRrvuIc&index=66&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Use logarithmic scale for search for learnining rate and weight decay for NN, as explained by this video: https://www.youtube.com/watch?v=VUbrW8OK3uo&index=67&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize nested cross validation to choose between models, as described here: https://stats.stackexchange.com/questions/266225/step-by-step-explanation-of-k-fold-cross-validation-with-grid-search-to-optimise/266229

In [48]:
no_trials = 100
print classifier_method
if classifier_method == "NN":
    hyperparameter_trials = generate_trials_NN(no_trials, learning_rate_ub, learning_rate_lb, batch_size_ub, batch_size_lb,weight_decay_ub, 
                                               weight_decay_lb, beta_ub, beta_lb, hidden_units_1_ub, hidden_units_1_lb, hidden_units_2_ub, 
                                               hidden_units_2_lb)
elif classifier_method == "XGB":
    hyperparameter_trials = generate_trials_XGB(no_trials, n_estimators_ub, n_estimators_lb,
                                                max_depth_ub, max_depth_lb, min_child_weight_ub,
                                                min_child_weight_lb, colsample_bytree_ub, colsample_bytree_lb,
                                               gamma_ub, gamma_lb, learning_rate_lb, learning_rate_ub)
elif classifier_method == "RF":
    hyperparameter_trials = generate_trials_RF(no_trials, n_estimators_ub, n_estimators_lb,
                                              max_depth_ub, max_depth_lb, min_samples_leaf_ub,
                                              min_samples_leaf_lb, min_samples_split_ub, min_samples_split_lb)
elif classifier_method == "Logistic":
    hyperparameter_trials = generate_trials_Log(no_trials, C_ub, C_lb)
    
elif classifier_method == "KNN":
    hyperparameter_trials = generate_trials_KNN(no_trials, n_neighbors_ub, n_neighbors_lb)
    
elif classifier_method == "ADA":
    hyperparameter_trials = generate_trials_ADA(no_trials, n_estimators_ub, n_estimators_lb,
                                               learning_rate_lb,learning_rate_ub)
    
elif classifier_method == "SVM":
    hyperparameter_trials = generate_trials_SVM(no_trials, C_ub, C_lb, gamma_ub, gamma_lb)

print hyperparameter_trials

XGB
[{'colsample_bytree': 0.7020725320537329, 'scale_pos_weight': 0.1, 'learning_rate': 0.26524963767594756, 'min_child_weight': 1.430378732744839, 'n_estimators': 784, 'max_depth': 659, 'gamma': 0.04311710058685491}, {'colsample_bytree': 0.9727470703757719, 'scale_pos_weight': 0.1, 'learning_rate': 0.6190490166774184, 'min_child_weight': 1.7835460015641595, 'n_estimators': 377, 'max_depth': 699, 'gamma': 0.014135935551752292}, {'colsample_bytree': 0.9441974787194958, 'scale_pos_weight': 'balanced', 'learning_rate': 0.12221634728708944, 'min_child_weight': 1.1360891221878646, 'n_estimators': 187, 'max_depth': 274, 'gamma': 0.0016334587611069498}, {'colsample_bytree': 0.8336175632123879, 'scale_pos_weight': 1, 'learning_rate': 0.9519592150539826, 'min_child_weight': 1.665239691095876, 'n_estimators': 215, 'max_depth': 1076, 'gamma': 0.40741446541662296}, {'colsample_bytree': 0.7591596475892202, 'scale_pos_weight': 'balanced', 'learning_rate': 0.3819616775589123, 'min_child_weight': 1.04

### Models tested (and their hyper-parameters)

In [49]:
def generate_model(classifier_method, hyperparameters, no_pos=1, no_neg=1):
    
    if (classifier_method == "XGB"):
        if (hyperparameters["scale_pos_weight"] == "balanced"):
            scale_weight = no_neg/float(no_pos)
        else:
            scale_weight = hyperparameters["scale_pos_weight"]
        model = XGBClassifier(n_estimators=hyperparameters["n_estimators"], n_jobs=-1, random_state=0, max_depth=hyperparameters["max_depth"], 
                              min_child_weight=1, colsample_bytree=hyperparameters["colsample_bytree"], 
                              gamma=hyperparameters["gamma"], learning_rate=hyperparameters["learning_rate"], scale_pos_weight=scale_weight)
        
    elif (classifier_method == "RF"):
        model = RandomForestClassifier(n_estimators=hyperparameters["n_estimators"], n_jobs=-1, random_state=0,
                                      max_depth=hyperparameters["max_depth"], min_samples_leaf=hyperparameters["min_samples_leaf"],
                                      min_samples_split=hyperparameters["min_samples_split"], class_weight=hyperparameters["class_weight"])
        
    elif(classifier_method == "Logistic"):
        model = LogisticRegression(C=hyperparameters["C"], random_state=0, n_jobs=-1, class_weight=hyperparameters["class_weight"])
        
    elif (classifier_method == "KNN"):
        model = KNeighborsClassifier(n_neighbors=hyperparameters["n_neighbors"], n_jobs=-1, weights=hyperparameters["weights"])
        
    elif (classifier_method == "ADA"):
        model = AdaBoostClassifier(n_estimators=hyperparameters["n_estimators"], random_state=0, learning_rate=hyperparameters["learning_rate"])
        
    elif (classifier_method == "SVM"):
        model = SVC(C=hyperparameters["C"], gamma = hyperparameters["gamma"], kernel=hyperparameters["kernel"], probability=True, random_state=0, cache_size=400,
                    class_weight = hyperparameters["class_weight"])
        
    elif (classifier_method =="NN"):
        torch.manual_seed(0)
        model = Net(hyperparameters)
        # sets model in training mode because batch normalization behavior in training and testing modes are different
        model.train()
    return model

In [50]:
# Generate trial for model
hyperparameters = hyperparameter_trials[trial_idx]
model = generate_model(classifier_method, hyperparameters)
print hyperparameters
print model

{'colsample_bytree': 0.7020725320537329, 'scale_pos_weight': 0.1, 'learning_rate': 0.26524963767594756, 'min_child_weight': 1.430378732744839, 'n_estimators': 784, 'max_depth': 659, 'gamma': 0.04311710058685491}
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.702072532054, gamma=0.0431171005869,
       learning_rate=0.5, max_delta_step=0, max_depth=659,
       min_child_weight=1, missing=None, n_estimators=784, n_jobs=-1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=0.1, seed=None,
       silent=False, subsample=1)


### Define the Network

Tutorial for Neural Net Architecture: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

Utilize batch normalization, as explained here: https://www.youtube.com/watch?v=fv1Luwd-LOI&index=69&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

In [51]:
# define the network with batch normalization
class Net(nn.Module):
    def __init__(self, hyperparameters):
        hidden_units_1 = hyperparameters["hidden_units_1"]
        hidden_units_2 = hyperparameters["hidden_units_2"]
        super(Net, self).__init__()
        self.input = nn.Linear(len(features_cols), hidden_units_1) # read input size from the .shape of data table
        self.hidden1 = nn.Linear(hidden_units_1, hidden_units_2)
        self.hidden1_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden2 = nn.Linear(hidden_units_2, hidden_units_2)
        self.hidden2_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden3 = nn.Linear(hidden_units_2, hidden_units_1)
        self.hidden3_bn = nn.BatchNorm1d(hidden_units_1)
        self.output = nn.Linear(hidden_units_1,2)
        self.batch_size = hyperparameters["batch_size"]
        self.learning_rate = hyperparameters["learning_rate"]
        self.beta = hyperparameters["beta"]
        self.weight_decay = hyperparameters["weight_decay"]

    def forward(self, x):
        x = F.relu(self.input(x))
        x = F.relu(self.hidden1_bn(self.hidden1(x)))
        x = F.relu(self.hidden2_bn(self.hidden2(x)))
        x = F.relu(self.hidden3_bn(self.hidden3(x)))
        x = self.output(x)
        return x
    
    def fit(self, X_train, y_train_label, X_valid, y_valid, weight):

        # set random seed for weights and biases
        torch.manual_seed(0)

        # dataset
        dataset = pd.concat([X_train,y_train_label],axis=1)
        dataset = shuffle(dataset, random_state = 0)

        X_train = dataset.iloc[:,:dataset.shape[1]-1]
        y_train_label = dataset.iloc[:,dataset.shape[1]-1]


        # create loss function
        loss = nn.CrossEntropyLoss(weight = weight)
        # mini-batching
        batch_size = self.batch_size

        # create adam optimizer for Phase 1
        optimizer_1 = optim.Adam(self.parameters(), lr=self.learning_rate,betas=(self.beta,0.999), 
                                 weight_decay = self.weight_decay)
        no_batch_minus_1 = X_train.shape[0] / batch_size 

        # Repeated Stratified K Fold to ensure positives are evenly distributed across batches
        skf_1 = RepeatedStratifiedKFold(n_splits=no_batch_minus_1,n_repeats=301,random_state=0)
        count = 0
        epoch_count = 0
        prev_score = 0
        patience = 100 

        for train,test in skf_1.split(X_train,y_train_label):
            data = X_train.iloc[test,:]
            data = torch.Tensor(data.values.astype(np.float32))
             # forward pass
            output = self.forward(data)
            output.data = output.data.view(data.shape[0],2)

            labels = y_train_label[test]
            labels = torch.Tensor(labels.astype(np.float32))
            labels = torch.autograd.Variable(labels).long()

            # zero the gradient buffers
            optimizer_1.zero_grad()
            # compute loss and gradients
            loss_output = loss(output,labels)
            loss_output.backward()
            # Does the update
            optimizer_1.step()

            count = count + 1

            # Early Stopping
            if count == no_batch_minus_1 + 1:
                count = 0
                epoch_count = epoch_count + 1
                probs = self.predict_proba(X_valid)
                precision, recall, _ = precision_recall_curve(y_valid, probs)
                auprc = auc(recall, precision)
                if auprc > max_auprc:
                    max_auprc = auprc
                    ideal_epoch_count = epoch_count
                    patience = patience + epoch_count
                    patience_j = 0
                else:
                    patience_j = patience_j + 1 
                    if patience_j == patience: break
                
                self = self.train()                   
                #score = roc_auc_score(y_valid, probs)
        return max_auprc, ideal_epoch_count

        
    #prediction probabilities array
    def predict_proba(self, X_test):
        self.eval()
        #forward pass
        test = torch.Tensor(X_test.values.astype(np.float32))
        output = self.forward(test)
        sf = nn.Softmax()
        probs = sf(output.data)
        return probs[:,1]

#### Dealing with model imbalance
Weight Vector: https://towardsdatascience.com/dealing-with-imbalanced-classes-in-machine-learning-d43d6fa19d2 (look at section on "Cost-sensitive Learning")

Implementing Early Stopping for XGBoost: https://cambridgespark.com/content/tutorials/hyperparameter-tuning-in-xgboost/index.html

In [52]:
def test_model_iterative_fixed(hyperparameters_dict,ligand_bind_features, ligand_negatives_features, ligand_name, features=[]):
    
    """
    Test different models in k-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    
    models_req_scaling = ["SVM", "KNN", "Logistic", "NN"]
    classifier = classifier_method

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    y_df = pd.DataFrame(y)
    y_df.index = X.index
    y_df.columns = ["label"]
    
    #Get the fold indices
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    k = (int(fold)-1)
    
    pred_idx = k+1
    print "fold #: "+str(pred_idx)
    #test_index = cv_idx[k]["test"]
    full_train_index = cv_idx[k]["train"]
        
    # phase 1: testing on validation set, hyperparameter tuning
    
    trial_results = np.zeros(folds_num-1)
    epoch_counts = np.zeros(folds_num-1, dtype = "int")
    for i in range(folds_num-1):
        valid_k = (k + 1 + i) % folds_num
        valid_index = cv_idx[valid_k]["test"]

        train_index = [index for index in full_train_index if index not in valid_index]
        X_train, X_valid = X.loc[train_index,:], X.loc[valid_index,:]
        y_train, y_valid = y_df.loc[train_index,:], y_df.loc[valid_index,:]

        if (classifier in models_req_scaling):
            cols = X_train.columns

            # phase 1 scaling with just training data
            scaler_1 = StandardScaler() 
            scaler_1.fit(X_train) 
            X_train = pd.DataFrame(scaler_1.transform(X_train))
            # apply same transformation to validation data
            X_valid = pd.DataFrame(scaler_1.transform(X_valid))

            #Restoring indices after scaling
            X_train.index = train_index 
            X_valid.index = valid_index

            #Restoring features names
            X_train.columns = cols
            X_valid.columns = cols

        #No down-sampling
        X_train_sampled = X_train
        y_train_sampled = y_train

        #pos and neg numbers in the training
        #no_pos = np.count_nonzero(y_train_sampled["label"] == 1)
        #no_neg = np.count_nonzero(y_train_sampled["label"] == 0)

        #fit to training data
        if (classifier == "NN"):
            #weight vector
            #neg_weight = float(no_pos) / float(no_neg + no_pos) 
            #pos_weight = 1 - neg_weight
            neg_weight = 10
            pos_weight = 1
            weight = torch.Tensor([neg_weight, pos_weight])
            auprc_score,epoch_count = model.fit(X_train_sampled, y_train_sampled["label"],X_valid, y_valid["label"], weight)
        

        elif (classifier == "XGB"):
            num_early_stopping_rounds = 10
            model.fit(X_train_sampled, y_train_sampled["label"], eval_set = [(X_train_sampled, y_train_sampled["label"]),(X_valid,y_valid["label"])], eval_metric = "map", 
                      early_stopping_rounds = num_early_stopping_rounds)
            probs_list = []
            probs = model.predict_proba(X_valid, ntree_limit=model.best_ntree_limit)
            for l in probs:
                probs_list.append(l[1])
            precision, recall, _ = precision_recall_curve(y_valid, probs_list)
            auprc_score = auc(recall, precision)
            epoch_count = model.best_iteration + 1
            

        else:
            model.fit(X_train_sampled, y_train_sampled["label"])
            probs_list = []
            probs = model.predict_proba(X_valid)
            for l in probs:
                probs_list.append(l[1])
            precision, recall, _ = precision_recall_curve(y_valid, probs_list)
            auprc_score = auc(recall, precision)

        trial_results[i] = auprc_score 
        if classifier == "NN" or classifier == "XGB": epoch_counts[i] = epoch_count

    mean_result = np.mean(trial_results)
    if classifier == "NN" or classifier == "XGB":
        mean_epoch_count = int(np.mean(epoch_counts))
        hyperparameters_dict["mean_epoch_count"] = mean_epoch_count

    hyperparameters_dict["mean_AUPRC"] = mean_result

    # Update dictionary with all hyperparameters
    keys = hyperparameters.keys()
    for key in keys:
        hyperparameters_dict[key].append(hyperparameters[key])
    pred_idx += 1

    print "Finished "+ligand+" "+classifier+" fold: "+fold

#### Predict for each ligand seperatelly

In [53]:
%%time

#Initialize dictionary
hyperparameters_dict = defaultdict(list)

test_model_iterative_fixed(hyperparameters_dict,ligands_positives_df[ligand], ligands_negatives_df[ligand], ligand)

hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_dict)

#Save to file
hyperparameters_df.to_csv(curr_dir[0]+"/hyperparam_tuning/phase1_initial_run/"+datafile_date+"_"+prec_th_str+"/per_trial/"+ligand+"_"+classifier_method+"_fold"+fold+"_trial"+str(trial_idx)+"_"+str(folds_num)+"w_hyperparameters.csv", sep='\t')

print "Finished ligand "+ligand

fold #: 1
[0]	validation_0-map:1	validation_1-map:0.007397
Multiple eval metrics have been passed: 'validation_1-map' will be used for early stopping.

Will train until validation_1-map hasn't improved in 10 rounds.
[1]	validation_0-map:0.65843	validation_1-map:0.007397
[2]	validation_0-map:0.363898	validation_1-map:0.012856
[3]	validation_0-map:0.450897	validation_1-map:0.018285
[4]	validation_0-map:0.49223	validation_1-map:0.015543
[5]	validation_0-map:0.526557	validation_1-map:0.018922
[6]	validation_0-map:0.577719	validation_1-map:0.016723
[7]	validation_0-map:0.625582	validation_1-map:0.015852
[8]	validation_0-map:0.686675	validation_1-map:0.041031
[9]	validation_0-map:0.734676	validation_1-map:0.043373
[10]	validation_0-map:0.770941	validation_1-map:0.042223
[11]	validation_0-map:0.800372	validation_1-map:0.057823
[12]	validation_0-map:0.812937	validation_1-map:0.082492
[13]	validation_0-map:0.831224	validation_1-map:0.078542
[14]	validation_0-map:0.842332	validation_1-map:0.0866

[31]	validation_0-map:0.962302	validation_1-map:0.47677
[32]	validation_0-map:0.967179	validation_1-map:0.474169
[33]	validation_0-map:0.969855	validation_1-map:0.476939
[34]	validation_0-map:0.973365	validation_1-map:0.485777
[35]	validation_0-map:0.976331	validation_1-map:0.487091
[36]	validation_0-map:0.980746	validation_1-map:0.491812
[37]	validation_0-map:0.982192	validation_1-map:0.491942
[38]	validation_0-map:0.984091	validation_1-map:0.491445
[39]	validation_0-map:0.984572	validation_1-map:0.496653
[40]	validation_0-map:0.986879	validation_1-map:0.492137
[41]	validation_0-map:0.988519	validation_1-map:0.482521
[42]	validation_0-map:0.98919	validation_1-map:0.482838
[43]	validation_0-map:0.990154	validation_1-map:0.49053
[44]	validation_0-map:0.991702	validation_1-map:0.489602
[45]	validation_0-map:0.992218	validation_1-map:0.487939
[46]	validation_0-map:0.9937	validation_1-map:0.486957
[47]	validation_0-map:0.995261	validation_1-map:0.472639
[48]	validation_0-map:0.996189	valid