In [1]:
#Basic imports
import pandas as pd
import numpy as np
import random
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Neural Net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

#Import utils functions
curr_dir = !pwd

sys.path.append(curr_dir[0]+"/utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
from generate_hyperparameter_trials import *


from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
curr_dir = !pwd
pfam_version = "31"
datafile_date = "08.06.18"
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"

#flags for creating negatives
zero_prop = True
no_prop = True
all_ligands = False
prec_th_str = "dna0.5_rna0.25_ion0.75"
folds_num = 5

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
remove_unimportant_features(features_all, features_cols)

print "all samples positions #: "+str(features_all.shape[0])

#CV splits dictionary
# with open(curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_"+str(prec_th)+"_prec_dict.pik", 'rb') as handle:
#         splits_dict = pickle.load(handle)
with open(curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_combined_"+prec_th_str+"_prec_dict.pik", 'rb') as handle:
        splits_dict = pickle.load(handle)

all samples positions #: 44872


#### Dataset of negative examples

In [3]:
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697


#### Datasets of positive examples by ligand

In [4]:
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 450
rnabase #: 290
rnabackbone #: 306
rna combined #: 531
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




### Reading env input for downsampler technique, ligand and classifier

In [5]:
#Reading the ligand input
try:
    ligand = environ['ligand']
except:
    ligand = "dna"
print "ligand = "+ligand
    
#Reading the downsampler input
try: 
    fold = environ['fold']
except:
    fold = "1"
print "fold = "+fold

#Reading the classifier input
try: 
    classifier_method = environ['classifier']
except:
    classifier_method = "NN"
print "classifier_method = "+classifier_method

# Reading the index to generate model
try:
    trial_idx = int(environ["trial"])
except:
    trial_idx = 0
print "trial idx = "+ str(trial_idx)

if classifier_method == "NN":
    try:        
        learning_rate_ub = int(environ['learning_rate_ub'])
        learning_rate_lb = int(environ['learning_rate_lb'])
        batch_size_ub = int(environ['batch_size_ub'])
        batch_size_lb = int(environ['batch_size_lb'])
        weight_decay_ub = int(environ['weight_decay_ub'])
        weight_decay_lb = int(environ['weight_decay_lb'])
        beta_ub = float(environ['beta_ub'])
        beta_lb = float(environ['beta_lb'])
        hidden_units_1_ub = int(environ['hidden_units_1_ub'])
        hidden_units_1_lb = int(environ['hidden_units_1_lb'])
        hidden_units_2_ub = int(environ['hidden_units_2_ub'])
        hidden_units_2_lb = int(environ['hidden_units_2_lb'])
        
        try:
            sec_learning_rate_ub = int(environ['sec_learning_rate_ub'])
            sec_learning_rate_lb = int(environ['sec_learning_rate_lb'])
            lr_weight_1 = float(environ['lr_weight_1'])
            lr_weight_2 = float(environ['lr_weight_2'])
        except:
            sec_learning_rate_ub = learning_rate_ub 
            sec_learning_rate_lb = learning_rate_lb
            lr_weight_1 = 1
            lr_weight_2 = 1
            
        try:
            sec_batch_size_ub = int(environ['sec_batch_size_ub'])
            sec_batch_size_lb = int(environ['sec_batch_size_lb'])
            batch_size_weight_1 = float(environ['batch_size_weight_1'])
            batch_size_weight_2 = float(environ['batch_size_weight_2'])
            
        except:
            sec_batch_size_ub = batch_size_ub
            sec_batch_size_lb = batch_size_lb
            batch_size_weight_1 = 1
            batch_size_weight_2 = 1
            
        try:
            sec_weight_decay_ub = int(environ['sec_weight_decay_ub'])
            sec_weight_decay_lb = int(environ['sec_weight_decay_lb'])
            weight_decay_weight_1 = float(environ['weight_decay_weight_1'])
            weight_decay_weight_2 = float(environ['weight_decay_weight_2'])
            
        except:
            sec_weight_decay_ub = weight_decay_ub
            sec_weight_decay_lb = weight_decay_lb
            weight_decay_weight_1 = 1
            weight_decay_weight_2 = 1
        try:
            sec_beta_ub = float(environ['sec_beta_ub'])
            sec_beta_lb = float(environ['sec_beta_lb'])
            beta_weight_1 = float(environ['beta_weight_1'])
            beta_weight_2 = float(environ['beta_weight_2'])
        except:
            sec_beta_ub = beta_ub
            sec_beta_lb = beta_lb
            beta_weight_1 = 1
            beta_weight_2 = 1
            
        try:
            sec_hidden_units_1_ub = int(environ['sec_hidden_units_1_ub'])
            sec_hidden_units_1_lb = int(environ['sec_hidden_units_1_lb'])
            hidden_units_1_weight_1 = float(environ['hidden_units_1_weight_1'])
            hidden_units_1_weight_2 = float(environ['hidden_units_1_weight_2'])
        except:
            sec_hidden_units_1_ub = hidden_units_1_ub
            sec_hidden_units_1_lb = hidden_units_1_lb
            hidden_units_1_weight_1 = 1
            hidden_units_1_weight_2 = 1
            
        try:
            sec_hidden_units_2_ub = int(environ['sec_hidden_units_2_ub'])
            sec_hidden_units_2_lb = int(environ['sec_hidden_units_2_lb'])
            hidden_units_2_weight_1 = float(environ['hidden_units_2_weight_1'])
            hidden_units_2_weight_2 = float(environ['hidden_units_2_weight_2'])
            
        except: 
            sec_hidden_units_2_ub = hidden_units_2_ub
            sec_hidden_units_2_lb = hidden_units_2_lb
            hidden_units_2_weight_1 = 1
            hidden_units_2_weight_2 = 1
            
            
            

    except:        
        learning_rate_ub = -4
        learning_rate_lb = -5
        batch_size_ub = 300
        batch_size_lb = 30
        weight_decay_ub = -5
        weight_decay_lb = -25
        beta_ub = 0.95
        beta_lb = 0.85
        hidden_units_1_ub = 300
        hidden_units_1_lb = 50
        hidden_units_2_ub = 1000
        hidden_units_2_lb = 350
        
        sec_learning_rate_ub = -4
        sec_learning_rate_lb = -5
        sec_batch_size_ub = 300
        sec_batch_size_lb = 30
        sec_weight_decay_ub = -5
        sec_weight_decay_lb = -25
        sec_beta_ub = 0.95
        sec_beta_lb = 0.85
        sec_hidden_units_1_ub = 300
        sec_hidden_units_1_lb = 50
        sec_hidden_units_2_ub = 1000
        sec_hidden_units_2_lb = 350
        
        lr_weight_1 = 1
        lr_weight_2 = 1
        batch_size_weight_1 = 1
        batch_size_weight_2 = 1
        weight_decay_weight_1 = 1
        weight_decay_weight_2 = 1
        beta_weight_1 = 1
        beta_weight_2 = 1
        hidden_units_1_weight_1 = 1
        hidden_units_1_weight_2 = 1
        hidden_units_2_weight_1 = 1
        hidden_units_2_weight_2 = 1
        
         
        
    

elif classifier_method == "XGB":
    
    try:
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_child_weight_ub = int(environ["min_child_weight_ub"])
        min_child_weight_lb = int(environ["min_child_weight_lb"])
        colsample_bytree_ub = float(environ["colsample_bytree_ub"])
        colsample_bytree_lb = float(environ["colsample_bytree_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
        learning_rate_ub = float(environ["learning_rate_ub"])
        learning_rate_lb = int(environ["learning_rate_lb"])

        try:
            sec_max_depth_ub = int(environ["sec_max_depth_ub"])
            sec_max_depth_lb = int(environ["sec_max_depth_lb"])
            max_depth_weight_1 = float(environ["max_depth_weight_1"])
            max_depth_weight_2 = float(environ["max_depth_weight_2"])
        except:
            sec_max_depth_ub = max_depth_ub
            sec_max_depth_lb = max_depth_lb
            max_depth_weight_1 = 1
            max_depth_weight_2 = 1
        try:
            sec_min_child_weight_ub = int(environ['sec_min_child_weight_ub'])
            sec_min_child_weight_lb = int(environ['sec_min_child_weight_lb'])
            min_child_weight_weight_1 = float(environ["min_child_weight_weight_1"])
            min_child_weight_weight_2 = float(environ["min_child_weight_weight_2"])
            
        except:
            sec_min_child_weight_ub = min_child_weight_ub
            sec_min_child_weight_lb = min_child_weight_lb
            min_child_weight_weight_1 = 1
            min_child_weight_weight_2 = 1
        try:
            sec_colsample_bytree_ub = float(environ['sec_colsample_bytree_ub'])
            sec_colsample_bytree_lb = float(environ['sec_colsample_bytree_lb'])
            colsample_bytree_weight_1 = float(environ['colsample_bytree_weight_1'])
            colsample_bytree_weight_2 = float(environ['colsample_bytree_weight_2'])
        except:
            sec_colsample_bytree_ub = colsample_bytree_ub
            sec_colsample_bytree_lb = colsample_bytree_lb
            colsample_bytree_weight_1 = 1
            colsample_bytree_weight_2 = 1
            
        try:
            sec_gamma_ub = int(environ['sec_gamma_ub'])
            sec_gamma_lb = int(environ['sec_gamma_lb'])
            gamma_weight_1 = float(environ['gamma_weight_1'])
            gamma_weight_2 = float(environ['gamma_weight_2'])
        except:
            sec_gamma_ub = gamma_ub
            sec_gamma_lb = gamma_lb
            gamma_weight_1 = 1
            gamma_weight_2 = 1
        try:
            sec_learning_rate_ub = int(environ['sec_learning_rate_ub'])
            sec_learning_rate_lb = int(environ['sec_learning_rate_lb'])
            lr_weight_1 = float(environ['lr_weight_1'])
            lr_weight_2 = float(environ['lr_weight_2'])
        except:
            sec_learning_rate_ub = learning_rate_ub
            sec_learning_rate_lb = learning_rate_lb
            lr_weight_1 = 1
            lr_weight_2 = 1

    except:    
        
        max_depth_ub = 1500
        max_depth_lb = 100
        min_child_weight_ub = 2
        min_child_weight_lb = 0
        colsample_bytree_ub = 1
        colsample_bytree_lb = 0.25
        gamma_ub = 0
        gamma_lb = -3
        learning_rate_ub = -0.5
        learning_rate_lb = -3
        
        sec_max_depth_ub = 4000
        sec_max_depth_lb = 2000
        sec_min_child_weight_ub = 5
        sec_min_child_weight_lb = 3
        sec_colsample_bytree_ub = 0.25
        sec_colsample_bytree_lb = 0
        sec_gamma_ub = -4
        sec_gamma_lb = -6
        sec_learning_rate_ub = -1
        sec_learning_rate_lb = -2
        
        max_depth_weight_1 = 1
        max_depth_weight_2 = 1
        min_child_weight_weight_1 = 1
        min_child_weight_weight_2 = 1
        colsample_bytree_weight_1 = 1
        colsample_bytree_weight_2 = 1
        gamma_weight_1 = 1
        gamma_weight_2 = 1
        lr_weight_1 = 1
        lr_weight_2 = 1
        
    print max_depth_weight_1   
        
        

elif classifier_method == "RF":  
    try:
        n_estimators_ub = int(environ["n_estimators_ub"])
        n_estimators_lb = int(environ["n_estimators_lb"])
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_samples_leaf_ub = int(environ["min_samples_leaf_ub"])
        min_samples_leaf_lb = int(environ["min_samples_leaf_lb"])
        min_samples_split_ub = int(environ["min_samples_split_ub"])
        min_samples_split_lb = int(environ["min_samples_split_lb"])

        try:
            sec_n_estimators_ub = int(environ['sec_n_estimators_ub'])
            sec_n_estimators_lb = int(environ['sec_n_estimators_lb'])
            n_estimators_weight_1 = float(environ['n_estimators_weight_1'])
            n_estimators_weight_2 = float(environ['n_estimators_weight_2'])
        except:
            sec_n_estimators_ub = n_estimators_ub
            sec_n_estimators_lb = n_estimators_lb
            n_estimators_weight_1 = 1
            n_estimators_weight_2 = 1
            
        try:
            sec_max_depth_ub = int(environ['sec_max_depth_ub'])
            sec_max_depth_lb = int(environ['sec_max_depth_lb'])
            max_depth_weight_1 = float(environ['max_depth_weight_1'])
            max_depth_weight_2 = float(environ['max_depth_weight_2'])
        except:
            sec_max_depth_ub = max_depth_ub
            sec_max_depth_lb = max_depth_lb
            max_depth_weight_1 = 1
            max_depth_weight_2 = 1
            
        try:
            sec_min_samples_leaf_ub = int(environ['sec_min_samples_leaf_ub'])
            sec_min_samples_leaf_lb = int(environ['sec_min_samples_leaf_lb'])
            min_samples_leaf_weight_1 = float(environ['min_samples_leaf_weight_1'])
            min_samples_leaf_weight_2 = float(environ['min_samples_leaf_weight_2'])
        except:
            sec_min_samples_leaf_ub = min_samples_leaf_ub
            sec_min_samples_leaf_lb = min_samples_leaf_lb
            min_samples_leaf_weight_1 = 1
            min_samples_leaf_weight_2 = 1
        try:
            sec_min_samples_split_ub = int(environ['sec_min_samples_split_ub'])
            sec_min_samples_split_lb = int(environ['sec_min_samples_split_lb'])
            min_samples_split_weight_1 = float(environ['min_samples_split_weight_1'])
            min_samples_split_weight_2 = float(environ['min_samples_split_weight_2'])
            
        except:
            sec_min_samples_split_ub = min_samples_split_ub
            sec_min_samples_split_lb = min_samples_split_lb
            min_samples_split_weight_1 = 1
            min_samples_split_weight_2 = 1

    except:
        n_estimators_ub = 1500
        n_estimators_lb = 100
        max_depth_ub = 20
        max_depth_lb = 2
        min_samples_leaf_ub = 50
        min_samples_leaf_lb = 1
        min_samples_split_ub = 50
        min_samples_split_lb = 2

        sec_n_estimators_ub = 3000
        sec_n_estimators_lb = 2000
        sec_max_depth_ub = 100
        sec_max_depth_lb = 50
        sec_min_samples_leaf_ub = 100
        sec_min_samples_leaf_lb = 60
        sec_min_samples_split_ub = 100
        sec_min_samples_split_lb = 60
        
        n_estimators_weight_1 = 1
        n_estimators_weight_2 = 1
        max_depth_weight_1 = 1
        max_depth_weight_2 = 1
        min_samples_leaf_weight_1 = 1
        min_samples_leaf_weight_2 = 1
        min_samples_split_weight_1 = 1
        min_samples_split_weight_2 = 1


elif classifier_method == "Logistic":
    try:        
        C_ub = int(environ["C_ub"])
        C_lb = int(environ["C_lb"])

        try:
            sec_C_ub = int(environ["sec_C_ub"])
            sec_C_lb = int(environ["sec_C_lb"])
            C_weight_1 = float(environ["C_weight_1"])
            C_weight_2 = float(environ["C_weight_2"])
            
        except:
            sec_C_ub = C_ub
            sec_C_lb = C_lb
            C_weight_1 = 1
            C_weight_2 = 1

    except:       
        C_ub = 3
        C_lb = 1
        
        sec_C_ub = 7
        sec_C_lb = 6
        
        C_weight_1 = 1
        C_weight_2 = 1
        

elif classifier_method == "KNN":
    try:
        n_neighbors_ub = int(environ["n_neighbors_ub"])
        n_neighbors_lb = int(environ["n_neighbors_lb"])
        
        try:
            sec_n_neighbors_ub = int(environ["sec_n_neighbors_ub"]) 
            sec_n_neighbors_lb = int(environ["sec_n_neighbors_lb"]) 
            n_neighbors_weight_1 = float(environ["n_neighbors_weight_1"])
            n_neighbors_weight_2 = float(environ["n_neighbors_weight_2"])
        except:
            sec_n_neighbors_ub = n_neighbors_ub
            sec_n_neighbors_lb = n_neighbors_lb
            n_neighbors_weight_1 = 1
            n_neighbors_weight_2 = 1
            

    except:
        n_neighbors_ub = 100
        n_neighbors_lb = 5
        
        sec_n_neighbors_ub = 200
        sec_n_neighbors_lb = 150
        
        n_neighbors_weight_1 = 1
        n_neighbors_weight_2 = 1
        
        
        
elif classifier_method == "ADA":
    try:
        n_estimators_ub = int(environ["n_estimators_ub"])
        n_estimators_lb = int(environ["n_estimators_lb"])
        learning_rate_ub = int(environ["learning_rate_ub"])
        learning_rate_lb = int(environ["learning_rate_lb"])
        
        try:         
            sec_n_estimators_ub = int(environ["sec_n_estimators_ub"]) 
            sec_n_estimators_lb = int(environ["sec_n_estimators_lb"])
            n_estimators_weight_1 = float(environ['n_estimators_weight_1'])
            n_estimators_weight_2 = float(environ['n_estimators_weight_2'])
            
        except:
            sec_n_estimators_ub = n_estimators_ub
            sec_n_estimators_lb = n_estimators_lb
            n_estimators_weight_1 = 1
            n_estimators_weight_2 = 1
        try:
            sec_learning_rate_ub = int(environ["sec_learning_rate_ub"])
            sec_learning_rate_lb = int(environ["sec_learning_rate_lb"]) 
            lr_weight_1 = float(environ['lr_weight_1'])
            lr_weight_2 = float(environ['lr_weight_2'])
        except:
            sec_learning_rate_ub = learning_rate_ub
            sec_learning_rate_lb = learning_rate_lb         
            lr_weight_1 = 1
            lr_weight_2 = 1
            
    except:
        
        n_estimators_ub = 6
        n_estimators_lb = 3
        learning_rate_ub = 0
        learning_rate_lb = -4
        
        sec_n_estimators_ub = 12
        sec_n_estimators_lb = 9
        sec_learning_rate_ub = -14
        sec_learning_rate_lb = -15
        
        n_estimators_weight_1 = 1
        n_estimators_weight_2 = 1
        lr_weight_1 = 1
        lr_weight_2 = 1
        
        
        
elif classifier_method == "SVM":
    try:
        C_ub = int(environ["C_ub"])
        C_lb = int(environ["C_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
        
        try:
            sec_C_ub = int(environ["sec_C_ub"]) 
            sec_C_lb = int(environ["sec_C_lb"]) 
            C_weight_1 = float(environ["C_weight_1"])
            C_weight_2 = float(environ["C_weight_2"])
        
        except:
            sec_C_ub = C_ub
            sec_C_lb = C_lb
            C_weight_1 = 1
            C_weight_2 = 1
        
        try:
            sec_gamma_ub = int(environ["sec_gamma_ub"]) 
            sec_gamma_lb = int(environ["sec_gamma_lb"])
            gamma_weight_1 = float(environ['gamma_weight_1'])
            gamma_weight_2 = float(environ['gamma_weight_2'])
        except:
            sec_gamma_ub = gamma_ub
            sec_gamma_lb = gamma_lb
            gamma_weight_1 = 1
            gamma_weight_2 = 1            
            
    except:
        
        C_ub = 4
        C_lb = 2
        gamma_ub = -4
        gamma_lb = -6
        
        sec_C_ub = 10
        sec_C_lb = 8
        sec_gamma_ub = -13
        sec_gamma_lb = -15
        
        C_weight_1 = 1
        C_weight_2 = 1
        gamma_weight_1 = 1
        gamma_weight_2 = 1  
        

ligand = dna
fold = 1
classifier_method = NN
trial idx = 0


### Generate hyperparameter trials

Choose hyperparameters and generate hyperparameters through random search in a grid, as explained by this video: https://www.youtube.com/watch?v=WrICwRrvuIc&index=66&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Use logarithmic scale for search for learnining rate and weight decay for NN, as explained by this video: https://www.youtube.com/watch?v=VUbrW8OK3uo&index=67&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize nested cross validation to choose between models, as described here: https://stats.stackexchange.com/questions/266225/step-by-step-explanation-of-k-fold-cross-validation-with-grid-search-to-optimise/266229

In [15]:
no_trials = 100
                              
if classifier_method == "NN":
    lr_list = [[learning_rate_lb, learning_rate_ub],[sec_learning_rate_lb, sec_learning_rate_ub]]
    
    lr_list_weights = [lr_weight_1, lr_weight_2]
    
    batch_size_list = [[batch_size_lb, batch_size_ub],[sec_batch_size_lb, sec_batch_size_ub]]
    
    batch_size_list_weights = [batch_size_weight_1, batch_size_weight_2]
    
    weight_decay_list = [[weight_decay_lb, weight_decay_ub],[sec_weight_decay_lb, sec_weight_decay_ub]]
    
    weight_decay_list_weights = [weight_decay_weight_1, weight_decay_weight_2]
    
    beta_list = [[beta_lb, beta_ub], [sec_beta_lb, sec_beta_ub]]
    
    beta_list_weights = [beta_weight_1, beta_weight_2]
    
    hidden_units_1_list = [[hidden_units_1_lb, hidden_units_1_ub],[sec_hidden_units_1_lb, sec_hidden_units_1_ub]]
    
    hidden_units_1_list_weights = [hidden_units_1_weight_1, hidden_units_1_weight_2]
    
    hidden_units_2_list = [[hidden_units_2_lb, hidden_units_2_ub],[sec_hidden_units_2_lb, sec_hidden_units_2_ub]]
    
    hidden_units_2_list_weights = [hidden_units_2_weight_1, hidden_units_2_weight_2]
                              
    hyperparameter_trials = generate_trials_NN(no_trials, lr_list, lr_list_weights, batch_size_list, 
                                               batch_size_list_weights, weight_decay_list, weight_decay_list_weights, 
                                               beta_list, beta_list_weights, hidden_units_1_list, hidden_units_1_list_weights, 
                                               hidden_units_2_list, hidden_units_2_list_weights)
elif classifier_method == "XGB":
    max_depth_list = [[max_depth_lb, max_depth_ub], [sec_max_depth_lb, sec_max_depth_ub]]
    
    max_depth_list_weights = [max_depth_weight_1, max_depth_weight_2]
    
    min_child_weight_list = [[min_child_weight_lb, min_child_weight_ub],[sec_min_child_weight_lb, sec_min_child_weight_ub]]
    
    min_child_weight_list_weights = [min_child_weight_weight_1, min_child_weight_weight_2]
    
    colsample_bytree_list = [[colsample_bytree_lb, colsample_bytree_ub], [sec_colsample_bytree_lb, sec_colsample_bytree_ub]]
    
    colsample_bytree_list_weights = [colsample_bytree_weight_1, colsample_bytree_weight_2]
    
    gamma_list = [[gamma_lb, gamma_ub],[sec_gamma_lb, sec_gamma_ub]] 
    
    gamma_list_weights = [gamma_weight_1, gamma_weight_2]
    
    lr_list = [[learning_rate_lb, learning_rate_ub],[sec_learning_rate_lb, sec_learning_rate_ub]]
    
    lr_list_weights = [lr_weight_1, lr_weight_2]

    hyperparameter_trials = generate_trials_XGB(no_trials, max_depth_list, max_depth_list_weights, min_child_weight_list, 
                                                min_child_weight_list_weights, colsample_bytree_list, colsample_bytree_list_weights, 
                                                gamma_list, gamma_list_weights, lr_list, lr_list_weights)
                                      
elif classifier_method == "RF":
    
    n_estimators_list = [[n_estimators_lb, n_estimators_ub],[sec_n_estimators_lb, sec_n_estimators_ub]]
    
    n_estimators_list_weights = [n_estimators_weight_1, n_estimators_weight_2]

    max_depth_list = [[max_depth_lb, max_depth_ub],[sec_max_depth_lb, sec_max_depth_ub]]
    
    max_depth_list_weights = [max_depth_weight_1, max_depth_weight_2]
    
    min_samples_leaf_list = [[min_samples_leaf_lb, min_samples_leaf_ub], [sec_min_samples_leaf_lb, sec_min_samples_leaf_ub]]
 
    min_samples_leaf_list_weights = [min_samples_leaf_weight_1, min_samples_leaf_weight_2]
    
    min_samples_split_list = [[min_samples_split_lb, min_samples_split_ub],
                              [sec_min_samples_split_lb, sec_min_samples_split_ub]]
       
    min_samples_split_list_weights = [min_samples_split_weight_1, min_samples_split_weight_2]
    
    hyperparameter_trials = generate_trials_RF(no_trials, n_estimators_list, n_estimators_list_weights, max_depth_list, 
                                               max_depth_list_weights, min_samples_leaf_list, min_samples_leaf_list_weights, 
                                               min_samples_split_list, min_samples_split_list_weights)
    
elif classifier_method == "Logistic":
    C_list = [[C_lb, C_ub], [sec_C_lb, sec_C_ub]]
    
    C_list_weights = [C_weight_1, C_weight_2]
    
    hyperparameter_trials = generate_trials_Log(no_trials, C_list, C_list_weights)
    
elif classifier_method == "KNN":
    
    n_neighbors_list = [[n_neighbors_lb, n_neighbors_ub], [sec_n_neighbors_lb, sec_n_neighbors_ub]]
    
    n_neighbors_list_weights = [n_neighbors_weight_1, n_neighbors_weight_2]
    
    hyperparameter_trials = generate_trials_KNN(no_trials, n_neighbors_list, n_neighbors_list_weights)
    
elif classifier_method == "ADA":
    
    n_estimators_list = [[n_estimators_lb, n_estimators_ub], [sec_n_estimators_lb, sec_n_estimators_ub]]
    
    n_estimators_list_weights = [n_estimators_weight_1, n_estimators_weight_2]
    
    lr_list = [[learning_rate_lb, learning_rate_ub], [sec_learning_rate_lb, sec_learning_rate_ub]]
    
    lr_list_weights = [lr_weight_1, lr_weight_2]
    
    hyperparameter_trials = generate_trials_ADA(no_trials, n_estimators_list, n_estimators_list_weights, lr_list, 
                                                lr_list_weights)
    
elif classifier_method == "SVM":
    C_list = [[C_lb, C_ub], [sec_C_lb, sec_C_ub]]
    
    C_list_weights = [C_weight_1, C_weight_2]
                                      
    gamma_list = [[gamma_lb, gamma_ub], [sec_gamma_lb, sec_gamma_ub]]
    
    gamma_list_weights = [gamma_weight_1, gamma_weight_2]
                                          
    hyperparameter_trials = generate_trials_SVM(no_trials, C_list, C_list_weights, gamma_list, gamma_list_weights)

print hyperparameter_trials

[{'weight': 'None', 'hidden_units_2': 359, 'learning_rate': 1.2520653814999462e-05, 'batch_size': 147, 'beta': 0.9357945617622756, 'weight_decay': 7.679182193416932e-09, 'hidden_units_1': 153}, {'weight': '0.1', 'hidden_units_2': 746, 'learning_rate': 1.76655593656432e-05, 'batch_size': 272, 'beta': 0.939177300078208, 'weight_decay': 5.646043570251756e-17, 'hidden_units_1': 138}, {'weight': 'balanced', 'hidden_units_2': 887, 'learning_rate': 5.846326121643415e-06, 'batch_size': 69, 'beta': 0.9068044561093932, 'weight_decay': 3.783540569074339e-15, 'hidden_units_1': 215}, {'weight': '0.1', 'hidden_units_2': 781, 'learning_rate': 1.3869861245357313e-06, 'batch_size': 295, 'beta': 0.8868241539840548, 'weight_decay': 9.192580485660896e-13, 'hidden_units_1': 247}, {'weight': '0.1', 'hidden_units_2': 497, 'learning_rate': 5.497617455119908e-05, 'batch_size': 129, 'beta': 0.8961479362252931, 'weight_decay': 9.619916099446869e-10, 'hidden_units_1': 197}, {'weight': 'None', 'hidden_units_2': 59

In [16]:
hyperparameters = hyperparameter_trials[trial_idx]
print hyperparameters

{'weight': 'None', 'hidden_units_2': 359, 'learning_rate': 1.2520653814999462e-05, 'batch_size': 147, 'beta': 0.9357945617622756, 'weight_decay': 7.679182193416932e-09, 'hidden_units_1': 153}


### Customised Loss Function that approximates AUC for Neural Network

Customised ROC AUC loss inspired by the following tflearn loss function: http://tflearn.org/objectives/#roc-auc-score, which has been inspired by this paper: https://pdfs.semanticscholar.org/df27/dde10589455d290eeee6d0ae6ceeb83d0c6b.pdf

Code for customised function adapted from the opensource implementation of ROC AUC loss in tensorflow over here: https://github.com/tflearn/tflearn/blob/master/tflearn/objectives.py

Note that this loss function requires every batch to have positives to compute an estimation of AUC for each batch iteration, hence this loss function would not work for batch sizes that are too small (such that there are batches without any positives). To compute the minimum batch size necessary perform: total size of training set / total number of positives

In [17]:
# customised loss function that approximates AUC metric
class ROC_AUC_Loss(torch.nn.Module):
    def _init_(self):
        super(Custom_Loss, self)._init_()
    def forward(self, output, labels):
        labels = labels.byte()
        pos = torch.masked_select(output[:,1], labels)
        if len(pos) == 0:
            raise ValueError("Batch Size is too small. There are batches with no positives, hence ROC_AUC_Loss metric cannot be optimized with current batch size. Increase Batch Size or Change Loss Function Used.")
        neg_index = []
        for i in range(len(labels)):
            if labels[i] == 0:
                neg_index.append(1)
            else:
                neg_index.append(0)

        neg_index = torch.Tensor(neg_index).byte()
        neg = torch.masked_select(output[:,1], neg_index)
        pos = pos.unsqueeze(0)
        neg = neg.unsqueeze(1)
        gamma = 0.2
        p = 2

        difference = torch.zeros((pos * neg).size()) + pos - neg - gamma
        diff_index = (difference < 0.0).byte()
        masked = torch.masked_select(difference, diff_index)
        masked = (-masked)**p
        return masked.sum()

### Define the Network

Tutorial for Neural Net Architecture: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

Utilize batch normalization, as explained here: https://www.youtube.com/watch?v=fv1Luwd-LOI&index=69&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize dropout, as explained here: https://www.youtube.com/watch?v=ARq74QuavAo

In [18]:
# define the network with batch normalization
class Net(nn.Module):
    def __init__(self, dropout_parameter = 0.5, hidden_units_1 = 200, 
                 hidden_units_2 = 400, batch_size = 75, 
                 learning_rate = 1e-5, beta = 0.9, weight_decay = 1e-4):
        torch.manual_seed(0)
        super(Net, self).__init__()
        self.input = nn.Linear(len(features_cols), hidden_units_1) # read input size from the .shape of data table
        self.hidden1 = nn.Linear(hidden_units_1, hidden_units_2)
        self.hidden1_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden2 = nn.Linear(hidden_units_2, hidden_units_2)
        self.hidden2_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden3 = nn.Linear(hidden_units_2, hidden_units_1)
        self.hidden3_bn = nn.BatchNorm1d(hidden_units_1)
        self.dropout = nn.Dropout(p = dropout_parameter)
        self.output = nn.Linear(hidden_units_1,2)
        self.learning_rate = learning_rate
        self.beta = beta
        self.batch_size = batch_size
        self.weight_decay = weight_decay
  
    def forward(self, x):
        x = F.rrelu(self.input(x))
        x = self.dropout(F.rrelu(self.hidden1_bn(self.hidden1(x))))
        x = self.dropout(F.rrelu(self.hidden2_bn(self.hidden2(x))))
        x = self.dropout(F.rrelu(self.hidden3_bn(self.hidden3(x))))
        x = self.output(x)
        return x
    
    def fit(self, X_train, y_train_label, X_valid, y_valid, weight, cost = "BCELoss"):
            # sets model in training mode because batch normalization behavior in training and testing modes are different
            self.train()
            # set random seed for weights and biases
            torch.manual_seed(0)

            # dataset
            dataset = pd.concat([X_train,y_train_label],axis=1)
            dataset = shuffle(dataset, random_state = 0)

            X_train = dataset.iloc[:,:dataset.shape[1]-1]
            y_train_label = dataset.iloc[:,dataset.shape[1]-1]
            # create loss function
            if cost == "BCE_Loss":
                loss = nn.BCEWithLogitsLoss(weight = weight)
            elif cost == "ROC_AUC_Loss":
                loss = ROC_AUC_Loss()
            # mini-batching
            batch_size = self.batch_size

            BETA_2 = 0.999
            TOTAL_EPOCHS_TRAINED = 10**4

            # create adam optimizer for Phase 1
            optimizer_1 = optim.Adam(self.parameters(), lr=self.learning_rate,betas=(self.beta,BETA_2), 
                                     weight_decay = self.weight_decay)

            lambda1 = lambda epoch_count: 0.995 ** epoch_count 
            scheduler = LambdaLR(optimizer_1, lr_lambda=lambda1)
            no_batch_minus_1 = X_train.shape[0] / batch_size 

            # Repeated Stratified K Fold to ensure positives are evenly distributed across batchSes
            skf_1 = RepeatedStratifiedKFold(n_splits=no_batch_minus_1,n_repeats=TOTAL_EPOCHS_TRAINED,random_state=0)

            INITIAL_PATIENCE = 50
            count = 0
            epoch_count = 0
            max_auprc = 0
            ideal_epoch_count = 0 
            patience = INITIAL_PATIENCE
            patience_j = 0

            for train,test in skf_1.split(X_train,y_train_label):
                data = X_train.iloc[test,:]
                data = torch.Tensor(data.values.astype(np.float32))
                 # forward pass
                output = self.forward(data)
                output.data = output.data.view(data.shape[0],2)

                labels = y_train_label[test]
                if cost == "BCE_Loss":
                    labels = np.eye(2)[labels]
                labels = torch.autograd.Variable(torch.Tensor(labels), requires_grad = False)

                # zero the gradient buffers
                optimizer_1.zero_grad()
                # compute loss and gradients
                loss_output = loss(output,labels)
                loss_output.backward()
                # Does the update
                optimizer_1.step()

                count = count + 1

                # Early Stopping
                if count == no_batch_minus_1 + 1:
                    count = 0
                    epoch_count = epoch_count + 1
                    scheduler.step()
                    probs_valid = self.predict_proba(X_valid)
                    precision, recall, _ = precision_recall_curve(y_valid, probs_valid)
                    auprc = auc(recall, precision)
                    print auprc
                    if auprc > max_auprc:
                        max_auprc = auprc
                        ideal_epoch_count = epoch_count
                        patience = patience + epoch_count
                        patience_j = 0
                    else:
                        patience_j = patience_j + 1 
                        if patience_j == patience: break
            
                self.train()
            return max_auprc, ideal_epoch_count

        
    #prediction probabilities array
    def predict_proba(self, X_test):
        self.eval()
        #forward pass
        test = torch.Tensor(X_test.values.astype(np.float32))
        output = self.forward(test)
        sf = nn.Softmax()
        probs = sf(output.data)
        return probs[:,1]
        
    # function that evaluates BCE Loss on validation set just to check if loss is decreasing over epochs. Not entirely necessary to be used- 
    # monitoring AUPRC over epochs is most critical
    def evaluate_BCE_loss(self,X_valid, y_valid, weight):
        data = torch.Tensor(X_valid.values.astype(np.float32))
        loss = nn.BCEWithLogitsLoss(weight = weight)
         # forward pass
        output = self.forward(data)
        output.data = output.data.view(data.shape[0],2)

        labels = y_valid
        labels = labels.astype(int)
        labels = torch.Tensor(np.eye(2)[labels])
        labels = torch.autograd.Variable(labels, requires_grad = False)

        # compute loss and gradients
        loss_output = loss(output,labels)

        return loss_output
    
    

### Models tested (and their hyper-parameters)

In [19]:
def generate_model(classifier_method, hyperparameters, no_pos=1, no_neg=1):
    
    xgb_trees_limit = 5000
    
    if (classifier_method == "XGB"):
        if (hyperparameters["scale_pos_weight"] == "balanced"):
            scale_weight = no_neg/float(no_pos)
        else:
            scale_weight = hyperparameters["scale_pos_weight"]
        model = XGBClassifier(n_estimators=xgb_trees_limit, n_jobs=-1, random_state=0, max_depth=hyperparameters["max_depth"], 
                              min_child_weight=hyperparameters["min_child_weight"], colsample_bytree=hyperparameters["colsample_bytree"], 
                              gamma=hyperparameters["gamma"], learning_rate=hyperparameters["learning_rate"], scale_pos_weight=scale_weight)
        
    elif (classifier_method == "RF"):
        model = RandomForestClassifier(n_estimators=hyperparameters["n_estimators"], n_jobs=-1, random_state=0,
                                      max_depth=hyperparameters["max_depth"], min_samples_leaf=hyperparameters["min_samples_leaf"],
                                      min_samples_split=hyperparameters["min_samples_split"], class_weight=hyperparameters["class_weight"])
        
    elif(classifier_method == "Logistic"):
        model = LogisticRegression(C=hyperparameters["C"], random_state=0, n_jobs=-1, class_weight=hyperparameters["class_weight"])
        
    elif (classifier_method == "KNN"):
        model = KNeighborsClassifier(n_neighbors=hyperparameters["n_neighbors"], n_jobs=-1, weights=hyperparameters["weights"])
        
    elif (classifier_method == "ADA"):
        model = AdaBoostClassifier(n_estimators=hyperparameters["n_estimators"], random_state=0, learning_rate=hyperparameters["learning_rate"])
        
    elif (classifier_method == "SVM"):
        model = SVC(C=hyperparameters["C"], gamma = hyperparameters["gamma"], kernel=hyperparameters["kernel"], probability=True, random_state=0, cache_size=400,
                    class_weight = hyperparameters["class_weight"])
        
    elif (classifier_method =="NN"):
        model = Net(dropout_parameter = 0.5, hidden_units_1 = hyperparameters["hidden_units_1"], 
                 hidden_units_2 = hyperparameters["hidden_units_2"], batch_size = hyperparameters["batch_size"], 
                 learning_rate = hyperparameters["learning_rate"], beta = hyperparameters["beta"], 
                 weight_decay = hyperparameters["weight_decay"])
    return model

#### Dealing with model imbalance
Weight Vector: https://towardsdatascience.com/dealing-with-imbalanced-classes-in-machine-learning-d43d6fa19d2 (look at section on "Cost-sensitive Learning")

Implementing Early Stopping for XGBoost: https://cambridgespark.com/content/tutorials/hyperparameter-tuning-in-xgboost/index.html

In [20]:
def test_model_iterative_fixed(hyperparameters_dict,ligand_bind_features, ligand_negatives_features, ligand_name, features=[]):
    
    """
    Test different models in k-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    
    models_req_scaling = ["SVM", "KNN", "Logistic", "NN"]
    classifier = classifier_method

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    
    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    y_df = pd.DataFrame(y)
    y_df.index = X.index
    y_df.columns = ["label"]
    
    #Get the fold indices
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    k = (int(fold)-1)
    
    pred_idx = k+1
    print "fold #: "+str(pred_idx)
    #test_index = cv_idx[k]["test"]
    full_train_index = cv_idx[k]["train"]
        
    # phase 1: testing on validation set, hyperparameter tuning
    
    trial_auprc_results = np.zeros(folds_num-1)
    trial_auc_results = np.zeros(folds_num-1)
    epoch_counts = np.zeros(folds_num-1, dtype = "int")
    for i in range(folds_num-1):
    #for i in range(1):
        valid_k = (k + 1 + i) % folds_num
        valid_index = cv_idx[valid_k]["test"]

        train_index = [index for index in full_train_index if index not in valid_index]
        X_train, X_valid = X.loc[train_index,:], X.loc[valid_index,:]
        y_train, y_valid = y_df.loc[train_index,:], y_df.loc[valid_index,:]

        if (classifier in models_req_scaling):
            cols = X_train.columns

            # phase 1 scaling with just training data
            scaler_1 = StandardScaler() 
            scaler_1.fit(X_train) 
            X_train = pd.DataFrame(scaler_1.transform(X_train))
            # apply same transformation to validation data
            X_valid = pd.DataFrame(scaler_1.transform(X_valid))

            #Restoring indices after scaling
            X_train.index = train_index 
            X_valid.index = valid_index

            #Restoring features names
            X_train.columns = cols
            X_valid.columns = cols

        #No down-sampling
        X_train_sampled = X_train
        y_train_sampled = y_train
        
        #pos and neg numbers in the training
        no_pos = np.count_nonzero(y_train_sampled["label"] == 1)
        no_neg = np.count_nonzero(y_train_sampled["label"] == 0)  
        
        #fit to training data
        if (classifier == "NN"):
            if hyperparameters["weight"] == "balanced":              
                #weight vector
                neg_weight = float(no_pos) / float(no_neg + no_pos) 
                pos_weight = 1 - neg_weight
            elif hyperparameters["weight"] == "0.1":
                neg_weight = 10
                pos_weight = 1
            elif hyperparameters["weight"] == "None":
                neg_weight = 1
                pos_weight = 1
            
            weight = torch.Tensor([neg_weight, pos_weight])
            model = generate_model(classifier_method, hyperparameters)
            auprc_score,epoch_count = model.fit(X_train_sampled, y_train_sampled["label"],X_valid, y_valid["label"], weight, cost = "ROC_AUC_Loss")
            probs_list = model.predict_proba(X_valid)
            auc_score = roc_auc_score(y_valid, probs_list)
        elif (classifier == "XGB"):
            num_early_stopping_rounds = 750
            model = generate_model(classifier_method, hyperparameters, no_pos = no_pos, no_neg = no_neg)
            model.fit(X_train_sampled, y_train_sampled["label"], eval_set = [(X_valid,y_valid["label"])], eval_metric = "map", 
                      verbose=False, early_stopping_rounds = num_early_stopping_rounds)
            probs_list = []
            probs = model.predict_proba(X_valid, ntree_limit=model.best_ntree_limit)
            for l in probs:
                probs_list.append(l[1])
            precision, recall, _ = precision_recall_curve(y_valid, probs_list)
            auprc_score = auc(recall, precision)
            auc_score = roc_auc_score(y_valid, probs_list)
            print "model.best_iteration = "+str(model.best_iteration)
            epoch_count = model.best_ntree_limit
            

        else:            
            model = generate_model(classifier_method, hyperparameters)
            model.fit(X_train_sampled, y_train_sampled["label"])
            probs_list = []
            probs = model.predict_proba(X_valid)
            for l in probs:
                probs_list.append(l[1])
            precision, recall, _ = precision_recall_curve(y_valid, probs_list)
            auprc_score = auc(recall, precision)
            auc_score = roc_auc_score(y_valid, probs_list)
        
        print "AUPRC = "+str(auprc_score)
        print "AUC = "+str(auc_score)
        trial_auprc_results[i] = auprc_score 
        trial_auc_results[i] = auc_score 
        if classifier == "NN" or classifier == "XGB": epoch_counts[i] = epoch_count
    
    mean_auprc_result = np.mean(trial_auprc_results)
    mean_auc_result = np.mean(trial_auc_results)
    var_auprc_result = np.var(trial_auprc_results)
    var_auc_result = np.var(trial_auc_results)
    if classifier == "NN" or classifier == "XGB":
        mean_epoch_count = int(np.mean(epoch_counts))
        hyperparameters_dict["mean_epoch_count"] = mean_epoch_count

    hyperparameters_dict["mean_AUPRC"] = mean_auprc_result
    hyperparameters_dict["mean_AUC"] = mean_auc_result
    hyperparameters_dict["var_AUPRC"] = var_auprc_result
    hyperparameters_dict["var_AUC"] = var_auc_result
    hyperparameters_dict["trial_idx"] = trial_idx

    # Update dictionary with all hyperparameters
    keys = hyperparameters.keys()
    for key in keys:
        hyperparameters_dict[key].append(hyperparameters[key])
    pred_idx += 1

    print "Finished "+ligand+" "+classifier+" fold: "+fold+" trial: "+str(trial_idx)

#### Predict for each ligand seperatelly

In [21]:
%%time

#Initialize dictionary
hyperparameters_dict = defaultdict(list)

test_model_iterative_fixed(hyperparameters_dict,ligands_positives_df[ligand], ligands_negatives_df[ligand], ligand)

hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_dict)

#Save to file
hyperparameters_df.to_csv(curr_dir[0]+"/hyperparam_tuning/phase1_initial_run/"+datafile_date+"_"+prec_th_str+"/per_trial/"+ligand+"_"+classifier_method+"_fold"+fold+"_trial"+str(trial_idx)+"_"+str(folds_num)+"w_hyperparameters.csv", sep='\t')

print "Finished ligand "+ligand

fold #: 1
0.035294897367
0.0562895550077
0.0687229865923
0.0868657583094
0.0982637816699
0.118885837566
0.145292452519
0.157921072233
0.174895002582
0.164347331497
0.196181281786
0.204773616582
0.221646263879
0.221823911339
0.22855903127
0.238269192804
0.232823119317
0.23283517286
0.246304733655
0.262049099008
0.263424658985
0.273478862805
0.286143218053
0.287512257153
0.297615518403
0.303089051686
0.311814240308
0.325851340685
0.33454720557
0.328908112627
0.307501014513
0.315300702392
0.323678822392
0.339044274997
0.331248752021

Exception 




KeyboardInterrupt in 'zmq.backend.cython.message.Frame.__dealloc__' ignored


KeyboardInterrupt: 