In [2]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ
import json

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, average_precision_score, make_scorer
from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import scale
import xgboost as xgb

#import matplotlib.pylab as plt

#from matplotlib.pylab import rcParams
from sklearn import metrics   #Additional scklearn functions
#from sklearn.grid_search import 


#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.core.display import HTML
#HTML("<style>.container { width:100% !important; }</style>")

ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False

### Reading the input dataset

In [3]:
curr_dir = !pwd
curr_dir[0] = curr_dir[0] + "/.."
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
#filename = "positions_features_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#lignd binding domains dictionary
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
        negatives_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [4]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        ligands_negatives_df[ligand] = pd.DataFrame()
        for domain in negatives_dict[ligand].keys():
            if domain == 'negatives' or domain == 'domains':
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]
            
            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue
            
            if (use_max_binding_score):
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,"max_binding_score"] == 0,:]])
            else:
                ligand_bind_str = ligand+"_binding_score"
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,ligand_bind_str] == 0,:]])
        
    #Handeling the ligand "all_ligands"
    all_ligands_negatives_df = pd.concat([ligands_negatives_df["dna"], ligands_negatives_df["dnabase"], ligands_negatives_df["dnabackbone"], ligands_negatives_df["rna"], ligands_negatives_df["rnabase"], 
                                 ligands_negatives_df["rnabackbone"], ligands_negatives_df["ion"], ligands_negatives_df["peptide"], ligands_negatives_df["metabolite"]])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df
    
    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():   
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand][features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
    
    return ligands_negatives_df
            

In [5]:
def negatives_by_binding_score(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        if use_max_binding_score:
            ligand_bind_str = "max_binding_score"
        else:
            ligand_bind_str = ligand+"_binding_score"
        
        ligands_negatives_df[ligand] = features_all[features_all[ligand_bind_str] == 0]
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand].loc[:,features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
        
    #Handeling the ligand "all_ligands"
    ligands_negatives_df["all_ligands"] = features_all[features_all["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = ligands_negatives_df["all_ligands"].loc[:,features_cols]
    print("all_ligands non-binding #:"+str(len(ligands_negatives_df["all_ligands"])))
    
    return ligands_negatives_df

In [6]:
#Create negatives datasets
if FILTER_DOMAIN:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = filter_to_ligand_binding_domains(True)
    else:
        ligands_negatives_df = filter_to_ligand_binding_domains(False)
else:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = negatives_by_binding_score(True)
    else:
        ligands_negatives_df = negatives_by_binding_score(False)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [7]:
bind_th = 0.1
ligands_features_df = {}
    
for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    ligand_binding_df = features_all[features_all[score_col_str] >= bind_th]
    print ligand+" #: "+str(ligand_binding_df.shape[0])
    ligands_features_df[ligand] = ligand_binding_df.loc[:,features_cols]

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [8]:
all_ligands_features_df = pd.concat([ligands_features_df["dna"], ligands_features_df["dnabase"], ligands_features_df["dnabackbone"], ligands_features_df["rna"], ligands_features_df["rnabase"], 
                                     ligands_features_df["rnabackbone"], ligands_features_df["ion"], ligands_features_df["peptide"], ligands_features_df["metabolite"]])
all_ligands_features_df = all_ligands_features_df.drop_duplicates()
print "all_ligands #: "+str(all_ligands_features_df.shape[0])
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Reading env input for downsampler technique, ligand and classifier  

In [10]:
classifier = "XGB"
#Reading the ligand input
try:
    ligand = environ['ligand']
except:
    ligand = "dna"
print "ligand = "+ligand

ligand = dna


In [11]:
def test_model(clf, pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_negatives_features, features = []):
    """
    Test different models in 10-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Arranging the features table by the CV order, for each model
    #features_pred_dfs = dict.fromkeys(classifiers.keys())
    
    models_req_scaling = ["SVM", "KNN"]
    
    #classifier = classifier_method
    model = clf
    #print "classifier_method = " + classifier_method
    #print "ligand = " + ligand
    print model.get_xgb_params()
    #features_pred_dfs[classifier] = pd.DataFrame()
        
    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)

    binding_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    pred_idx = 1

    for train_index, test_index in binding_skf.split(X, y):
            
        print "fold #: "+str(pred_idx)
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train_sampled = X_train
        y_train_sampled = y_train
            
            
        #early_stopping validation set 
        #7.8 Early stopping and Algorithm 7.2
        #http://egrcc.github.io/docs/dl/deeplearningbook-regularization.pdf
            
        #X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train_sampled, 
        #                                y_train_sampled, stratify=y_train_sampled, test_size = .1)
            
        #fit to training data
        #model = classifiers[classifier]
        model.fit(X_train_sampled, y_train_sampled)
        
        #model.fit(X_subtrain, y_subtrain, eval_set = [(X_valid,y_valid)],eval_metric = "map", early_stopping_rounds = 50,verbose = False)
        #print model.best_ntree_limit
        probs_list = []

        #probs = model.predict(X_test)
        #probs_list = probs
            
        probs = model.predict_proba(X_test) #,ntree_limit=model.best_ntree_limit)
        for l in probs:
            probs_list.append(l[1])
                
        pred_dict["obs"].extend(y_test)
        pred_dict["prob"].extend(probs_list)
        fold_list = [pred_idx] * len(probs_list)
        pred_dict["fold"].extend(fold_list)

        model_list = [classifier] * len(probs_list)
        pred_dict["model"].extend(model_list)

        #Update auc auprc dictionaries
        auc_dict[classifier].append(roc_auc_score(y_test, probs[:, 1]))
        precision, recall, _ = precision_recall_curve(y_test, probs[:, 1])
            
        #auc_dict[classifier].append(roc_auc_score(y_test, probs))
        #precision, recall, _ = precision_recall_curve(y_test, probs)
            
        auprc_dict[classifier].append(auc(recall, precision))
            
        #Update features table
        #features_pred_dfs[classifier] = features_pred_dfs[classifier].append(X_test)
        pred_idx += 1
            
        print "AUC = "+str(auc_dict[classifier][-1])
        print "AUPRC = "+str(auprc_dict[classifier][-1])

    avg_auc = np.sum(auc_dict[classifier])/10.0
    print "avg auc = "+str(avg_auc) 
        
    avg_auprc = np.sum(auprc_dict[classifier])/10.0
    print "avg auprc = "+str(avg_auprc)
            
    print "Finished "+ligand+" "+classifier
        
    #return Average AUPRC
    return avg_auprc

### Test model functions

In [12]:
#adapted from https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
#
#Using cross-validation, continues adding estimators until auc does not improve within 50 rounds
#
#Towards the end of my runs I stopped using this in favor of using test_model with different value of n_estimators.
def modelfit(alg, ligand_bind_features, ligand_negatives_features, ligand_name, useTrainCV=True, cv_folds=10, early_stopping_rounds=50):
    
    features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    
    print "modelfit"
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(X, label=y)
    #print alg.get_params()['n_estimators']
    
    #metrics can be changed to a variety of things including "map" for Mean Average Precision (same as Average AUPRC) 
    #I didn't have much luck with that though. 
    #See under eval_metric in https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, 
                      metrics='auc', early_stopping_rounds=early_stopping_rounds,verbose_eval =True)
    alg.set_params(n_estimators=cvresult.shape[0])
    print "Optimal n_estimators: " + str(cvresult.shape[0])
    
    return alg,cvresult#,dtrain_predictions,dtrain_predprob,alg


In [13]:
ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
features = features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
y = [1] * ligand_bind_features.shape[0]
y.extend([0] * ligand_negatives_features.shape[0])
y = np.array(y)

#Calcuate a good value for scale_pos_weight, which gives that value to positive examples. 
#A common value is the ratio of number of negative examples to number of positive examples (#neg/#pos)

val_scale_pos_weight = len([y[i] for i in range(len(y)) if y[i]==0])/len([y[i] for i in range(len(y)) if y[i]==1])
print val_scale_pos_weight

76


In [41]:
%%time

#Uses cross validation and early stopping to find best number of estimators

#I started using the cell below to find the best number of estimators, but I am keeping it here just as another method.

#xgb1 = XGBClassifier(
# learning_rate =0.1,
# n_estimators=1000,
# max_depth=6,
# min_child_weight=1,
# gamma=0,
# subsample=0.8,
# colsample_bytree=0.8,
# objective= 'binary:logistic',
# n_jobs=-1,
# scale_pos_weight=val_scale_pos_weight,
# random_state = 0)

#returns = modelfit(xgb1, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
#print "Optimal n_estimators for ligand("+str(ligand)+"): "+str(returns[1].shape[0]) 
#optimized_n_est = returns[1].shape[0] 

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 51 µs


In [47]:
%%time

#Search for best values of n_estimators
#in the given range, which can be changed by changing the ranges in the for-loop
#Multithreading is used to speed up the process

from multiprocessing.dummy import Pool as ThreadPool
#can change number of threads (problem was running out of memory on the cluster)
pool = ThreadPool(20)

def test_model_for_map(clf):
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    return test_model(clf, pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand])
 
classifiers = []
params = {} 

for n_est in range(1,1025,25):
    #create an XGB classifier with the given parameters.
    classifiers.append(XGBClassifier(n_estimators=n_est, n_jobs = -1, random_state=0, 
            max_depth=5, min_child_weight=1, gamma = 0,
            colsample_bytree=.8, subsample=.8, scale_pos_weight = val_scale_pos_weight))
    
results = pool.map(test_model_for_map,classifiers)
pool.close()
pool.join()
    
best_score = 0
best_param = ""
for i in range(0,len(classifiers),1):
    print "{n_estimators: " + str(classifiers[i].get_params()['n_estimators']) + "}" + ": "+  str(results[i])
    if(results[i] > best_score):
        optimized_n_est = classifiers[i].get_params()['n_estimators']
        best_score = results[i]
        
print "\nBest params by AUPRC: " + "{n_estimators: " + str(classifiers[i].get_params()['n_estimators']) + "}" + ": " +  str(best_score)


{'reg_alpha': 0, 'colsample_bytree': 0.8, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 76, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1, 'subsample': 0.8, 'reg_lambda': 1, 'min_child_weight': 1, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 5, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9020148216767022
AUPRC = 0.18338141268266292
fold #: 2
AUC = 0.8645354330708661
AUPRC = 0.15289504422998046
fold #: 3
AUC = 0.9146430446194225
AUPRC = 0.16807593840468993
fold #: 4
AUC = 0.8884173228346457
AUPRC = 0.22211158402769038
fold #: 5
AUC = 0.8760498687664041
AUPRC = 0.14509397936518023
fold #: 6
AUC = 0.8596613284326595
AUPRC = 0.09222608828393275
fold #: 7
AUC = 0.912032029404043
AUPRC = 0.19710307279178088
fold #: 8
AUC = 0.8587188238382778
AUPRC = 0.15018286018568838
fold #: 9
AUC = 0.88789183512733
AUPRC = 0.21054824539882766
fold #: 10
AUC = 0.8848227881333683
AUPRC = 0.12844788618236735
avg auc = 0.884878729

In [48]:
#Search for best values of max_depth, min_child_weight, and gamma (which are all related to complexity of the model)
#in the given ranges, which can be changed by changing the ranges in the for-loops
#Multithreading is used to speed up the process

#Dummy module is the same as multiprocessing, but is for multithreading
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(20)

#Helper function that takes a classifier and runs test_model and returns the Average PRC (score)
def test_model_for_map(clf):
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    return test_model(clf, pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand])

classifiers = []
params = {}

#11,6,11
for max_depth in range(3,11,1):
    for min_child_weight in range(0,6,1):
        for gam in [i/10.0 for i in range(0,11,1)]:
            #create an XGB classifier with the given parameters. 
            #colsample_bytree and subsample are fixed to 0.8.
            #n_estimator from previous section.
            classifiers.append(XGBClassifier(n_estimators=optimized_n_est, n_jobs = -1, random_state=0, 
                max_depth=max_depth, min_child_weight=min_child_weight, gamma = gam,
                colsample_bytree=.8, subsample=.8, scale_pos_weight = val_scale_pos_weight))

#runs test_model_for_map function on each classifier in classifiers
results = pool.map(test_model_for_map,classifiers)
pool.close()
pool.join()


#Print out params and their Average AUPRC and get parameter which had best Average AUPRC (score)
best_score = 0
best_param = ""
for i in range(0,len(classifiers),1):
    print "{max_depth: " + str(classifiers[i].get_params()['max_depth']) + ", min_child_weight: " + str(classifiers[i].get_params()['min_child_weight']) +  ", gamma: " + str(classifiers[i].get_params()['gamma'])+"}" + ": "+  str(results[i])
    if(results[i] > best_score):
        opt_max_depth = classifiers[i].get_params()['max_depth']
        opt_min_child_weight = classifiers[i].get_params()['min_child_weight']
        opt_gamma = classifiers[i].get_params()['gamma']
        best_score = results[i]
        
print "\nBest params by AUPRC: " + "{max_depth: " + str(classifiers[i].get_params()['max_depth']) + ", min_child_weight: " + str(classifiers[i].get_params()['min_child_weight']) +  ", gamma: " + str(classifiers[i].get_params()['gamma'])+"}" + ": " +  str(best_score)

[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=0.0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=0, missing=None,
       n_estimators=1, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=76, seed=None, silent=True,
       subsample=1.0)]
{'reg_alpha': 0, 'colsample_bytree': 1.0, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 76, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1, 'subsample': 1.0, 'reg_lambda': 1, 'min_child_weight': 0, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 3, 'gamma': 0.0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.8213627708301169
AUPRC = 0.155121692799043
fold #: 2
AUC = 0.7931496062992127
AUPRC = 0.24075672102751486
fold #: 3
AUC = 0.8046745406824147
AUPRC = 0.2477251778766975
fold #: 4
AUC = 0.7735511811023623
AUPRC = 0.2146136922218

In [49]:
%%time

#Search for best values of subsample and colsample_bytree (which are all related to robustness of the model)
#in the given ranges, which can be changed by changing the ranges in the for-loops
#Multithreading is used to speed up the process

pool2 = ThreadPool(20)
classifiers = []
params = {}

#11,11
for colsamp in range(1,11,1):
    for subsamp in range(1,11,1):
        #create an XGB classifier with the given parameters. 
        #n_estimators, max_depth, min_child_weight, and gamma are the best values from the previous sections
        classifiers.append(XGBClassifier(n_estimators=optimized_n_est, n_jobs = -1, random_state=0, 
            max_depth=opt_max_depth, gamma = .1, min_child_weight=opt_min_child_weight, 
            colsample_bytree=colsamp/10.0, subsample=subsamp/10.0, scale_pos_weight = val_scale_pos_weight,))

results = pool2.map(test_model_for_map,classifiers)
pool2.close()
pool2.join()

best_score = 0
best_param = ""
for i in range(0,len(classifiers),1):
    print "{colsample_bytree: " + str(classifiers[i].get_params()['colsample_bytree']) + ", subsample:" + str(classifiers[i].get_params()['subsample']) + "}" + ": "+  str(results[i])
    if(results[i] > best_score):
        opt_colsample_bytree = classifiers[i].get_params()['colsample_bytree']
        opt_subsample = classifiers[i].get_params()['subsample']
        best_score = results[i]
        
print "\nBest params by AUPRC: " + "{colsample_bytree: " + str(classifiers[i].get_params()['colsample_bytree']) + ", subsample:" + str(classifiers[i].get_params()['subsample']) + "}" + ": " +  str(best_score)


{'reg_alpha': 0, 'colsample_bytree': 0.1, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 76, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1, 'subsample': 0.1, 'reg_lambda': 1, 'min_child_weight': 0, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 3, 'gamma': 0.1, 'booster': 'gbtree'}
fold #: 1
AUC = 0.6844346662549534
AUPRC = 0.07516296957165385
fold #: 2
AUC = 0.7016797900262467
AUPRC = 0.240605915373761
fold #: 3
AUC = 0.6508372703412073
AUPRC = 0.05828533830461648
fold #: 4
AUC = 0.652496062992126
AUPRC = 0.02199133757365954
fold #: 5
AUC = 0.6744514435695537
AUPRC = 0.05023191685898522
fold #: 6
AUC = 0.6046573903911789
AUPRC = 0.04405999546487148
fold #: 7
AUC = 0.5877658177999474
AUPRC = 0.11099825384490054
fold #: 8
AUC = 0.6117327382515096
AUPRC = 0.07007895062277963
fold #: 9
AUC = 0.5713993174061434
AUPRC = 0.029572950545114034
fold #: 10
AUC = 0.6381937516408506
AUPRC = 0.15912260469696665
avg auc = 0.637764

In [None]:
#I didn't do this in the last run I did on the cluster, but you could run either of the next two sections to retune the
#n_estimator parameter with the new values of hyperparameters found above.

In [None]:
%%time

#Search for best values of n_estimators
#in the given range, which can be changed by changing the ranges in the for-loop
#Multithreading is used to speed up the process

from multiprocessing.dummy import Pool as ThreadPool
#can change number of threads (problem was running out of memory on the cluster)
pool = ThreadPool(20)

def test_model_for_map(clf):
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    return test_model(clf, pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand])
 
classifiers = []
params = {} 

for n_est in range(1,1025,25):
    #create an XGB classifier with the given parameters.
    classifiers.append(XGBClassifier(n_estimators=n_est, n_jobs = -1, random_state=0, 
            max_depth=opt_max_depth, min_child_weight=opt_min_child_weight, gamma = opt_gamma,
            colsample_bytree=opt_colsample_bytree, subsample=opt_subsample, scale_pos_weight = val_scale_pos_weight))
    
results = pool.map(test_model_for_map,classifiers)
pool.close()
pool.join()
    
best_score = 0
best_param = ""
for i in range(0,len(classifiers),1):
    print "{n_estimators: " + str(classifiers[i].get_params()['n_estimators']) + "}" + ": "+  str(results[i])
    if(results[i] > best_score):
        optimized_n_est_2 = classifiers[i].get_params()['n_estimators']
        best_score = results[i]
        
print "\nBest params by AUPRC: " + "{n_estimators: " + str(classifiers[i].get_params()['n_estimators']) + "}" + ": " +  str(best_score)


In [None]:
%%time

#Uses cross validation and early stopping to find best number of estimators after finding best parameters
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight,
 gamma=opt_gamma,
 subsample=opt_subsample,
 colsample_bytree=opt_colsample_bytree,
 objective= 'binary:logistic',
 n_jobs=-1,
 scale_pos_weight=val_scale_pos_weight,
 random_state = 0)

#returns = modelfit(xgb2, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
#print "Optimal n_estimators for ligand("+str(ligand)+"): "+str(returns[1].shape[0]) 
#optimized_n_est_2 = returns[1].shape[0] 

In [50]:
#If you do either of the above sections you can uncomment the next line
#optimized_n_est = optimized_n_est_2
print "Best params for {0}:\nnum_estimators = {6}\nmax_depth = {1}\nmin_child_weight = {2}\ngamma = {3}\nsubsample = {4}\ncolsample_bytree = {5}\nscale_pos_weight = {7}".format(ligand,opt_max_depth,opt_min_child_weight,opt_gamma,opt_subsample,opt_colsample_bytree,optimized_n_est,val_scale_pos_weight)


Best params for dna:
num_estimators = 1
max_depth = 3
min_child_weight = 0
gamma = 0.0
subsample = 0.1
colsample_bytree = 0.1
scale_pos_weight = 76
