In [33]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ
import json

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, average_precision_score, make_scorer
from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import scale
import xgboost as xgb

#import matplotlib.pylab as plt

#from matplotlib.pylab import rcParams
from sklearn import metrics   #Additional scklearn functions
#from sklearn.grid_search import 


#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.core.display import HTML
#HTML("<style>.container { width:100% !important; }</style>")

ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False

### Reading the input dataset

In [34]:
curr_dir = !pwd
curr_dir[0] = curr_dir[0] + "/.."
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
#filename = "positions_features_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#lignd binding domains dictionary
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
        negatives_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [35]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        ligands_negatives_df[ligand] = pd.DataFrame()
        for domain in negatives_dict[ligand].keys():
            if domain == 'negatives' or domain == 'domains':
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]
            
            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue
            
            if (use_max_binding_score):
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,"max_binding_score"] == 0,:]])
            else:
                ligand_bind_str = ligand+"_binding_score"
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,ligand_bind_str] == 0,:]])
        
    #Handeling the ligand "all_ligands"
    all_ligands_negatives_df = pd.concat([ligands_negatives_df["dna"], ligands_negatives_df["dnabase"], ligands_negatives_df["dnabackbone"], ligands_negatives_df["rna"], ligands_negatives_df["rnabase"], 
                                 ligands_negatives_df["rnabackbone"], ligands_negatives_df["ion"], ligands_negatives_df["peptide"], ligands_negatives_df["metabolite"]])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df
    
    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():   
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand][features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
    
    return ligands_negatives_df
            

In [7]:
def negatives_by_binding_score(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        if use_max_binding_score:
            ligand_bind_str = "max_binding_score"
        else:
            ligand_bind_str = ligand+"_binding_score"
        
        ligands_negatives_df[ligand] = features_all[features_all[ligand_bind_str] == 0]
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand].loc[:,features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
        
    #Handeling the ligand "all_ligands"
    ligands_negatives_df["all_ligands"] = features_all[features_all["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = ligands_negatives_df["all_ligands"].loc[:,features_cols]
    print("all_ligands non-binding #:"+str(len(ligands_negatives_df["all_ligands"])))
    
    return ligands_negatives_df

In [9]:
#Create negatives datasets
if FILTER_DOMAIN:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = filter_to_ligand_binding_domains(True)
    else:
        ligands_negatives_df = filter_to_ligand_binding_domains(False)
else:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = negatives_by_binding_score(True)
    else:
        ligands_negatives_df = negatives_by_binding_score(False)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [10]:
bind_th = 0.1
ligands_features_df = {}
    
for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    ligand_binding_df = features_all[features_all[score_col_str] >= bind_th]
    print ligand+" #: "+str(ligand_binding_df.shape[0])
    ligands_features_df[ligand] = ligand_binding_df.loc[:,features_cols]

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [11]:
all_ligands_features_df = pd.concat([ligands_features_df["dna"], ligands_features_df["dnabase"], ligands_features_df["dnabackbone"], ligands_features_df["rna"], ligands_features_df["rnabase"], 
                                     ligands_features_df["rnabackbone"], ligands_features_df["ion"], ligands_features_df["peptide"], ligands_features_df["metabolite"]])
all_ligands_features_df = all_ligands_features_df.drop_duplicates()
print "all_ligands #: "+str(all_ligands_features_df.shape[0])
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Reading env input for downsampler technique, ligand and classifier  

In [57]:
#Reading the ligand input
try:
    ligand = environ['ligand']
except:
    ligand = "dna"
print "ligand = "+ligand

#Reading the max_depth input
try:
    max_depth = environ['max_depth']
except:
    max_depth = 1
    
#Reading the min_child_weight input
try:
    min_child_weight = environ['min_child_weight']
except:
    min_child_weight = 0

#Reading the gamma input
try:
    gam = environ['gamma']
except:
    gam = 0
    
#Reading the colsample_bytree input
try:
    colsample_bytree = environ['colsample_bytree']
except:
    colsample_bytree = 1.0

#Reading the subsample input
try:
    subsample = environ['subsample']
except:
    subsample = 1.0

ligand = dna
downsample_method = NoDown
classifier_method = XGB


In [13]:
def test_model(clf, pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_negatives_features, features = []):
    """
    Test different models in 10-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Arranging the features table by the CV order, for each model
    #features_pred_dfs = dict.fromkeys(classifiers.keys())
    
    models_req_scaling = ["SVM", "KNN"]
    
    classifier = classifier_method
    model = clf
    #print "classifier_method = " + classifier_method
    #print "ligand = " + ligand
    print model.get_xgb_params()
    #features_pred_dfs[classifier] = pd.DataFrame()
        
    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)

    binding_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    pred_idx = 1

    for train_index, test_index in binding_skf.split(X, y):
            
        print "fold #: "+str(pred_idx)
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]
        
        X_train_sampled = X_train
        y_train_sampled = y_train
            
            
        #early_stopping validation set 
        #7.8 Early stopping and Algorithm 7.2
        #http://egrcc.github.io/docs/dl/deeplearningbook-regularization.pdf
            
        #X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train_sampled, 
        #                                y_train_sampled, stratify=y_train_sampled, test_size = .1)
            
        #fit to training data
        #model = classifiers[classifier]
        model.fit(X_train_sampled, y_train_sampled)#,
                      #eval_set = [(X_valid,y_valid)],eval_metric = "map", early_stopping_rounds = 50,verbose = False)
        #print model.best_ntree_limit
        probs_list = []

        #probs = model.predict(X_test)
        #probs_list = probs
            
        probs = model.predict_proba(X_test) #,ntree_limit=model.best_ntree_limit)
        for l in probs:
            probs_list.append(l[1])
                
        pred_dict["obs"].extend(y_test)
        pred_dict["prob"].extend(probs_list)
        fold_list = [pred_idx] * len(probs_list)
        pred_dict["fold"].extend(fold_list)

        model_list = [classifier] * len(probs_list)
        pred_dict["model"].extend(model_list)

        #Update auc auprc dictionaries
        auc_dict[classifier].append(roc_auc_score(y_test, probs[:, 1]))
        precision, recall, _ = precision_recall_curve(y_test, probs[:, 1])
            
        #auc_dict[classifier].append(roc_auc_score(y_test, probs))
        #precision, recall, _ = precision_recall_curve(y_test, probs)
            
        auprc_dict[classifier].append(auc(recall, precision))
            
        #Update features table
        #features_pred_dfs[classifier] = features_pred_dfs[classifier].append(X_test)
        pred_idx += 1
            
        print "AUC = "+str(auc_dict[classifier][-1])
        print "AUPRC = "+str(auprc_dict[classifier][-1])

    avg_auc = np.sum(auc_dict[classifier])/10.0
    print "avg auc = "+str(avg_auc) 
        
    avg_auprc = np.sum(auprc_dict[classifier])/10.0
    print "avg auprc = "+str(avg_auprc)
            
    print "Finished "+ligand+" "+classifier
        
    #return Average AUPRC
    return avg_auprc

### Test model functions

In [60]:
#adapted from https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
#
#Using cross-validation, continues adding estimators until auc does not improve within 50 rounds
def modelfit(alg, ligand_bind_features, ligand_negatives_features, ligand_name, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    
    print "modelfit"
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(X, label=y)
    #print alg.get_params()['n_estimators']
    
    #metrics can be changed to a variety of things including "map" for Mean Average Precision (same as AveragePRC) 
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, 
                      metrics='auc', early_stopping_rounds=early_stopping_rounds,verbose_eval =True)
    alg.set_params(n_estimators=cvresult.shape[0])
    print "Optimal n_estimators: " + str(cvresult.shape[0])
    
    return alg,cvresult#,dtrain_predictions,dtrain_predprob,alg


In [40]:
ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
features = features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
y = [1] * ligand_bind_features.shape[0]
y.extend([0] * ligand_negatives_features.shape[0])
y = np.array(y)

#Calcuate a good value for scale_pos_weight, which gives that value to positive examples. 
#A common value is the ratio of number of negative examples to number of positive examples (#neg/#pos)

val_scale_pos_weight = len([y[i] for i in range(len(y)) if y[i]==0])/len([y[i] for i in range(len(y)) if y[i]==1])
print val_scale_pos_weight

76


In [61]:
%%time

#Uses cross validation and early stopping to find best number of estimators
xgb1 = XGBClassifier(
     learning_rate =0.1,
     n_estimators=1000,
     max_depth=9,
     min_child_weight=0,
     gamma=0,
     subsample=1.0,
     colsample_bytree=1.0,
     objective= 'binary:logistic',
     n_jobs=-1,
     scale_pos_weight=val_scale_pos_weight,
     random_state=0)

#returns = modelfit(xgb1, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
returns = [xgb1,np.zeros(141)]
print "Optimal n_estimators for ligand("+str(ligand)+"): "+str(returns[1].shape[0]) 
optimized_n_est = returns[1].shape[0] 

modelfit
[0]	train-map:0.364542+0.0191594	test-map:0.148676+0.0308632
[1]	train-map:0.60257+0.0505605	test-map:0.211796+0.0406174
[2]	train-map:0.735801+0.0374324	test-map:0.250428+0.0251336
[3]	train-map:0.807666+0.0230271	test-map:0.285107+0.024937
[4]	train-map:0.849375+0.0257286	test-map:0.300232+0.0250184
[5]	train-map:0.887256+0.0231454	test-map:0.328638+0.0338323
[6]	train-map:0.904911+0.022045	test-map:0.359593+0.040778
[7]	train-map:0.920043+0.0137323	test-map:0.35917+0.0427617
[8]	train-map:0.933557+0.0181996	test-map:0.368494+0.038123
[9]	train-map:0.943885+0.0141974	test-map:0.386678+0.0384341
[10]	train-map:0.951101+0.0128876	test-map:0.401048+0.0419737
[11]	train-map:0.955613+0.0108501	test-map:0.411558+0.0438196
[12]	train-map:0.960014+0.00860242	test-map:0.415325+0.0373681
[13]	train-map:0.96522+0.00916513	test-map:0.431717+0.0343965
[14]	train-map:0.969651+0.00992425	test-map:0.440803+0.0383049
[15]	train-map:0.973572+0.00710258	test-map:0.442401+0.0374592
[16]	train-m

In [None]:
%%time

#Search for best values of max_depth, min_child_weight, and gamma (which are all related to complexity of the model)
#in the given ranges, which can be changed by changing the ranges in the for-loops
#Multithreading is used to speed up the process

#Dummy module is the same as multiprocessing, but is for multithreading
#from multiprocessing.dummy import Pool as ThreadPool
#pool = ThreadPool(20)

#Helper function that takes a classifier and runs test_model and returns the Average PRC (score)
def test_model_for_map(clf):
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    return test_model(clf, pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand])

classifiers = []
params = {}

#create an XGB classifier with the given parameters. 
#colsample_bytree and subsample are fixed to 0.8.
classifiers.append(XGBClassifier(n_estimators=optimized_n_est, n_jobs = -1, random_state=0, 
    max_depth=max_depth, min_child_weight=min_child_weight, gamma = gam,
    colsample_bytree=.8, subsample=.8, scale_pos_weight = val_scale_pos_weight))
#Keep a dictionary of a string of the parameters and the actual values
#params["{max_depth: " + str(max_depth) + ", min_child_weight: " + str(min_child_weight) +  ", gamma: " + str(gam)+"}"] = [max_depth,min_child_weight,gam]

#runs test_model_for_map function on each classifier in classifiers
#results = pool.map(test_model_for_map,classifiers)
#pool.close()
#pool.join()


#Print out params and their Average AUPRC and get parameter which had best Average AUPRC (score)
#best_score = 0
#best_param = ""
#ps = params.keys()
#for i in range(0,len(classifiers),1):
#   print str(ps[i]) + ": "+  str(results[i])
#    if(results[i] > best_score):
#        best_param = ps[i]
#        best_score = results[i]
        
print "\nBest params by AUPRC: " + best_param + ": " +  str(best_score)

best_vals = params[best_param]
opt_max_depth = best_vals[0]
opt_min_child_weight = best_vals[1]
opt_gamma = best_vals[2]

%%time

#Search for best values of max_depth, min_child_weight, and gamma (which are all related to complexity of the model)
#in the given ranges, which can be changed by changing the ranges in the for-loops
#Multithreading is used to speed up the process

#Dummy module is the same as multiprocessing, but is for multithreading
from multiprocessing.dummy import Pool as ThreadPool
pool = ThreadPool(20)

#Helper function that takes a classifier and runs test_model and returns the Average PRC (score)
def test_model_for_map(clf):
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    return test_model(clf, pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand])

classifiers = []
params = {}

#10,6,10
for max_depth in range(3,10,1):
    for min_child_weight in range(0,6,1):
        for gam in [i/10.0 for i in range(0,10,1)]:
            #create an XGB classifier with the given parameters. 
            #colsample_bytree and subsample are fixed to 0.8.
            classifiers.append(XGBClassifier(n_estimators=optimized_n_est, n_jobs = -1, random_state=0, 
                max_depth=max_depth, min_child_weight=min_child_weight, gamma = gam,
                colsample_bytree=1.0, subsample=1.0, scale_pos_weight = val_scale_pos_weight))
            #Keep a dictionary of a string of the parameters and the actual values
            params["{max_depth: " + str(max_depth) + ", min_child_weight: " + str(min_child_weight) +  ", gamma: " + str(gam)+"}"] = [max_depth,min_child_weight,gam]

#runs test_model_for_map function on each classifier in classifiers
results = pool.map(test_model_for_map,classifiers)
pool.close()
pool.join()


#Print out params and their Average AUPRC and get parameter which had best Average AUPRC (score)
best_score = 0
best_param = ""
ps = params.keys()
for i in range(0,len(classifiers),1):
    print str(ps[i]) + ": "+  str(results[i])
    if(results[i] > best_score):
        best_param = ps[i]
        best_score = results[i]
        
print "\nBest params by AUPRC: " + best_param + ": " +  str(best_score)

best_vals = params[best_param]
opt_max_depth = best_vals[0]
opt_min_child_weight = best_vals[1]
opt_gamma = best_vals[2]

%%time

#Search for best values of subsample and colsample_bytree (which are all related to robustness of the model)
#in the given ranges, which can be changed by changing the ranges in the for-loops
#Multithreading is used to speed up the process

pool2 = ThreadPool(20)
classifiers = []
params = {}

#10,10
for colsamp in range(1,10,1):
    for subsamp in range(1,10,1):
        #create an XGB classifier with the given parameters. 
        #max_depth, min_child_weight, and gamma are the best values from the previous section
        classifiers.append(XGBClassifier(n_estimators=optimized_n_est, n_jobs = -1, random_state=0, 
            max_depth=opt_max_depth, gamma = .1, min_child_weight=opt_min_child_weight, 
            colsample_bytree=colsamp/10.0, subsample=subsamp/10.0, scale_pos_weight = val_scale_pos_weight,))
        params["{colsample_bytree: " + str(colsamp/10.0) + ", subsample:" + str(subsamp/10.0) + "}"] = [colsamp/10.0,subsamp/10.0]

results = pool2.map(test_model_for_map,classifiers)
pool2.close()
pool2.join()

best_score = 0
best_param = ""
ps = params.keys()
for i in range(0,len(classifiers),1):
    print str(ps[i]) + ": "+  str(results[i])
    if(results[i] > best_score):
        best_param = ps[i]
        best_score = results[i]
        
print "\nBest params by AUPRC: " + best_param + ": " +  str(best_score)

best_vals = params[best_param]
opt_colsample_bytree = best_vals[0]
opt_subsample = best_vals[1]

In [None]:
print "Best params for {0}:\nmax_depth = {1}\nmin_child_weight = {2}\ngamma = {3}\nsubsample = {4}\ncolsample_bytree = {5}".format(ligand,opt_max_depth,opt_min_child_weight,opt_gamma,opt_subsample,opt_colsample_bytree)
