In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, average_precision_score, make_scorer
from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import scale
import xgboost as xgb

#import matplotlib.pylab as plt

#from matplotlib.pylab import rcParams
from sklearn import metrics   #Additional scklearn functions
#from sklearn.grid_search import 


#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.core.display import HTML
#HTML("<style>.container { width:100% !important; }</style>")

ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False

### Reading the input dataset

In [2]:
curr_dir = !pwd
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
#filename = "positions_features_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#lignd binding domains dictionary
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
        negatives_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [3]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        ligands_negatives_df[ligand] = pd.DataFrame()
        for domain in negatives_dict[ligand].keys():
            if domain == 'negatives' or domain == 'domains':
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]
            
            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue
            
            if (use_max_binding_score):
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,"max_binding_score"] == 0,:]])
            else:
                ligand_bind_str = ligand+"_binding_score"
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,ligand_bind_str] == 0,:]])
        
    #Handeling the ligand "all_ligands"
    all_ligands_negatives_df = pd.concat([ligands_negatives_df["dna"], ligands_negatives_df["dnabase"], ligands_negatives_df["dnabackbone"], ligands_negatives_df["rna"], ligands_negatives_df["rnabase"], 
                                 ligands_negatives_df["rnabackbone"], ligands_negatives_df["ion"], ligands_negatives_df["peptide"], ligands_negatives_df["metabolite"]])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df
    
    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():   
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand][features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
    
    return ligands_negatives_df
            

In [4]:
def negatives_by_binding_score(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        if use_max_binding_score:
            ligand_bind_str = "max_binding_score"
        else:
            ligand_bind_str = ligand+"_binding_score"
        
        ligands_negatives_df[ligand] = features_all[features_all[ligand_bind_str] == 0]
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand].loc[:,features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
        
    #Handeling the ligand "all_ligands"
    ligands_negatives_df["all_ligands"] = features_all[features_all["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = ligands_negatives_df["all_ligands"].loc[:,features_cols]
    print("all_ligands non-binding #:"+str(len(ligands_negatives_df["all_ligands"])))
    
    return ligands_negatives_df

In [5]:
#Create negatives datasets
if FILTER_DOMAIN:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = filter_to_ligand_binding_domains(True)
    else:
        ligands_negatives_df = filter_to_ligand_binding_domains(False)
else:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = negatives_by_binding_score(True)
    else:
        ligands_negatives_df = negatives_by_binding_score(False)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [6]:
bind_th = 0.1
ligands_features_df = {}
    
for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    ligand_binding_df = features_all[features_all[score_col_str] >= bind_th]
    print ligand+" #: "+str(ligand_binding_df.shape[0])
    ligands_features_df[ligand] = ligand_binding_df.loc[:,features_cols]

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [7]:
all_ligands_features_df = pd.concat([ligands_features_df["dna"], ligands_features_df["dnabase"], ligands_features_df["dnabackbone"], ligands_features_df["rna"], ligands_features_df["rnabase"], 
                                     ligands_features_df["rnabackbone"], ligands_features_df["ion"], ligands_features_df["peptide"], ligands_features_df["metabolite"]])
all_ligands_features_df = all_ligands_features_df.drop_duplicates()
print "all_ligands #: "+str(all_ligands_features_df.shape[0])
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Reading env input for downsampler technique, ligand and classifier  

In [8]:
#Reading the ligand input
try:
    ligand = environ['ligand']
except:
    ligand = "dna"
print "ligand = "+ligand
    
#Reading the downsampler input
try: 
    downsample_method = environ['down']
except:
    downsample_method = "NoDown"
print "downsample_method = "+downsample_method

#Reading the classifier input
try: 
    classifier_method = environ['classifier']
except:
    classifier_method = "XGB"
print "classifier_method = "+classifier_method

ligand = dna
downsample_method = NoDown
classifier_method = XGB


### Test model functions

In [9]:
def modelfit(alg, ligand_bind_features, ligand_negatives_features, ligand_name, useTrainCV=True, cv_folds=10, early_stopping_rounds=50):
    
    features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    
    print "modelfit"
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(X, label=y)
    #print alg.get_params()['n_estimators']
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, 
                      metrics='auc', early_stopping_rounds=early_stopping_rounds,verbose_eval =True)
    alg.set_params(n_estimators=cvresult.shape[0])
    print "Optimal n_estimators: " + str(cvresult.shape[0])
    
    #Fit the algorithm on the data
    #print "fitting"
    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.25)
    #print X_train
    %time alg.fit(X_train, y_train)
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_train)
    dtrain_predprob = alg.predict_proba(X_train)[:,1]
    
    #Predict test set:
    probs = alg.predict_proba(X_test)
    
    #Print model report:
    #print "\nModel Report"
    auc_score = roc_auc_score(y_test, probs[:, 1])
    #print y_test
    #print probs[:, 1]
    precision , recall, _ = precision_recall_curve(y_test, probs[:, 1])
    auprc = auc(recall, precision)    

    #Print model report:
    print "\nModel Report"
    print "Accuracy(Train): %.4g" % metrics.accuracy_score(y_train, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(y_train, dtrain_predprob)
    print "Average Precision (Train) : %.4g" % metrics.average_precision_score(y_train, dtrain_predprob)
    #print "AUC (Test) = "+str(auc_score)
    print "AUPRC (Test) = "+str(auprc)
    """               
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    """
    return alg,cvresult#,dtrain_predictions,dtrain_predprob,alg


%%time
#Choose all predictors except target & IDcols

#%matplotlib inline
#rcParams['figure.figsize'] = 12, 4

"""
ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
features = features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
train = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

y = [1] * ligand_bind_features.shape[0]
y.extend([0] * ligand_negatives_features.shape[0])
y = np.array(y)
train = train.assign(Disbursed=y)
target = 'Disbursed'
IDcol = 'ID'
predictors = [x for x in train.columns if x not in [target, IDcol]]
"""

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
print "about to run"
returns = modelfit(xgb1, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
print "Optimal n_estimators: "+str(returns[1].shape[0]) 
optimized_n_est = returns[1].shape[0] 

In [10]:
ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
features = features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
y = [1] * ligand_bind_features.shape[0]
y.extend([0] * ligand_negatives_features.shape[0])
y = np.array(y)

In [11]:
print len([y[i] for i in range(len(y)) if y[i]==1])
print len([y[i] for i in range(len(y)) if y[i]==0])
val_scale_pos_weight = len([y[i] for i in range(len(y)) if y[i]==0])/len([y[i] for i in range(len(y)) if y[i]==1])
print val_scale_pos_weight

501
38095
76


param_test9 = {
 'n_estimators':[100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500]
}
gsearch9 = GridSearchCV(estimator = XGBClassifier(n_estimators=1000, n_jobs=-1, 
                                                  random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5), 
param_grid = param_test9, scoring='roc_auc',n_jobs=1,iid=False, cv=10,verbose = 2)
gsearch9.fit(X,y)


gsearch9.grid_scores_

print gsearch9.best_params_, gsearch9.best_score_

param_test9 = {
 'n_estimators':[150,200,250]
}
gsearch9 = GridSearchCV(estimator = XGBClassifier(n_estimators=1000, n_jobs=-1, 
                                                  random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5), 
param_grid = param_test9, scoring='roc_auc',n_jobs=1,iid=False, cv=10,verbose = 2)
gsearch9.fit(X,y)
gsearch9.grid_scores_

param_test9 = {
 'n_estimators':[200,210,220,230,240,250]
}
gsearch9 = GridSearchCV(estimator = XGBClassifier(n_estimators=1000, n_jobs=-1, 
                                                  random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5), 
param_grid = param_test9, scoring='roc_auc',n_jobs=1,iid=False, cv=10,verbose = 2)
gsearch9.fit(X,y)
gsearch9.grid_scores_

gsearch9.best_params_

param_test9 = {
 'n_estimators':[150,160,170,180,190,200]
}
gsearch9 = GridSearchCV(estimator = XGBClassifier(n_estimators=1000, n_jobs=-1, 
                                                  random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5), 
param_grid = param_test9, scoring='roc_auc',n_jobs=1,iid=False, cv=10,verbose = 2)
gsearch9.fit(X,y)
gsearch9.grid_scores_
opt_n_est = gsearch9.best_params_["n_estimators"]

In [12]:
opt_n_est = 190

%%time

param_test_complexity = {
 'max_depth':range(1,10,1),
 'min_child_weight':range(0,6,1),
 'gamma':[i/10.0 for i in range(0,10)] 
}
print "Making GridSearchCV object"
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=190, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=5, scale_pos_weight=1, seed=27),
 param_grid = param_test_complexity, scoring='roc_auc',n_jobs=20,iid=False, cv=10)
print "Fitting"
gsearch1.fit(X,y)


gsearch1.grid_scores_

gsearch1.best_params_, gsearch1.best_score_

%%time

param_test_complexity = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(0,6,2),
 'gamma':[i/10.0 for i in range(0,10,2)] 
}
print "Making GridSearchCV object"
gsearch1 = GridSearchCV(estimator = XGBClassifier(n_estimators=190, random_state=0, max_depth=6, min_child_weight=0.05,
    colsample_bytree=0.5, scale_pos_weight = 76),
 param_grid = param_test_complexity, scoring='roc_auc',n_jobs=20,iid=False, cv=10,verbose =1)
print "Fitting"
gsearch1.fit(X,y)

gsearch1.grid_scores_

gsearch1.best_params_, gsearch1.best_score_

In [13]:
%%time

param_test_complexity = {
 'max_depth':[8,9,10],
 'min_child_weight':[0,.05,.25,.5,1,2,4,8,16,20],
 'gamma':[i/10.0 for i in range(0,4,1)] 
}
print "Making GridSearchCV object"
gsearch1 = GridSearchCV(estimator = XGBClassifier(n_estimators=190, random_state=0, max_depth=6, min_child_weight=0.05,
    colsample_bytree=0.5, scale_pos_weight = 76),
 param_grid = param_test_complexity, scoring='roc_auc',n_jobs=20,iid=False, cv=10,verbose =1)
print "Fitting"
gsearch1.fit(X,y)

Making GridSearchCV object
Fitting
Fitting 10 folds for each of 120 candidates, totalling 1200 fits


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:  2.7min
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed: 21.8min
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed: 57.9min
[Parallel(n_jobs=20)]: Done 760 tasks      | elapsed: 111.6min
[Parallel(n_jobs=20)]: Done 1200 out of 1200 | elapsed: 176.6min finished


CPU times: user 10min 17s, sys: 47.4 s, total: 11min 4s
Wall time: 2h 59min 24s


In [14]:
gsearch1.grid_scores_



[mean: 0.81585, std: 0.09610, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 0},
 mean: 0.81616, std: 0.10023, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 0.05},
 mean: 0.81189, std: 0.09699, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 0.25},
 mean: 0.81579, std: 0.08992, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 0.5},
 mean: 0.81921, std: 0.09815, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 1},
 mean: 0.81679, std: 0.09479, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 2},
 mean: 0.81713, std: 0.10111, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 4},
 mean: 0.81670, std: 0.09507, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 8},
 mean: 0.82629, std: 0.08992, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 16},
 mean: 0.82624, std: 0.08775, params: {'max_depth': 8, 'gamma': 0.0, 'min_child_weight': 20},
 mean: 0.82493, std: 0.09700, params: {'max_depth': 9, 'gamm

In [15]:
gsearch1.best_params_, gsearch1.best_score_

({'gamma': 0.1, 'max_depth': 9, 'min_child_weight': 20}, 0.8352674489866796)

%%time

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
print "Making GridSearchCV object"
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
 param_grid = param_test1, scoring='roc_auc',n_jobs=1,iid=False, cv=10, verbose=3)
print "Fitting"
#gsearch1.fit(X,y)
#gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

%%time 
#optimized_n_est = 98
param_test2 = {
 'max_depth':[1,2,3,4],#range(3,10,2),
 'min_child_weight':[4,5,6]#range(1,6,2)
}
print "Making GridSearchCV object"
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=1,iid=False, cv=5, verbose=10)
print "Fitting"
#gsearch2.fit(X,y)
#gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
#opt_max_depth = gsearch2.best_params_["max_depth"]
#opt_min_child_weight = gsearch2.best_params["min_child_weight"]

%%time 
optimized_n_est = 98
param_test2 = {
 'min_child_weight':[6,8,10,12,14,16,18,20,22,24]#range(1,6,2)
}
print "Making GridSearchCV object"
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='average_precision',n_jobs=1,iid=False, cv=5, verbose=1)
print "Fitting"
#gsearch2.fit(X,y)
#gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
#opt_min_child_weight = gsearch2.best_params_["min_child_weight"]


param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=5, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose=2)
#gsearch3.fit(X,y)
#print gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
#opt_gamma = gsearch3.best_params_["gamma"]


opt_max_depth = 1
opt_min_child_weight = 16
opt_gamma = 0
#optimized_n_est = 108
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight,
 gamma=opt_gamma,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
returns = modelfit(xgb2, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
print "Optimal n_estimators: "+str(returns[1].shape[0]) 
optimized_n_est_new = returns[1].shape[0] 



param_test4 = {
 'n_estimators':[optimized_n_est,optimized_n_est_new],
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=98, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose=3)
gsearch4.fit(X,y)
print gsearch4.grid_scores_


print gsearch4.best_params_, gsearch4.best_score_

param_test5 = {
 'subsample':[i/100.0 for i in range(65,80,5)],
 'colsample_bytree':[i/100.0 for i in range(65,80,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose=2)
gsearch5.fit(X,y)

print gsearch5.grid_scores_

print gsearch5.best_params_, gsearch5.best_score_

In [16]:
opt_max_depth = 1
opt_min_child_weight = 16
opt_gamma = 0
opt_subsample = .7
opt_colsample_bytree = .7

param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=opt_subsample, colsample_bytree=opt_colsample_bytree,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test6, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose = 2)
gsearch6.fit(X,y)

print gsearch6.grid_scores_

print gsearch6.best_params_, gsearch6.best_score_

param_test7 = {
 'reg_alpha':[.01, .05, .25, .5, .75, 1, 1.5, 2]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=opt_subsample, colsample_bytree=opt_colsample_bytree,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test7, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose = 2)
gsearch7.fit(X,y)

print gsearch7.grid_scores_

print gsearch7.best_params_, gsearch7.best_score_

param_test8 = {
 'reg_alpha':[.4, .45, .5, .55, .6]
}
gsearch8 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=opt_subsample, colsample_bytree=opt_colsample_bytree,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test8, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose = 2)
gsearch8.fit(X,y)

print gsearch8.grid_scores_

print gsearch8.best_params_, gsearch8.best_score_

In [17]:
opt_alpha = .5

%%time

xgb3 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight,
 gamma=opt_gamma,
 reg_alpha = opt_alpha,
 subsample=opt_subsample,
 colsample_bytree=opt_colsample_bytree,
 objective= 'binary:logistic',
 nthread=-1,
 scale_pos_weight=1,
 seed=27)
returns = modelfit(xgb3, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
print "Optimal n_estimators: "+str(returns[1].shape[0]) 
optimized_n_est_new_new = returns[1].shape[0]

%%time

xgb4 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight,
 gamma=opt_gamma,
 reg_alpha = opt_alpha,
 subsample=opt_subsample,
 colsample_bytree=opt_colsample_bytree,
 objective= 'binary:logistic',
 nthread=-1,
 scale_pos_weight=1,
 seed=27)
returns = modelfit(xgb4, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand,early_stopping_rounds = 500)
print "Optimal n_estimators: "+str(returns[1].shape[0])
optimized_n_est_new_new_new = returns[1].shape[0]

param_test9 = {
 'n_estimators':[100,200,300,400,500,600,700,800,900,1000,1100,1200,1300,1400,1500]
}
gsearch9 = GridSearchCV(estimator = XGBClassifier(n_estimators=1000, n_jobs=-1, 
                                                  random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5), 
param_grid = param_test9, scoring='average_precision',n_jobs=1,iid=False, cv=10,verbose = 2)
gsearch9.fit(X,y)
print gsearch9.grid_scores_
print gsearch9.best_params_, gsearch9.best_score_