In [3]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, average_precision_score, make_scorer
from sklearn.model_selection import StratifiedKFold,train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import scale
import xgboost as xgb

#import matplotlib.pylab as plt

#from matplotlib.pylab import rcParams
from sklearn import metrics   #Additional scklearn functions
#from sklearn.grid_search import 


#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

# Forwarded as `use_max_binding_score` to the negatives builders below:
# True  -> a position is a negative only when its "max_binding_score" is 0;
# False -> negatives are defined per ligand by "<ligand>_binding_score" == 0.
ABSOLUTE_NEGATIVES = False
# True  -> negatives are built with filter_to_ligand_binding_domains();
# False -> negatives are built with negatives_by_binding_score().
FILTER_DOMAIN = False

### Reading the input dataset

In [4]:
# IPython shell capture: curr_dir is the list of output lines of `pwd`,
# so curr_dir[0] is the notebook's working directory.
curr_dir = !pwd
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

# Alternative (unfiltered) input table, kept for reference
#input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
#filename = "positions_features_01.25.18.csv"

# Number of trailing columns that are excluded from the feature set below
bind_scores_num = 10

#Features table (tab-separated, first column used as the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
# Skips the first column and the last bind_scores_num columns
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#Ligand binding domains dictionary (indexed as negatives_dict[ligand] by the negatives builder)
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
        negatives_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [5]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    """
    Build the negative (non-binding) feature tables, restricted per ligand to
    the domains listed for it in the global ``negatives_dict``.

    For every ligand in the global ``ligands`` list, the rows of ``features_all``
    that belong to one of the ligand's domains and have a zero score are kept.
    An extra "all_ligands" entry holds the de-duplicated union of those rows
    whose "max_binding_score" is 0.

    Parameters
    ----------
    use_max_binding_score : bool
        If true, a row is a negative when "max_binding_score" == 0; otherwise
        when "<ligand>_binding_score" == 0.

    Returns
    -------
    dict
        ligand name (plus "all_ligands") -> DataFrame with the ``features_cols``
        columns only. A ligand without any surviving domain maps to an empty
        table that still has those columns.
    """
    ligands_negatives_df = {}
    for ligand in ligands:

        # Column whose zero value marks a position as non-binding
        if use_max_binding_score:
            score_col = "max_binding_score"
        else:
            score_col = ligand+"_binding_score"

        # Collect the per-domain slices and concatenate once: growing a
        # DataFrame with pd.concat inside the loop copies it on every iteration.
        domain_frames = []
        for domain in negatives_dict[ligand].keys():
            # Bookkeeping keys, not domain names
            if domain == 'negatives' or domain == 'domains':
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]

            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue

            domain_frames.append(domain_all.loc[domain_all.loc[:,score_col] == 0,:])

        if domain_frames:
            ligands_negatives_df[ligand] = pd.concat(domain_frames)
        else:
            # Empty table that keeps the columns, so the selections below still work
            ligands_negatives_df[ligand] = features_all.iloc[0:0]

    #Handling the ligand "all_ligands" (explicit order kept so the row order is unchanged)
    all_ligands_order = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "ion", "peptide", "metabolite"]
    all_ligands_negatives_df = pd.concat([ligands_negatives_df[name] for name in all_ligands_order])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df

    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand][features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))

    return ligands_negatives_df
            

In [6]:
def negatives_by_binding_score(use_max_binding_score):
    """
    Build the negative (non-binding) feature tables from the binding scores
    alone, without any domain filtering.

    For each ligand in the global ``ligands`` list the rows of ``features_all``
    with a zero score are kept: "max_binding_score" when
    ``use_max_binding_score`` is true, "<ligand>_binding_score" otherwise.
    The extra "all_ligands" entry always uses "max_binding_score".

    Returns a dict: ligand name -> DataFrame with the ``features_cols`` columns.
    """
    negatives = {}
    for name in ligands:
        score_col = "max_binding_score" if use_max_binding_score else name+"_binding_score"
        zero_score_rows = features_all[features_all[score_col] == 0]
        negatives[name] = zero_score_rows.loc[:,features_cols]
        print(name+" non-binding #:"+str(len(negatives[name])))

    #Handling the ligand "all_ligands"
    never_binding_rows = features_all[features_all["max_binding_score"] == 0]
    negatives["all_ligands"] = never_binding_rows.loc[:,features_cols]
    print("all_ligands non-binding #:"+str(len(negatives["all_ligands"])))

    return negatives

In [7]:
#Create negatives datasets
# FILTER_DOMAIN picks the builder; ABSOLUTE_NEGATIVES is forwarded as its
# use_max_binding_score argument.
if FILTER_DOMAIN:
    ligands_negatives_df = filter_to_ligand_binding_domains(bool(ABSOLUTE_NEGATIVES))
else:
    ligands_negatives_df = negatives_by_binding_score(bool(ABSOLUTE_NEGATIVES))

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [8]:
# Minimal binding score for a position to count as a positive example
bind_th = 0.1
ligands_features_df = {}

for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    is_binding = features_all[score_col_str] >= bind_th
    ligand_binding_df = features_all[is_binding]
    print(ligand+" #: "+str(ligand_binding_df.shape[0]))
    # Keep the feature columns only
    ligands_features_df[ligand] = ligand_binding_df.loc[:,features_cols]

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [9]:
# Stack the per-ligand positives (in this fixed order) and drop rows that are
# positives for more than one ligand.
all_ligands_features_df = pd.concat([
    ligands_features_df[_name]
    for _name in ("dna", "dnabase", "dnabackbone", "rna", "rnabase",
                  "rnabackbone", "ion", "peptide", "metabolite")
]).drop_duplicates()
print("all_ligands #: "+str(all_ligands_features_df.shape[0]))
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Models tested (and their hyper-parameters)

In [10]:
# Registry of candidate models, keyed by the name expected in the
# `classifier` environment variable (see classifier_method below).
classifiers = {}
classifiers["Logistic"] = LogisticRegression(C=0.001, random_state=0)
classifiers["RF"] = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)
#classifiers["RF"] = RandomForestRegressor(n_estimators=1000)
classifiers["KNN"] = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
#classifiers["KNN"] = KNeighborsRegressor(n_neighbors=100)
# probability=True is required because the evaluation code calls predict_proba
classifiers["SVM"] = SVC(kernel='rbf', probability=True, random_state=0)
# NOTE(review): despite its name, "ADA-RF" passes no base_estimator, so it does
# not boost the "RF" model above - confirm whether that is intended.
classifiers["ADA-RF"] = AdaBoostClassifier(n_estimators=1000, random_state=0)
# Boosts the same LogisticRegression object that is registered as "Logistic"
classifiers["ADA-Log"] = AdaBoostClassifier(base_estimator=classifiers["Logistic"], n_estimators=1000, random_state=0)
#classifiers["Bag-Log"] = BaggingClassifier(base_estimator=classifiers["Logistic"], n_estimators=1000, n_jobs=-1, random_state=0)
classifiers["XGB"] = XGBClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5)


### Downsamplers tested

In [11]:
#documentation on techniques: http://contrib.scikit-learn.org/imbalanced-learn/stable/under_sampling.html#cleaning-under-sampling-techniques
# Registry of under-samplers keyed by the name expected in the `down`
# environment variable. "InstanceHardnessThreshold" is a nested dict keyed by
# classifier name (created on first access by the defaultdict).
downsamplers = defaultdict(dict)

##Prototype generation##
downsamplers["ClusterCentroids"] = ClusterCentroids(random_state=0)

##Prototype selection##
#Controlled#
downsamplers["RandomUnderSampler"] = RandomUnderSampler(random_state=0)
for _key, _version in (("NearMiss3", 3), ("NearMiss2", 2), ("NearMiss1", 1)):
    downsamplers[_key] = NearMiss(random_state=0, version=_version)

#Cleaning#
for _key, _sampler_cls in (("TomekLinks", TomekLinks),
                           ("EditedNearestNeighbours", EditedNearestNeighbours),
                           ("RepeatedEditedNearestNeighbours", RepeatedEditedNearestNeighbours),
                           ("NeighbourhoodCleaningRule", NeighbourhoodCleaningRule)):
    downsamplers[_key] = _sampler_cls(random_state=0)

# Instance hardness threshold# - one sampler per estimator, sharing the
# estimator objects registered in `classifiers`
for _key in ("KNN", "SVM", "RF", "Logistic"):
    downsamplers["InstanceHardnessThreshold"][_key] = InstanceHardnessThreshold(random_state=0, estimator=classifiers[_key])

### Reading env input for downsampler technique, ligand and classifier  

In [12]:
# The run configuration comes from environment variables; every setting falls
# back to a default when its variable is unset. environ.get replaces the bare
# `except:` blocks, which would also have hidden unrelated errors.

#Reading the ligand input
ligand = environ.get('ligand', "dnabase")
print("ligand = "+ligand)

#Reading the downsampler input
downsample_method = environ.get('down', "NoDown")
print("downsample_method = "+downsample_method)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

ligand = dnabase
downsample_method = NoDown
classifier_method = XGB


### Test model functions

In [13]:
def test_model(pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_negatives_features, ligand_name, downsample_method, features=[], n_splits=10):
    """
    Cross-validate the classifier selected by the global ``classifier_method``
    on one ligand's positive and negative positions.

    Parameters
    ----------
    pred_dict : dict of lists
        Extended in place under the keys "obs", "prob", "fold" and "model" with
        one entry per held-out sample.
    auc_dict, auprc_dict : dict of lists
        The per-fold ROC AUC / precision-recall AUC are appended in place under
        the classifier's name.
    ligand_bind_features, ligand_negatives_features : pandas.DataFrame
        Feature rows labelled 1 (binding) and 0 (non-binding) respectively.
    ligand_name : str
        Only used in the progress message.
    downsample_method : str
        "NoDown" to train on the folds as they are, otherwise a key of the
        global ``downsamplers`` dictionary.
    features : sequence, optional
        Positional column selector passed to ``.iloc``; empty (the default,
        never mutated here) means all columns.
    n_splits : int, optional
        Number of stratified folds (default 10).

    Returns
    -------
    dict
        One key per entry of ``classifiers``; the selected classifier maps to
        the held-out feature rows stacked in fold order, all other keys to None.
    """

    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    #Arranging the features table by the CV order, for each model
    features_pred_dfs = dict.fromkeys(classifiers.keys())

    models_req_scaling = ["SVM", "KNN"]

    # Exactly one classifier - the one selected for this run - is evaluated
    classifier = classifier_method
    print(classifier)
    model = classifiers[classifier]

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    if classifier in models_req_scaling:
        # NOTE(review): scale() is fitted on the whole table before the folds are
        # split, so the held-out rows influence the scaling.
        X = pd.DataFrame(scale(X), index=X.index, columns=X.columns)

    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])

    binding_skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    # Scores of THIS run only; auc_dict/auprc_dict may already hold earlier runs
    fold_aucs = []
    fold_auprcs = []
    held_out_frames = []

    for pred_idx, (train_index, test_index) in enumerate(binding_skf.split(X, y), 1):
        print("fold #: "+str(pred_idx))
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]

        #Down-sample negative examples to have balanced classes
        if downsample_method == "NoDown":
            X_train_sampled = X_train
            y_train_sampled = y_train
        else:
            if downsample_method == "InstanceHardnessThreshold":
                downsampler = downsamplers[downsample_method][classifier]
            else:
                downsampler = downsamplers[downsample_method]

            X_train_sampled, y_train_sampled = downsampler.fit_sample(X_train, y_train)

        #fit to training data
        model.fit(X_train_sampled, y_train_sampled)

        # Probability of the positive (binding) class
        pos_probs = model.predict_proba(X_test)[:, 1]

        pred_dict["obs"].extend(y_test)
        pred_dict["prob"].extend(pos_probs)
        pred_dict["fold"].extend([pred_idx] * len(pos_probs))
        pred_dict["model"].extend([classifier] * len(pos_probs))

        #Update auc auprc dictionaries
        fold_auc = roc_auc_score(y_test, pos_probs)
        precision, recall, _ = precision_recall_curve(y_test, pos_probs)
        fold_auprc = auc(recall, precision)

        auc_dict[classifier].append(fold_auc)
        auprc_dict[classifier].append(fold_auprc)
        fold_aucs.append(fold_auc)
        fold_auprcs.append(fold_auprc)

        #Update features table (stacked once after the loop)
        held_out_frames.append(X_test)

        print("AUC = "+str(fold_auc))
        print("AUPRC = "+str(fold_auprc))

    features_pred_dfs[classifier] = pd.concat(held_out_frames)

    # Mean over the folds actually run, instead of a hard-coded division by 10
    avg_auc = np.mean(fold_aucs)
    print("avg auc = "+str(avg_auc))

    avg_auprc = np.mean(fold_auprcs)
    print("avg auprc = "+str(avg_auprc))

    # Report the ligand this call was given, not the global `ligand`
    print("Finished "+ligand_name+" "+classifier)

    return features_pred_dfs

In [14]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Join the held-out feature rows with their predictions into one table.

    Parameters
    ----------
    ligand : str
        Ligand name; only needed by the (disabled) save path below.
    ordered_features : dict
        Classifier name -> held-out feature rows in CV order, as returned by
        test_model().
    pred_df : pandas.DataFrame
        Prediction table with a "model" column; ``pred_df`` itself is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Features and predictions side by side for the classifier selected by
        the global ``classifier_method``. (Previously the table was built and
        then discarded.)
    """
    classifier = classifier_method

    # Work on a copy so re-indexing never touches the caller's frame
    model_pred = pred_df[pred_df["model"] == classifier].copy(deep=True)
    # Predictions are stored in CV order, i.e. row-aligned with the features
    model_pred.index = ordered_features[classifier].index

    #Creating the combined table
    features_pred = pd.concat([ordered_features[classifier], model_pred], axis=1)

    #Saving
    #features_pred.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/features_pred_tables/"+ligand+"_"+classifier+"_features_pred.csv", sep='\t')

    return features_pred

#### Predict for each ligand separately

In [15]:
#%%time

#for ligand in ligands:
#    print ligand

#Initialize dictionary
#pred_dict = defaultdict(list)
#auc_dict = defaultdict(list)
#auprc_dict = defaultdict(list)

#ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

#pred_df = pd.DataFrame.from_dict(pred_dict)
#auc_df = pd.DataFrame.from_dict(auc_dict)
#auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
#combine_features_predictions(ligand, ordered_features, pred_df)

#print "Finished ligand "+ligand

In [20]:
def modelfit(alg, ligand_bind_features, ligand_negatives_features, ligand_name, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    
    print "modelfit"
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(X, label=y)
    #print alg.get_params()['n_estimators']
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds, 
                      metrics='map', early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cvresult.shape[0])
    print "Optimal n_estimators: " + str(cvresult.shape[0])
    
    #Fit the algorithm on the data
    #print "fitting"
    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.25)
    #print X_train
    %time alg.fit(X_train, y_train,eval_metric='map')
        
    #Predict training set:
    dtrain_predictions = alg.predict(X_train)
    dtrain_predprob = alg.predict_proba(X_train)[:,1]
    
    #Predict test set:
    #probs = alg.predict_proba(X_test)
    
    #Print model report:
    #print "\nModel Report"
    #auc_score = roc_auc_score(y_test, probs[:, 1])
    #print y_test
    #print probs[:, 1]
    #precision , recall, _ = precision_recall_curve(y_test, probs[:, 1])
    #auprc = auc(recall, precision)    

    #Print model report:
    print "\nModel Report"
    print "Accuracy(Train): %.4g" % metrics.accuracy_score(y_train, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(y_train, dtrain_predprob)
    print "Average Precision: %.4g" % metrics.average_precision_score(y_train, dtrain_predprob)
    #print "AUC (Test) = "+str(auc_score)
    #print "AUPRC (Test) = "+str(auprc)
    """               
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    """
    return alg,cvresult#,dtrain_predictions,dtrain_predprob,alg


In [17]:
def modelfit_test(clf,ligand_bind_features, ligand_negatives_features, ligand_name, features=[]):
    """
    Test different models in 10-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Arranging the features table by the CV order, for each model
    features_pred_dfs = dict.fromkeys(classifiers.keys())
    
    #models_req_scaling = ["SVM", "KNN"]
    
    classifier = classifier_method
    #model = classifiers[classifier]
    model = clf
    features_pred_dfs[classifier] = pd.DataFrame()

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.25)
    
    #binding_skf = StratifiedKFold(n_splits=1, shuffle=True, random_state=0)
    pred_idx = 1

    #for train_index, test_index in binding_skf.split(X, y):
        #print "fold #: "+str(pred_idx)
        #X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        #y_train, y_test = y[train_index], y[test_index]
    X_train_sampled = X_train
    y_train_sampled = y_train
        
    #fit to training data
    %time model.fit(X_train_sampled, y_train_sampled)
    probs_list = []
        
    probs = model.predict_proba(X_test)
    for l in probs:
        probs_list.append(l[1])
        
    auc_score = roc_auc_score(y_test, probs[:, 1])
    #print y_test
    #print probs[:, 1]
    precision , recall, _ = precision_recall_curve(y_test, probs[:, 1])
    auprc = auc(recall, precision)    
    
    dtrain_predictions = model.predict(X_train)
    dtrain_predprob = model.predict_proba(X_train)[:,1]

    #Print model report:
    print "\nModel Report"
    print "Accuracy(Train): %.4g" % metrics.accuracy_score(y_train, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(y_train, dtrain_predprob)
    print "AUC (Test) = "+str(auc_score)
    print "AUPRC (Test) = "+str(auprc)
    
    
    #feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')
                                  
    return features_pred_dfs,model

In [21]:
%%time
#Choose all predictors except target & IDcols

#%matplotlib inline
#rcParams['figure.figsize'] = 12, 4

"""
ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
features = features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
train = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

y = [1] * ligand_bind_features.shape[0]
y.extend([0] * ligand_negatives_features.shape[0])
y = np.array(y)
train = train.assign(Disbursed=y)
target = 'Disbursed'
IDcol = 'ID'
predictors = [x for x in train.columns if x not in [target, IDcol]]
"""

# Baseline XGBoost model; modelfit() replaces n_estimators with the number of
# boosting rounds selected by cross-validation.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
print("about to run")
returns = modelfit(xgb1, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
# returns[1] is the CV result table; its row count is the selected round count
optimized_n_est = returns[1].shape[0]
print("Optimal n_estimators: "+str(optimized_n_est))

about to run
modelfit
Optimal n_estimators: 108
CPU times: user 1min 23s, sys: 354 ms, total: 1min 23s
Wall time: 21.9 s

Model Report
Accuracy(Train): 0.9996
AUC Score (Train): 1.000000
Average Precision: 1
Optimal n_estimators: 108
CPU times: user 12min 51s, sys: 2.29 s, total: 12min 54s
Wall time: 3min 20s


  if diff:


In [19]:
# Full design matrix and labels for the selected ligand, used by the grid searches
ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
# All-True mask: every feature column is kept
features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
X = pd.concat([
    ligand_bind_features.iloc[:,features],
    ligand_negatives_features.iloc[:,features],
])
# Positives (1) come first, then negatives (0), matching the row order of X
y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])

In [19]:
%%time

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
print "Making GridSearchCV object"
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27, 
 param_grid = param_test1, scoring='average_precision',n_jobs=1,iid=False, cv=5, verbose=3))
print "Fitting"
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Making GridSearchCV object
Fitting
CPU times: user 147 ms, sys: 71 ms, total: 218 ms
Wall time: 218 ms


In [26]:
# Show the cross-validated score of every candidate of the first grid search
gsearch1.grid_scores_



[mean: 0.14453, std: 0.18337, params: {'max_depth': 3, 'min_child_weight': 1},
 mean: 0.14071, std: 0.18196, params: {'max_depth': 3, 'min_child_weight': 3},
 mean: 0.15011, std: 0.17757, params: {'max_depth': 3, 'min_child_weight': 5},
 mean: 0.10233, std: 0.13135, params: {'max_depth': 5, 'min_child_weight': 1},
 mean: 0.08294, std: 0.06544, params: {'max_depth': 5, 'min_child_weight': 3},
 mean: 0.11572, std: 0.11639, params: {'max_depth': 5, 'min_child_weight': 5},
 mean: 0.08769, std: 0.09143, params: {'max_depth': 7, 'min_child_weight': 1},
 mean: 0.08281, std: 0.06587, params: {'max_depth': 7, 'min_child_weight': 3},
 mean: 0.09790, std: 0.11390, params: {'max_depth': 7, 'min_child_weight': 5},
 mean: 0.08695, std: 0.07276, params: {'max_depth': 9, 'min_child_weight': 1},
 mean: 0.08511, std: 0.07007, params: {'max_depth': 9, 'min_child_weight': 3},
 mean: 0.09251, std: 0.09015, params: {'max_depth': 9, 'min_child_weight': 5}]

In [20]:
%%time 
optimized_n_est = 98
param_test2 = {
 'max_depth':[1,2,3,4],#range(3,10,2),
 'min_child_weight':[4,5,6]#range(1,6,2)
}
print "Making GridSearchCV object"
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='average_precision',n_jobs=1,iid=False, cv=5, verbose=10)
print "Fitting"
gsearch2.fit(X,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
opt_max_depth = gsearch2.best_params_["max_depth"]
opt_min_child_weight = gsearch2.best_params["min_child_weight"]

Making GridSearchCV object
Fitting
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] max_depth=1, min_child_weight=4 .................................
[CV]  max_depth=1, min_child_weight=4, score=0.683692652021, total=   6.4s
[CV] max_depth=1, min_child_weight=4 .................................
[CV]  max_depth=1, min_child_weight=4, score=0.035514267901, total=   6.5s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.0s remaining:    0.0s



[CV] max_depth=1, min_child_weight=4 .................................
[CV]  max_depth=1, min_child_weight=4, score=0.0369951634474, total=   6.3s
[CV] max_depth=1, min_child_weight=4 .................................
[CV]  max_depth=1, min_child_weight=4, score=0.159002309132, total=   6.3s

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   22.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   29.7s remaining:    0.0s



[CV] max_depth=1, min_child_weight=4 .................................
[CV]  max_depth=1, min_child_weight=4, score=0.0415413885066, total=   6.4s
[CV] max_depth=1, min_child_weight=5 .................................
[CV]  max_depth=1, min_child_weight=5, score=0.68972048855, total=   6.3s

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   44.6s remaining:    0.0s



[CV] max_depth=1, min_child_weight=5 .................................
[CV]  max_depth=1, min_child_weight=5, score=0.0346486226579, total=   6.3s
[CV] max_depth=1, min_child_weight=5 .................................
[CV]  max_depth=1, min_child_weight=5, score=0.0384724467929, total=   6.4s

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   52.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   59.4s remaining:    0.0s



[CV] max_depth=1, min_child_weight=5 .................................
[CV]  max_depth=1, min_child_weight=5, score=0.148916296532, total=   6.2s
[CV] max_depth=1, min_child_weight=5 .................................
[CV]  max_depth=1, min_child_weight=5, score=0.0571348192738, total=   6.2s
[CV] max_depth=1, min_child_weight=6 .................................
[CV]  max_depth=1, min_child_weight=6, score=0.684079833265, total=   6.1s
[CV] max_depth=1, min_child_weight=6 .................................
[CV]  max_depth=1, min_child_weight=6, score=0.0343248050016, total=   6.2s
[CV] max_depth=1, min_child_weight=6 .................................
[CV]  max_depth=1, min_child_weight=6, score=0.0456784784749, total=   6.3s
[CV] max_depth=1, min_child_weight=6 .................................
[CV]  max_depth=1, min_child_weight=6, score=0.145841281561, total=   6.4s
[CV] max_depth=1, min_child_weight=6 .................................
[CV]  max_depth=1, min_child_weight=6, score=0.08

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.1min remaining:    0.0s
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 11.8min finished



CPU times: user 39min 20s, sys: 21.8 s, total: 39min 41s
Wall time: 11min 54s




In [29]:
# Keep the best tree depth of the max_depth / min_child_weight search for the next cells
opt_max_depth = gsearch2.best_params_["max_depth"]
# Echo every candidate's CV score together with the winning parameters
gsearch2.grid_scores_,gsearch2.best_params_



([mean: 0.19135, std: 0.25060, params: {'max_depth': 1, 'min_child_weight': 4},
  mean: 0.19378, std: 0.25143, params: {'max_depth': 1, 'min_child_weight': 5},
  mean: 0.19904, std: 0.24564, params: {'max_depth': 1, 'min_child_weight': 6},
  mean: 0.15009, std: 0.18235, params: {'max_depth': 2, 'min_child_weight': 4},
  mean: 0.15300, std: 0.17838, params: {'max_depth': 2, 'min_child_weight': 5},
  mean: 0.16188, std: 0.21720, params: {'max_depth': 2, 'min_child_weight': 6},
  mean: 0.15871, std: 0.18971, params: {'max_depth': 3, 'min_child_weight': 4},
  mean: 0.15011, std: 0.17757, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.15875, std: 0.18304, params: {'max_depth': 3, 'min_child_weight': 6},
  mean: 0.14088, std: 0.14942, params: {'max_depth': 4, 'min_child_weight': 4},
  mean: 0.14032, std: 0.15410, params: {'max_depth': 4, 'min_child_weight': 5},
  mean: 0.11960, std: 0.13783, params: {'max_depth': 4, 'min_child_weight': 6}],
 {'max_depth': 1, 'min_child_weight': 6

In [42]:
%%time 
optimized_n_est = 98
param_test2 = {
 'min_child_weight':[6,8,10,12,14,16,18,20,22,24]#range(1,6,2)
}
print "Making GridSearchCV object"
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test2, scoring='average_precision',n_jobs=1,iid=False, cv=5, verbose=1)
print "Fitting"
gsearch2.fit(X,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
opt_min_child_weight = gsearch2.best_params_["min_child_weight"]

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.0min finished


Making GridSearchCV object
Fitting
Fitting 5 folds for each of 10 candidates, totalling 50 fits


AttributeError: 'GridSearchCV' object has no attribute 'best_params'

In [46]:
# Show the cross-validated score of every min_child_weight candidate
gsearch2.grid_scores_



[mean: 0.19904, std: 0.24564, params: {'min_child_weight': 6},
 mean: 0.14365, std: 0.10743, params: {'min_child_weight': 8},
 mean: 0.16635, std: 0.12464, params: {'min_child_weight': 10},
 mean: 0.15985, std: 0.10408, params: {'min_child_weight': 12},
 mean: 0.22473, std: 0.21265, params: {'min_child_weight': 14},
 mean: 0.23041, std: 0.24218, params: {'min_child_weight': 16},
 mean: 0.22585, std: 0.24681, params: {'min_child_weight': 18},
 mean: 0.22531, std: 0.25044, params: {'min_child_weight': 20},
 mean: 0.22125, std: 0.24620, params: {'min_child_weight': 22},
 mean: 0.22623, std: 0.24473, params: {'min_child_weight': 24}]

In [None]:
# Manually pinned tuning values, so the following cells can run without
# repeating the grid searches.
# NOTE(review): optimized_n_est = 98 differs from the 108 printed by the
# modelfit cell's captured output - confirm which value is intended.
opt_max_depth = 1
opt_min_child_weight = 16
optimized_n_est = 98

In [22]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=optimized_n_est, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=5, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose=2)
gsearch3.fit(X,y)
print gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
opt_gamma = gsearch3.best_params_["gamma"]

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] gamma=0.0 .......................................................
[CV] ........................................ gamma=0.0, total=   5.4s
[CV] gamma=0.0 .......................................................
[CV] ........................................ gamma=0.0, total=   5.2s
[CV] gamma=0.0 .......................................................
[CV] ........................................ gamma=0.0, total=   5.2s
[CV] gamma=0.0 .......................................................
[CV] ........................................ gamma=0.0, total=   5.3s
[CV] gamma=0.0 .......................................................
[CV] ........................................ gamma=0.0, total=   5.2s
[CV] gamma=0.1 .......................................................
[CV] ........................................ gamma=0.1, total=   5.2s
[CV] gamma=0.1 .......................................................
[CV] ............

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  2.6min finished



[mean: 0.23041, std: 0.24218, params: {'gamma': 0.0}, mean: 0.23041, std: 0.24218, params: {'gamma': 0.1}, mean: 0.23041, std: 0.24218, params: {'gamma': 0.2}, mean: 0.23041, std: 0.24218, params: {'gamma': 0.3}, mean: 0.23041, std: 0.24218, params: {'gamma': 0.4}] {'gamma': 0.0} 0.23041349903421232




In [25]:
# Only the last expression of a cell is displayed, so this cell shows opt_gamma;
# the grid_scores_ expression on the first line is evaluated but not shown.
gsearch3.grid_scores_
opt_gamma



0.0

In [32]:
# Pin gamma by hand; 0 matches the best value reported by the gamma grid search ({'gamma': 0.0}).
opt_gamma = 0



[mean: 0.06643, std: 0.02815, params: {'max_depth': 3, 'min_child_weight': 1},
 mean: 0.06781, std: 0.02693, params: {'max_depth': 3, 'min_child_weight': 3},
 mean: 0.06845, std: 0.02790, params: {'max_depth': 3, 'min_child_weight': 5},
 mean: 0.06256, std: 0.02220, params: {'max_depth': 5, 'min_child_weight': 1},
 mean: 0.06280, std: 0.02203, params: {'max_depth': 5, 'min_child_weight': 3},
 mean: 0.06606, std: 0.02392, params: {'max_depth': 5, 'min_child_weight': 5},
 mean: 0.06009, std: 0.02191, params: {'max_depth': 7, 'min_child_weight': 1},
 mean: 0.06245, std: 0.02378, params: {'max_depth': 7, 'min_child_weight': 3},
 mean: 0.06526, std: 0.02807, params: {'max_depth': 7, 'min_child_weight': 5},
 mean: 0.06134, std: 0.02261, params: {'max_depth': 9, 'min_child_weight': 1},
 mean: 0.06147, std: 0.02359, params: {'max_depth': 9, 'min_child_weight': 3},
 mean: 0.06347, std: 0.02713, params: {'max_depth': 9, 'min_child_weight': 5}]

In [20]:
%%time

ligand_bind_features = ligands_features_df[ligand]
ligand_negatives_features = ligands_negatives_df[ligand]
features = features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
y = [1] * ligand_bind_features.shape[0]
y.extend([0] * ligand_negatives_features.shape[0])
y = np.array(y)


param_test2 = {
    #'max_depth':[1],
    #'min_child_weight':range(1,6,2),
    'objective':['reg:linear','binary:logistic']
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(n_estimators=1000, n_jobs=5, random_state=0, max_depth=6, min_child_weight=0, colsample_bytree=0.5), 
 param_grid = param_test2, scoring='average_precision',n_jobs=1,iid=False, cv=5, verbose = 10)
gsearch4.fit(X,y)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] objective=reg:linear ............................................
[CV] ...... objective=reg:linear, score=0.0674008103229, total= 1.7min
[CV] objective=reg:linear ............................................
[CV] ...... objective=reg:linear, score=0.0206680681253, total= 1.7min

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.4min remaining:    0.0s



[CV] objective=reg:linear ............................................
[CV] ...... objective=reg:linear, score=0.0168567222175, total= 1.7min
[CV] objective=reg:linear ............................................
[CV] ...... objective=reg:linear, score=0.0317870364368, total= 1.7min

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  6.9min remaining:    0.0s



[CV] objective=reg:linear ............................................
[CV] ...... objective=reg:linear, score=0.0527066819977, total= 1.7min
[CV] objective=binary:logistic .......................................
[CV] .. objective=binary:logistic, score=0.291835621816, total= 1.8min

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 10.5min remaining:    0.0s



[CV] objective=binary:logistic .......................................
[CV] . objective=binary:logistic, score=0.0242029477699, total= 1.8min
[CV] objective=binary:logistic .......................................
[CV] . objective=binary:logistic, score=0.0218292191785, total= 1.8min

[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 12.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 14.1min remaining:    0.0s



[CV] objective=binary:logistic .......................................
[CV] . objective=binary:logistic, score=0.0754873503236, total= 1.8min
[CV] objective=binary:logistic .......................................
[CV] . objective=binary:logistic, score=0.0685046548804, total= 1.8min

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 15.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 17.7min finished



CPU times: user 1h 38min 4s, sys: 4.86 s, total: 1h 38min 9s
Wall time: 20min 1s


In [26]:
xgb2 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight,
 gamma=opt_gamma,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
returns = modelfit(xgb2, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)
print "Optimal n_estimators: "+str(returns[1].shape[0]) 
optimized_n_est_new = returns[1].shape[0] 

modelfit
Optimal n_estimators: 553
CPU times: user 1min 35s, sys: 206 ms, total: 1min 35s
Wall time: 25 s

Model Report
Accuracy(Train): 0.9957
AUC Score (Train): 0.971171
Optimal n_estimators: 553


  if diff:


In [28]:
param_test4 = {
 'n_estimators':[optimized_n_est,optimized_n_est_new],
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=98, max_depth=opt_max_depth,
 min_child_weight=opt_min_child_weight, gamma=opt_gamma, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='average_precision',n_jobs=1,iid=False, cv=5,verbose=3)
gsearch4.fit(X,y)
print gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] n_estimators=98, subsample=0.6, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.6, colsample_bytree=0.6, score=0.694812607951, total=   5.5s
[CV] n_estimators=98, subsample=0.6, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.6, colsample_bytree=0.6, score=0.0585643227998, total=   5.4s

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.0s remaining:    0.0s



[CV] n_estimators=98, subsample=0.6, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.6, colsample_bytree=0.6, score=0.12467589588, total=   5.3s
[CV] n_estimators=98, subsample=0.6, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.6, colsample_bytree=0.6, score=0.150642010309, total=   5.4s
[CV] n_estimators=98, subsample=0.6, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.6, colsample_bytree=0.6, score=0.101328776833, total=   5.4s
[CV] n_estimators=98, subsample=0.7, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.7, colsample_bytree=0.6, score=0.692534304121, total=   5.5s
[CV] n_estimators=98, subsample=0.7, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.7, colsample_bytree=0.6, score=0.0416589623288, total=   5.6s
[CV] n_estimators=98, subsample=0.7, colsample_bytree=0.6 ............
[CV]  n_estimators=98, subsample=0.7, colsample_bytree=0.6, score=0.165510288351, total=   5.

[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed: 45.7min finished



[mean: 0.22600, std: 0.23635, params: {'n_estimators': 98, 'subsample': 0.6, 'colsample_bytree': 0.6}, mean: 0.22903, std: 0.23666, params: {'n_estimators': 98, 'subsample': 0.7, 'colsample_bytree': 0.6}, mean: 0.22729, std: 0.24076, params: {'n_estimators': 98, 'subsample': 0.8, 'colsample_bytree': 0.6}, mean: 0.22090, std: 0.21820, params: {'n_estimators': 98, 'subsample': 0.9, 'colsample_bytree': 0.6}, mean: 0.16893, std: 0.14707, params: {'n_estimators': 553, 'subsample': 0.6, 'colsample_bytree': 0.6}, mean: 0.13921, std: 0.10062, params: {'n_estimators': 553, 'subsample': 0.7, 'colsample_bytree': 0.6}, mean: 0.12238, std: 0.08500, params: {'n_estimators': 553, 'subsample': 0.8, 'colsample_bytree': 0.6}, mean: 0.11535, std: 0.06931, params: {'n_estimators': 553, 'subsample': 0.9, 'colsample_bytree': 0.6}, mean: 0.22203, std: 0.24960, params: {'n_estimators': 98, 'subsample': 0.6, 'colsample_bytree': 0.7}, mean: 0.22743, std: 0.23888, params: {'n_estimators': 98, 'subsample': 0.7, 

In [29]:
 # Show the winning n_estimators / subsample / colsample_bytree combination found by gsearch4.
 gsearch4.best_params_

{'colsample_bytree': 0.7, 'n_estimators': 98, 'subsample': 0.8}

In [30]:
# Show the mean/std CV score of every parameter combination tried by gsearch4.
gsearch4.grid_scores_



[mean: 0.22600, std: 0.23635, params: {'n_estimators': 98, 'subsample': 0.6, 'colsample_bytree': 0.6},
 mean: 0.22903, std: 0.23666, params: {'n_estimators': 98, 'subsample': 0.7, 'colsample_bytree': 0.6},
 mean: 0.22729, std: 0.24076, params: {'n_estimators': 98, 'subsample': 0.8, 'colsample_bytree': 0.6},
 mean: 0.22090, std: 0.21820, params: {'n_estimators': 98, 'subsample': 0.9, 'colsample_bytree': 0.6},
 mean: 0.16893, std: 0.14707, params: {'n_estimators': 553, 'subsample': 0.6, 'colsample_bytree': 0.6},
 mean: 0.13921, std: 0.10062, params: {'n_estimators': 553, 'subsample': 0.7, 'colsample_bytree': 0.6},
 mean: 0.12238, std: 0.08500, params: {'n_estimators': 553, 'subsample': 0.8, 'colsample_bytree': 0.6},
 mean: 0.11535, std: 0.06931, params: {'n_estimators': 553, 'subsample': 0.9, 'colsample_bytree': 0.6},
 mean: 0.22203, std: 0.24960, params: {'n_estimators': 98, 'subsample': 0.6, 'colsample_bytree': 0.7},
 mean: 0.22743, std: 0.23888, params: {'n_estimators': 98, 'subsample

In [2]:
# Inspect the container type of grid_scores_. Requires gsearch4 to exist in the
# running kernel: the recorded run raised NameError because it was not defined.
type(gsearch4.grid_scores_)

NameError: name 'gsearch4' is not defined