In [22]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import scale
import xgboost as xgb

#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.display import HTML, display

#Widen the notebook container to the full browser width.
#The HTML object must be handed to display() explicitly: a bare expression is rendered
#only when it is the last statement of a cell, which is not the case here.
display(HTML("<style>.container { width:100% !important; }</style>"))

#Flags selecting how the negative examples are built (read by the "Create negatives datasets" cell)
#ABSOLUTE_NEGATIVES: True  - a negative is a position whose "max_binding_score" is 0
#                    False - a negative is a position whose score for the tested ligand is 0
ABSOLUTE_NEGATIVES = False
#FILTER_DOMAIN: True  - negatives are taken only from the domains listed in negatives_dict
#               False - negatives are taken from all the positions of the features table
FILTER_DOMAIN = False

### Reading the input dataset

In [3]:
curr_dir = !pwd
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
#filename = "positions_features_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#lignd binding domains dictionary
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
        negatives_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [4]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    """
    Build the negatives (non-binding positions) table of every ligand, restricted to
    the domains listed for that ligand in negatives_dict.

    use_max_binding_score: True  - a position is a negative when its "max_binding_score" is 0
                           False - a position is a negative when its "<ligand>_binding_score" is 0

    Returns a dictionary: ligand name (plus "all_ligands") -> DataFrame holding only features_cols.
    Reads the globals features_all, features_cols, ligands and negatives_dict.
    """
    #negatives_dict entries that are skipped when iterating over the domains
    reserved_keys = ('negatives', 'domains')

    ligands_negatives_df = {}
    for ligand in ligands:
        if use_max_binding_score:
            score_col = "max_binding_score"
        else:
            score_col = ligand+"_binding_score"

        #Collect the per-domain pieces and concatenate once
        #(growing a DataFrame with concat inside the loop copies all the rows on every iteration)
        domain_negatives = []
        for domain in negatives_dict[ligand].keys():
            if domain in reserved_keys:
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]

            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue

            domain_negatives.append(domain_all.loc[domain_all.loc[:,score_col] == 0,:])

        if domain_negatives:
            ligands_negatives_df[ligand] = pd.concat(domain_negatives)
        else:
            #No domain is left for this ligand: keep an empty table that still has all the columns,
            #so the column selections below do not raise a KeyError
            ligands_negatives_df[ligand] = features_all.iloc[0:0]

    #Handling the ligand "all_ligands"
    #(the concatenation order is kept as it was, since it sets the rows order of the combined table)
    all_ligands_negatives_df = pd.concat([
        ligands_negatives_df["dna"], ligands_negatives_df["dnabase"], ligands_negatives_df["dnabackbone"],
        ligands_negatives_df["rna"], ligands_negatives_df["rnabase"], ligands_negatives_df["rnabackbone"],
        ligands_negatives_df["ion"], ligands_negatives_df["peptide"], ligands_negatives_df["metabolite"],
    ])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df

    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand][features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))

    return ligands_negatives_df
            

In [5]:
def negatives_by_binding_score(use_max_binding_score):
    """
    Collect, for every ligand, the positions of features_all whose binding score is exactly 0.

    use_max_binding_score: True  - the "max_binding_score" column is tested for every ligand
                           False - the ligand's own "<ligand>_binding_score" column is tested

    Returns a dictionary mapping each ligand (and "all_ligands") to a DataFrame of features_cols.
    Reads the globals features_all, features_cols and ligands.
    """
    def zero_score_features(score_col):
        #Rows with a zero value in score_col, reduced to the features columns
        return features_all.loc[features_all[score_col] == 0, features_cols]

    negatives_by_ligand = {}
    for ligand in ligands:
        score_col = "max_binding_score" if use_max_binding_score else ligand+"_binding_score"
        negatives_by_ligand[ligand] = zero_score_features(score_col)
        print(ligand+" non-binding #:"+str(len(negatives_by_ligand[ligand])))

    #The ligand "all_ligands" always goes by the maximal binding score
    negatives_by_ligand["all_ligands"] = zero_score_features("max_binding_score")
    print("all_ligands non-binding #:"+str(len(negatives_by_ligand["all_ligands"])))

    return negatives_by_ligand

In [6]:
#Create negatives datasets
#FILTER_DOMAIN selects the builder function, ABSOLUTE_NEGATIVES is forwarded as its use_max_binding_score flag
if FILTER_DOMAIN:
    ligands_negatives_df = filter_to_ligand_binding_domains(bool(ABSOLUTE_NEGATIVES))
else:
    ligands_negatives_df = negatives_by_binding_score(bool(ABSOLUTE_NEGATIVES))

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [7]:
#A position is a positive example of a ligand when its binding score for that ligand is at least bind_th.
#Positions scoring strictly between 0 and bind_th are not positives here, and they are not
#negatives either (the negatives tables require a score of exactly 0).
bind_th = 0.1
ligands_features_df = {}

for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    ligand_binding_df = features_all[features_all[score_col_str] >= bind_th]
    print(ligand+" #: "+str(ligand_binding_df.shape[0]))
    #Keeping just the features columns
    ligands_features_df[ligand] = ligand_binding_df.loc[:,features_cols]

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [8]:
#Union of the positive positions of all the ligands
#(the concatenation order sets the rows order of the combined table)
all_ligands_features_df = pd.concat([
    ligands_features_df["dna"], ligands_features_df["dnabase"], ligands_features_df["dnabackbone"],
    ligands_features_df["rna"], ligands_features_df["rnabase"], ligands_features_df["rnabackbone"],
    ligands_features_df["ion"], ligands_features_df["peptide"], ligands_features_df["metabolite"],
])
#A position that binds several ligands appears once per ligand - keep a single copy.
#NOTE(review): drop_duplicates() compares the feature values, not the index, so two different
#positions with identical feature vectors are merged as well - confirm this is intended.
all_ligands_features_df = all_ligands_features_df.drop_duplicates()
print("all_ligands #: "+str(all_ligands_features_df.shape[0]))
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Models tested (and their hyper-parameters)

In [41]:
#Models dictionary: model name (the value expected in classifier_method) -> unfitted estimator

#Baseline models
classifiers = {
    "Logistic": LogisticRegression(C=0.001, random_state=0),
    "RF": RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0),
    #"RF": RandomForestRegressor(n_estimators=1000),
    "KNN": KNeighborsClassifier(n_neighbors=100, n_jobs=-1),
    #"KNN": KNeighborsRegressor(n_neighbors=100),
    "SVM": SVC(kernel='rbf', probability=True, random_state=0),
    #AdaBoost without an explicit base_estimator
    "ADA-RF": AdaBoostClassifier(n_estimators=1000, random_state=0),
}
#AdaBoost over the logistic regression model defined above
classifiers["ADA-Log"] = AdaBoostClassifier(
    base_estimator=classifiers["Logistic"], n_estimators=1000, random_state=0)
#classifiers["Bag-Log"] = BaggingClassifier(base_estimator=classifiers["Logistic"], n_estimators=1000, n_jobs=-1, random_state=0)

#XGBoost variants
classifiers.update({
    #1000 trees of depth 6 - the three variants differ only in min_child_weight
    "XGB": XGBClassifier(
        n_estimators=1000, n_jobs=-1, random_state=0,
        max_depth=6, min_child_weight=0.05, colsample_bytree=0.5),
    "XGB0": XGBClassifier(
        n_estimators=1000, n_jobs=-1, random_state=0,
        max_depth=6, min_child_weight=0, colsample_bytree=0.5),
    "XGB1": XGBClassifier(
        n_estimators=1000, n_jobs=-1, random_state=0,
        max_depth=6, min_child_weight=1, colsample_bytree=0.5),

    #Shallow trees (depth 3) - the variants differ in n_estimators / learning_rate
    "XGBTuned1": XGBClassifier(
        n_estimators=108, learning_rate=0.1, n_jobs=-1, random_state=0,
        max_depth=3, min_child_weight=16, colsample_bytree=0.7, subsample=0.7,
        gamma=0, reg_alpha=0.5, scale_pos_weight=1),
    "XGBTuned2": XGBClassifier(
        n_estimators=268, learning_rate=0.1, n_jobs=-1, random_state=0,
        max_depth=3, min_child_weight=16, colsample_bytree=0.7, subsample=0.7,
        gamma=0, reg_alpha=0.5, scale_pos_weight=1),
    "XGBTuned3": XGBClassifier(
        n_estimators=86, learning_rate=0.01, n_jobs=-1, random_state=0,
        max_depth=3, min_child_weight=16, colsample_bytree=0.7, subsample=0.7,
        gamma=0, reg_alpha=0.5, scale_pos_weight=1),

    #190 trees - the variants differ in max_depth, min_child_weight and gamma
    "XGBUntuned4Dna": XGBClassifier(
        learning_rate=0.1, n_estimators=190, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=5, scale_pos_weight=1, random_state=0),
    "XGBTuned4Dna": XGBClassifier(
        learning_rate=0.1, n_estimators=190, max_depth=7, min_child_weight=4,
        gamma=0.6, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=5, scale_pos_weight=1, random_state=0),

    #scale_pos_weight = 76 variants
    "XGBSPW=76": XGBClassifier(
        n_estimators=1000, n_jobs=-1, random_state=0,
        max_depth=6, min_child_weight=0.05, colsample_bytree=0.5,
        scale_pos_weight=76),
    "XGBSPW=76Tuned": XGBClassifier(
        n_estimators=190, random_state=0,
        max_depth=9, min_child_weight=0, colsample_bytree=0.5,
        scale_pos_weight=76, gamma=0.2),
    "XGBSPW=76Tuned2": XGBClassifier(
        n_estimators=190, n_jobs=-1, random_state=0,
        max_depth=9, min_child_weight=0, colsample_bytree=0.5,
        scale_pos_weight=76, gamma=0.1),
})

### Downsamplers tested

In [10]:
#documentation on techniques: http://contrib.scikit-learn.org/imbalanced-learn/stable/under_sampling.html#cleaning-under-sampling-techniques
#Downsamplers dictionary: method name (the value expected in downsample_method) -> sampler.
#"InstanceHardnessThreshold" is the exception: it holds a nested dictionary keyed by classifier name.
downsamplers = defaultdict(dict)
downsamplers.update({
    ##Prototype generation##
    "ClusterCentroids": ClusterCentroids(random_state=0),

    ##Prototype selection##
    #Controlled#
    "RandomUnderSampler": RandomUnderSampler(random_state=0),
    "NearMiss3": NearMiss(random_state=0, version=3),
    "NearMiss2": NearMiss(random_state=0, version=2),
    "NearMiss1": NearMiss(random_state=0, version=1),

    #Cleaning#
    "TomekLinks": TomekLinks(random_state=0),
    "EditedNearestNeighbours": EditedNearestNeighbours(random_state=0),
    "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(random_state=0),
    "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(random_state=0),
})

# Instance hardness threshold#
#One sampler per estimator, each built around the matching entry of the classifiers dictionary
downsamplers["InstanceHardnessThreshold"].update({
    estimator_name: InstanceHardnessThreshold(random_state=0, estimator=classifiers[estimator_name])
    for estimator_name in ("KNN", "SVM", "RF", "Logistic")
})
#downsamplers["InstanceHardnessThreshold"]["KNN"] = InstanceHardnessThreshold(random_state=0, estimator= KNeighborsClassifier(n_neighbors=100))
#downsamplers["InstanceHardnessThreshold"]["RF"] = InstanceHardnessThreshold(random_state=0, estimator=RandomForestClassifier(n_estimators=1000))
#downsamplers["InstanceHardnessThreshold"]["ADA"] = InstanceHardnessThreshold(random_state=0, estimator=classifiers["ADA"])

### Reading env input for downsampler technique, ligand and classifier  

In [11]:
#Each setting is read from an environment variable and falls back to a default when the
#variable is not set. environ.get() is used instead of a bare try/except, so only a
#missing variable (and no other error) triggers the default.

#Reading the ligand input
ligand = environ.get('ligand', "dnabase")
print("ligand = "+ligand)

#Reading the downsampler input
downsample_method = environ.get('down', "NoDown")
print("downsample_method = "+downsample_method)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

ligand = dnabase
downsample_method = NoDown
classifier_method = XGB


### Test model functions

In [37]:
def test_model(pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_negatives_features, ligand_name, downsample_method, features=None):
    """
    Test the model selected by the global classifier_method in 10-folds cross-validation.

    pred_dict: dictionary of lists (e.g. defaultdict(list)), extended in place with the
               "obs", "prob", "fold" and "model" values of every tested position
    auc_dict: dictionary of lists, the ROC AUC of every fold is appended under the model name
    auprc_dict: dictionary of lists, the area under the precision-recall curve of every fold
                is appended under the model name
    ligand_bind_features: features DataFrame of the positive (binding) positions
    ligand_negatives_features: features DataFrame of the negative (non-binding) positions
    ligand_name: name of the tested ligand, used in the log messages
    downsample_method: "NoDown" for no down-sampling, otherwise a key of the global downsamplers dictionary
    features: optional columns selector passed to .iloc (e.g. a boolean mask);
              None or an empty selector includes all the features

    Returns a dictionary keyed by all the names in the global classifiers dictionary: the entry of
    the tested model holds the test-sets features in the order they were predicted, the rest are None.
    """
    n_folds = 10

    #Default: Exclude no features
    if features is None or len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    #Arranging the features table by the CV order, for each model
    features_pred_dfs = dict.fromkeys(classifiers.keys())

    models_req_scaling = ["SVM", "KNN"]

    #Only the model named by the global classifier_method is tested
    classifier = classifier_method
    model = classifiers[classifier]
    print("classifier_method = " + classifier)
    print("ligand = " + ligand_name)
    #get_xgb_params() exists only on the XGBoost models; the other estimators report get_params()
    if hasattr(model, "get_xgb_params"):
        print(model.get_xgb_params())
    else:
        print(model.get_params())

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    if (classifier in models_req_scaling):
        #NOTE(review): the z-scoring uses the whole table, before the CV split, so the test folds
        #take part in the scaling statistics - consider scaling by the training fold only
        idx = X.index
        cols = X.columns
        X = pd.DataFrame(scale(X)) #Is z-scoring the data needed?
        X.index = idx #Restoring indices after scaling
        X.columns = cols

    #Labels: 1 for the binding positions, followed by 0 for the negatives (same order as X)
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])

    binding_skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    fold_features = []
    fold_aucs = []
    fold_auprcs = []

    for pred_idx, (train_index, test_index) in enumerate(binding_skf.split(X, y), 1):
        print("fold #: "+str(pred_idx))
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y[train_index], y[test_index]

        #Down-sample negative examples to have balanced classes
        if (downsample_method == "NoDown"):
            X_train_sampled = X_train
            y_train_sampled = y_train
        else:
            if (downsample_method == "InstanceHardnessThreshold"):
                downsampler = downsamplers[downsample_method][classifier]
            else:
                downsampler = downsamplers[downsample_method]

            X_train_sampled, y_train_sampled = downsampler.fit_sample(X_train, y_train)

        #early_stopping validation set
        #7.8 Early stopping and Algorithm 7.2
        #http://egrcc.github.io/docs/dl/deeplearningbook-regularization.pdf

        #X_subtrain, X_valid, y_subtrain, y_valid = train_test_split(X_train_sampled,
        #                                y_train_sampled, stratify=y_train_sampled, test_size = .1)

        #fit to training data
        model.fit(X_train_sampled, y_train_sampled)#,
                  #eval_set = [(X_valid,y_valid)],eval_metric = "map", early_stopping_rounds = 50,verbose = False)
        #print model.best_ntree_limit

        #Probability of the positive class (second column of predict_proba)
        pos_probs = model.predict_proba(X_test)[:, 1]#,ntree_limit=model.best_ntree_limit)

        pred_dict["obs"].extend(y_test)
        pred_dict["prob"].extend(pos_probs)
        pred_dict["fold"].extend([pred_idx] * len(pos_probs))
        pred_dict["model"].extend([classifier] * len(pos_probs))

        #Update auc auprc dictionaries
        fold_auc = roc_auc_score(y_test, pos_probs)
        precision, recall, _ = precision_recall_curve(y_test, pos_probs)
        fold_auprc = auc(recall, precision)
        auc_dict[classifier].append(fold_auc)
        auprc_dict[classifier].append(fold_auprc)
        fold_aucs.append(fold_auc)
        fold_auprcs.append(fold_auprc)

        #Keep the test features of this fold; they are combined once, after the loop
        fold_features.append(X_test)

        print("AUC = "+str(fold_auc))
        print("AUPRC = "+str(fold_auprc))

    #Update features table
    features_pred_dfs[classifier] = pd.concat(fold_features)

    #Averages over the folds of this call only, whatever auc_dict / auprc_dict held on entry
    avg_auc = np.mean(fold_aucs)
    print("avg auc = "+str(avg_auc))

    avg_auprc = np.mean(fold_auprcs)
    print("avg auprc = "+str(avg_auprc))

    print("Finished "+ligand_name+" "+classifier)

    return features_pred_dfs

In [13]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Join the features of the tested positions with their predictions, for the model
    named by the global classifier_method.

    ligand: name of the tested ligand (used only by the commented-out saving path)
    ordered_features: dictionary of model name -> features DataFrame in prediction order,
                      as returned by test_model
    pred_df: predictions DataFrame with a "model" column; its rows of the tested model
             are expected in the same order as ordered_features[model]

    Returns the combined DataFrame (features columns followed by the prediction columns).
    pred_df itself is not modified.
    """
    classifier = classifier_method

    #Copy the filtered rows, so re-indexing them never touches the caller's table
    model_pred = pred_df[pred_df["model"] == classifier].copy()
    model_pred.index = ordered_features[classifier].index

    #Creating the combined table
    features_pred = pd.concat([ordered_features[classifier], model_pred], axis=1)

    #Saving
    #features_pred.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/features_pred_tables/"+ligand+"_"+classifier+"_features_pred.csv", sep='\t')

    #Previously the combined table was built and then dropped; returning it makes it usable by the caller
    return features_pred

#### Predict for each ligand separately

In [14]:
#Show the list of ligand names defined in the input reading cell
ligands

['dna',
 'dnabase',
 'dnabackbone',
 'rna',
 'rnabase',
 'rnabackbone',
 'peptide',
 'ion',
 'metabolite']

In [18]:
%%time
    

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
classifier_method = "XGB"
ligand = "dna"
print "classifier_method = "+classifier_method
ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

classifier_method = XGB
classifier_method = XGB
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1000, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 0.05, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 6, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9689619679892953
AUPRC = 0.5367477078050494
fold #: 2
AUC = 0.9709553805774278
AUPRC = 0.5483980379911849
fold #: 3
AUC = 0.947270341207349
AUPRC = 0.5784313941778981
fold #: 4
AUC = 0.9758582677165354
AUPRC = 0.6492696441645193
fold #: 5
AUC = 0.9740472440944882
AUPRC = 0.5311842129046221
fold #: 6
AUC = 0.9831451824625885
AUPRC = 0.5266800071534994
fold #: 7
AUC = 0.9552323444473615
AUPRC = 0.5682737046019297
fold #: 8
AUC = 0.9846153846153846
AUPRC = 0.6715219169362363
fold #: 9
AUC = 0.9847256497768443
AUPRC = 0.5962302036188929
fold #: 10
AUC = 0.943644001

In [14]:
%%time
    

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
classifier_method = "XGB0"
print "classifier_method = "+classifier_method
ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

classifier_method = XGB0
XGB0
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1000, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 0, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 6, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9283307413167443
AUPRC = 0.20598023081482716
fold #: 2
AUC = 0.9261664074650078
AUPRC = 0.3347628515265228
fold #: 3
AUC = 0.9717340590979783
AUPRC = 0.34607683761326613
fold #: 4
AUC = 0.9706692859676407
AUPRC = 0.2893260762256532
fold #: 5
AUC = 0.9336853019017217
AUPRC = 0.15494171444777186
fold #: 6
AUC = 0.9876538157212628
AUPRC = 0.33022125279318804
fold #: 7
AUC = 0.9682955444599055
AUPRC = 0.23838729908026665
fold #: 8
AUC = 0.9406274306455795
AUPRC = 0.13281975767740178
fold #: 9
AUC = 0.9778666266391933
AUPRC = 0.2318335902319532
fold #: 10
AUC = 0.9808550414147893
AUPRC = 0.203237605

In [15]:
%%time
    

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
classifier_method = "XGB1"
print "classifier_method = "+classifier_method
ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

classifier_method = XGB1
XGB1
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1000, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 1, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 6, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9459823742871954
AUPRC = 0.18759793948050751
fold #: 2
AUC = 0.9256220839813375
AUPRC = 0.3134783583479348
fold #: 3
AUC = 0.9661093831000518
AUPRC = 0.2545237423287126
fold #: 4
AUC = 0.9755259065236964
AUPRC = 0.2908478006779861
fold #: 5
AUC = 0.926332160104772
AUPRC = 0.15980258304066633
fold #: 6
AUC = 0.9837385064527571
AUPRC = 0.2973899886274574
fold #: 7
AUC = 0.9737933480669013
AUPRC = 0.24637447311508268
fold #: 8
AUC = 0.9481735190971985
AUPRC = 0.13682648794125002
fold #: 9
AUC = 0.9696518974386966
AUPRC = 0.2197572267935536
fold #: 10
AUC = 0.9812098303835814
AUPRC = 0.208687976122

In [15]:
%%time
    

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
classifier_method = "XGBTuned1"
print "classifier_method = "+classifier_method
ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

classifier_method = XGBTuned1
XGBTuned1
{'reg_alpha': 0.5, 'colsample_bytree': 0.7, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 108, 'subsample': 0.7, 'reg_lambda': 1, 'min_child_weight': 16, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 1, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.807030844997408
AUPRC = 0.14713145744324044
fold #: 2
AUC = 0.8922174701918093
AUPRC = 0.2456178570800179
fold #: 3
AUC = 0.9034862623120788
AUPRC = 0.26463888531935736
fold #: 4
AUC = 0.8979495784562495
AUPRC = 0.13171896467807515
fold #: 5
AUC = 0.8494925104362773
AUPRC = 0.14732196704699865
fold #: 6
AUC = 0.9173760606804725
AUPRC = 0.3133979962441291
fold #: 7
AUC = 0.8274671905268615
AUPRC = 0.10221648986231564
fold #: 8
AUC = 0.8706521294160992
AUPRC = 0.10743259909223946
fold #: 9
AUC = 0.8335630364477437
AUPRC = 0.08929582855078706
fold #: 10
AUC = 0.9402317044880805
AUPRC

In [16]:
%%time
    

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
classifier_method = "XGBTuned2"
ligand = "dna"
print "classifier_method = "+classifier_method
ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

classifier_method = XGBTuned2
XGBTuned2
{'reg_alpha': 0.5, 'colsample_bytree': 0.7, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 268, 'subsample': 0.7, 'reg_lambda': 1, 'min_child_weight': 16, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 1, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.8260627268014515
AUPRC = 0.11302788408812203
fold #: 2
AUC = 0.9217988595127009
AUPRC = 0.25699153321887486
fold #: 3
AUC = 0.9217340590979782
AUPRC = 0.21325011468921576
fold #: 4
AUC = 0.9212026956972523
AUPRC = 0.25630803736264407
fold #: 5
AUC = 0.8588442334451993
AUPRC = 0.13308505025211484
fold #: 6
AUC = 0.9434121852064064
AUPRC = 0.3721012794570318
fold #: 7
AUC = 0.8568933999072331
AUPRC = 0.09290295799242532
fold #: 8
AUC = 0.8893058417368285
AUPRC = 0.1362742417427251
fold #: 9
AUC = 0.8805589290831435
AUPRC = 0.11963527918550573
fold #: 10
AUC = 0.9449667726484997
AUPR

In [17]:
%%time
    

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
classifier_method = "XGBTuned3"
print "classifier_method = "+classifier_method
ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

classifier_method = XGBTuned3
XGBTuned3
{'reg_alpha': 0.5, 'colsample_bytree': 0.7, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.01, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 86, 'subsample': 0.7, 'reg_lambda': 1, 'min_child_weight': 16, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 1, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.7570697252462416
AUPRC = 0.05553485190377219
fold #: 2
AUC = 0.853538102643857
AUPRC = 0.07532002163355103
fold #: 3
AUC = 0.8338711767755314
AUPRC = 0.11065110247574282
fold #: 4
AUC = 0.8272625576382636
AUPRC = 0.08251685637131378
fold #: 5
AUC = 0.8352705246787264
AUPRC = 0.09161760242813605
fold #: 6
AUC = 0.8501814411612234
AUPRC = 0.15406479264345818
fold #: 7
AUC = 0.7363441652342365
AUPRC = 0.09690432634167996
fold #: 8
AUC = 0.7693939931498437
AUPRC = 0.0621913231656854
fold #: 9
AUC = 0.6681222111540194
AUPRC = 0.02431524051758961
fold #: 10
AUC = 0.8541066823137698
AUPR

In [22]:
%%time
    

#Settings for this run.
#NOTE(review): classifier_method is never handed to test_model below, so it is
#presumably picked up there as a notebook-level global - confirm before renaming it.
classifier_method = "XGBUntuned4Dna"
ligand = "dna"

#Fresh, empty accumulators for this run (predictions, AUC, AUPRC)
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

print("classifier_method = "+classifier_method)

#Evaluate the model for this ligand. test_model is defined elsewhere in the notebook;
#it presumably fills the three dicts in place (TODO confirm). Its return value is
#forwarded to combine_features_predictions further down.
ordered_features = test_model(
    pred_dict,
    auc_dict,
    auprc_dict,
    ligands_features_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
    downsample_method
)

#Turn each accumulator into a DataFrame
pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Writing the results to disk is currently switched off; uncomment the lines below to re-enable it
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Merge the features and the prediction results into one unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

classifier_method = XGBUntuned4Dna
classifier_method = XGBUntuned4Dna
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.8, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'nthread': 5, 'base_score': 0.5, 'n_estimators': 190, 'subsample': 0.8, 'reg_lambda': 1, 'min_child_weight': 1, 'objective': 'binary:logistic', 'seed': 27, 'max_depth': 5, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9595749060779167
AUPRC = 0.5411362662126133
fold #: 2
AUC = 0.9730236220472441
AUPRC = 0.568356016856487
fold #: 3
AUC = 0.940503937007874
AUPRC = 0.570108574610205
fold #: 4
AUC = 0.9656797900262467
AUPRC = 0.6134804384575574
fold #: 5
AUC = 0.9658005249343832
AUPRC = 0.5371323639161083
fold #: 6
AUC = 0.9845628773956419
AUPRC = 0.518449800821479
fold #: 7
AUC = 0.9602362824888422
AUPRC = 0.5314029051981678
fold #: 8
AUC = 0.9885481753741139
AUPRC = 0.6704159617094891
fold #: 9
AUC = 0.9838697820950381
AUPRC = 0.5278406734887

In [23]:
%%time
    

#Settings for this run.
#NOTE(review): classifier_method is never handed to test_model below, so it is
#presumably picked up there as a notebook-level global - confirm before renaming it.
classifier_method = "XGBTuned4Dna"
ligand = "dna"

#Fresh, empty accumulators for this run (predictions, AUC, AUPRC)
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

print("classifier_method = "+classifier_method)

#Evaluate the model for this ligand. test_model is defined elsewhere in the notebook;
#it presumably fills the three dicts in place (TODO confirm). Its return value is
#forwarded to combine_features_predictions further down.
ordered_features = test_model(
    pred_dict,
    auc_dict,
    auprc_dict,
    ligands_features_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
    downsample_method
)

#Turn each accumulator into a DataFrame
pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Writing the results to disk is currently switched off; uncomment the lines below to re-enable it
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Merge the features and the prediction results into one unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

classifier_method = XGBTuned4Dna
classifier_method = XGBTuned4Dna
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.8, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'nthread': 5, 'base_score': 0.5, 'n_estimators': 190, 'subsample': 0.8, 'reg_lambda': 1, 'min_child_weight': 4, 'objective': 'binary:logistic', 'seed': 27, 'max_depth': 7, 'gamma': 0.6, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9623436776285317
AUPRC = 0.5126136645073994
fold #: 2
AUC = 0.9641049868766405
AUPRC = 0.5320622749349843
fold #: 3
AUC = 0.9473490813648295
AUPRC = 0.5641662809068302
fold #: 4
AUC = 0.9700892388451443
AUPRC = 0.6111808578460529
fold #: 5
AUC = 0.9708188976377953
AUPRC = 0.5459117239500124
fold #: 6
AUC = 0.98367550538199
AUPRC = 0.5178210309843262
fold #: 7
AUC = 0.9613179312155422
AUPRC = 0.5570482001065977
fold #: 8
AUC = 0.9876503019165135
AUPRC = 0.6658482408190497
fold #: 9
AUC = 0.9851614597007089
AUPRC = 0.5209172610698

In [21]:
%%time
    

#Settings for this run.
#NOTE(review): classifier_method is never handed to test_model below, so it is
#presumably picked up there as a notebook-level global - confirm before renaming it.
classifier_method = "XGBSPW=76"
ligand = "dna"

#Fresh, empty accumulators for this run (predictions, AUC, AUPRC)
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

print("classifier_method = "+classifier_method)

#Evaluate the model for this ligand. test_model is defined elsewhere in the notebook;
#it presumably fills the three dicts in place (TODO confirm). Its return value is
#forwarded to combine_features_predictions further down.
ordered_features = test_model(
    pred_dict,
    auc_dict,
    auprc_dict,
    ligands_features_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
    downsample_method
)

#Turn each accumulator into a DataFrame
pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Writing the results to disk is currently switched off; uncomment the lines below to re-enable it
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Merge the features and the prediction results into one unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

classifier_method = XGBSPW=76
classifier_method = XGBSPW=76
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 76, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1000, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 0.05, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 6, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9681231022592764
AUPRC = 0.5314034298120206
fold #: 2
AUC = 0.9625511811023623
AUPRC = 0.5532429779991392
fold #: 3
AUC = 0.9589238845144357
AUPRC = 0.5587261390338145
fold #: 4
AUC = 0.9768713910761154
AUPRC = 0.6504401216125754
fold #: 5
AUC = 0.9740419947506562
AUPRC = 0.5433794925930066
fold #: 6
AUC = 0.9820005250721974
AUPRC = 0.5436923714958115
fold #: 7
AUC = 0.9619952743502231
AUPRC = 0.5422731828918693
fold #: 8
AUC = 0.9803622998162248
AUPRC = 0.6599993114435061
fold #: 9
AUC = 0.9859280651089525
AUPRC = 0.6135566355842432
fold #: 10
AUC

In [17]:
%%time
    

#Settings for this run.
#NOTE(review): classifier_method is never handed to test_model below, so it is
#presumably picked up there as a notebook-level global - confirm before renaming it.
classifier_method = "XGBSPW=76Tuned"
ligand = "dna"

#Fresh, empty accumulators for this run (predictions, AUC, AUPRC)
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

print("classifier_method = "+classifier_method)

#Evaluate the model for this ligand. test_model is defined elsewhere in the notebook;
#it presumably fills the three dicts in place (TODO confirm). Its return value is
#forwarded to combine_features_predictions further down.
ordered_features = test_model(
    pred_dict,
    auc_dict,
    auprc_dict,
    ligands_features_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
    downsample_method
)

#Turn each accumulator into a DataFrame
pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Writing the results to disk is currently switched off; uncomment the lines below to re-enable it
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Merge the features and the prediction results into one unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

classifier_method = XGBSPW=76Tuned
classifier_method = XGBSPW=76Tuned
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 76, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'nthread': 1, 'base_score': 0.5, 'n_estimators': 190, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 0, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 9, 'gamma': 0.2, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9725696052699295
AUPRC = 0.5496023107391048
fold #: 2
AUC = 0.9729238845144357
AUPRC = 0.5945276617158096
fold #: 3
AUC = 0.9619737532808399
AUPRC = 0.5465831329057493
fold #: 4
AUC = 0.9754803149606299
AUPRC = 0.648100795656263
fold #: 5
AUC = 0.9752755905511812
AUPRC = 0.6081963950632989
fold #: 6
AUC = 0.9857915463376214
AUPRC = 0.5413520126426142
fold #: 7
AUC = 0.9728905224468365
AUPRC = 0.528470428500273
fold #: 8
AUC = 0.985607771068522
AUPRC = 0.706295508052633
fold #: 9
AUC = 0.9846101338934103
AUPRC = 0.6164476999403

In [42]:
%%time
    

#Settings for this run.
#NOTE(review): classifier_method is never handed to test_model below, so it is
#presumably picked up there as a notebook-level global - confirm before renaming it.
classifier_method = "XGBSPW=76Tuned2"
ligand = "dna"

#Fresh, empty accumulators for this run (predictions, AUC, AUPRC)
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

print("classifier_method = "+classifier_method)

#Evaluate the model for this ligand. test_model is defined elsewhere in the notebook;
#it presumably fills the three dicts in place (TODO confirm). Its return value is
#forwarded to combine_features_predictions further down.
ordered_features = test_model(
    pred_dict,
    auc_dict,
    auprc_dict,
    ligands_features_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
    downsample_method
)

#Turn each accumulator into a DataFrame
pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Writing the results to disk is currently switched off; uncomment the lines below to re-enable it
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Merge the features and the prediction results into one unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

classifier_method = XGBSPW=76Tuned2
classifier_method = XGBSPW=76Tuned2
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 76, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 190, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 0, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 9, 'gamma': 0.1, 'booster': 'gbtree'}
fold #: 1
AUC = 0.9654315269414853
AUPRC = 0.5395793256024765
fold #: 2
AUC = 0.9682992125984251
AUPRC = 0.5437820066712443
fold #: 3
AUC = 0.9600000000000001
AUPRC = 0.5765961423656617
fold #: 4
AUC = 0.9676482939632546
AUPRC = 0.680253390857843
fold #: 5
AUC = 0.9786141732283465
AUPRC = 0.5984576348544359
fold #: 6
AUC = 0.9876765555263849
AUPRC = 0.5662582795430952
fold #: 7
AUC = 0.9741454449986873
AUPRC = 0.5129763847646818
fold #: 8
AUC = 0.9811551588343397
AUPRC = 0.7159968783103337
fold #: 9
AUC = 0.9825623523234445
AUPRC = 0.6043627410301569
fold 

In [29]:
%%time
    
#Early-stopping trial.
#NOTE(review): nothing in this cell switches early stopping on, so it is presumably
#configured inside test_model for the "XGB" method - confirm.
#NOTE(review): classifier_method is never handed to test_model below, so it is
#presumably picked up there as a notebook-level global - confirm before renaming it.
classifier_method = "XGB"
ligand = "dna"

#Fresh, empty accumulators for this run (predictions, AUC, AUPRC)
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

print("classifier_method = "+classifier_method)

#Evaluate the model for this ligand. test_model is defined elsewhere in the notebook;
#it presumably fills the three dicts in place (TODO confirm). Its return value is
#forwarded to combine_features_predictions further down.
ordered_features = test_model(
    pred_dict,
    auc_dict,
    auprc_dict,
    ligands_features_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
    downsample_method
)

#Turn each accumulator into a DataFrame
pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Writing the results to disk is currently switched off; uncomment the lines below to re-enable it
#pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1.csv", sep='\t')
#auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auc.csv", sep='\t')
#auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegLigand_NoFilter/"+downsample_method+"/01.25.2018/"+ligand+"_"+classifier_method+"_0.1_auprc.csv", sep='\t')

#Merge the features and the prediction results into one unified table
combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

classifier_method = XGB
classifier_method = XGB
ligand = dna
{'reg_alpha': 0, 'colsample_bytree': 0.5, 'silent': 1, 'colsample_bylevel': 1, 'scale_pos_weight': 1, 'learning_rate': 0.1, 'missing': None, 'max_delta_step': 0, 'base_score': 0.5, 'n_estimators': 1000, 'subsample': 1, 'reg_lambda': 1, 'min_child_weight': 0.05, 'objective': 'binary:logistic', 'seed': 0, 'max_depth': 6, 'gamma': 0, 'booster': 'gbtree'}
fold #: 1
91
AUC = 0.9752148628480264
AUPRC = 0.5490236817104815
fold #: 2
116
AUC = 0.972482939632546
AUPRC = 0.5373684053454282
fold #: 3
90
AUC = 0.9669238845144357
AUPRC = 0.5781991142319687
fold #: 4
111
AUC = 0.9765984251968505
AUPRC = 0.6244182297791343
fold #: 5
61
AUC = 0.9680209973753281
AUPRC = 0.46974786568908633
fold #: 6
97
AUC = 0.9891572591231294
AUPRC = 0.5418884275389815
fold #: 7
96
AUC = 0.9791493830401681
AUPRC = 0.5526473263389557
fold #: 8
73
AUC = 0.98947755316356
AUPRC = 0.6891099855823766
fold #: 9
75
AUC = 0.9856130217904961
AUPRC = 0.5666105174181898
