In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale

#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.core.display import HTML
#A bare HTML(...) expression is rendered only when it is the last expression of a cell;
#display() makes the full-width style take effect from the middle of this cell.
from IPython.display import display
display(HTML("<style>.container { width:100% !important; }</style>"))

#Forwarded as the `use_max_binding_score` argument when the negatives datasets are created
ABSOLUTE_NEGATIVES = False
#True: negatives are built by filter_to_ligand_binding_domains(), False: by negatives_by_binding_score()
FILTER_DOMAIN = False
#Forwarded as the `filter_max_score_zero` argument of negatives_by_binding_score()
FILTER_MAX_SCORE_ZERO = False
#Number of folds of the StratifiedKFold split in test_model()
K = 10
#NOTE(review): out_dir is not referenced in the visible part of this notebook - confirm it is still needed
out_dir = "mediode_stacking"

### Reading the input dataset

In [2]:
from os import getcwd

#Working directory, kept as a one-element list because the rest of the notebook reads curr_dir[0]
#(pure-Python replacement of the `!pwd` shell call)
curr_dir = [getcwd()]
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#Number of trailing columns of the features table that are dropped as binding scores
bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
print("all samples positions #: "+str(features_all.shape[0]))

#Constants
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
models = ["XGB", "RF", "ADA", "KNN", "SVM", "Logistic"]

#ligand binding domains dictionary
#NOTE(review): pickle.load can execute arbitrary code - only load this file from a trusted source
with open(curr_dir[0]+"/../ligands_negatives_domains_dict.pik", 'rb') as handle:
    negatives_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [3]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    """
    Build the negative (non-binding) positions of every ligand, restricted to the
    domains listed for that ligand in negatives_dict.

    Reads the notebook-level ligands, negatives_dict, features_all and features_cols.

    use_max_binding_score: if true, a position is a negative when its
        "max_binding_score" is 0; otherwise when its "<ligand>_binding_score" is 0.

    Returns a dict: ligand name -> DataFrame holding only the features columns,
    plus an "all_ligands" entry with the de-duplicated union of all the ligands'
    negatives whose "max_binding_score" is 0.
    """

    ligands_negatives_df = {}
    for ligand in ligands:

        if use_max_binding_score:
            score_col = "max_binding_score"
        else:
            score_col = ligand+"_binding_score"

        #Collect the per-domain slices and concatenate them once;
        #concatenating inside the loop re-copies all the accumulated rows on every iteration
        domain_frames = []
        for domain in negatives_dict[ligand].keys():
            #These two keys are not domain names
            if domain == 'negatives' or domain == 'domains':
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]

            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue

            domain_frames.append(domain_all.loc[domain_all.loc[:,score_col] == 0,:])

        if domain_frames:
            ligands_negatives_df[ligand] = pd.concat(domain_frames)
        else:
            #No domain left for this ligand: keep an empty frame that still carries the table's columns,
            #so that the column selections below do not fail
            ligands_negatives_df[ligand] = features_all.iloc[0:0]

    #Handling the ligand "all_ligands"
    #The concatenation order is kept as it was (ion before peptide) so that the rows come out in the same order
    combined_order = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "ion", "peptide", "metabolite"]
    all_ligands_negatives_df = pd.concat([ligands_negatives_df[name] for name in combined_order])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df

    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand][features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))

    return ligands_negatives_df

In [4]:
def negatives_by_binding_score(use_max_binding_score, filter_max_score_zero):
    """
    Build the negative (non-binding) positions of every ligand from the whole
    features table, by binding score only.

    Reads the notebook-level ligands, features_all and features_cols.

    use_max_binding_score: if true, a position is a negative when its
        "max_binding_score" is 0; otherwise when its "<ligand>_binding_score" is 0.
    filter_max_score_zero: if true, positions whose "max_binding_score" is 0
        are additionally removed from the per-ligand negatives.

    Returns a dict: ligand name -> DataFrame holding only the features columns,
    plus an "all_ligands" entry with all the positions whose "max_binding_score" is 0.
    """

    ligands_negatives_df = {}
    for ligand in ligands:

        if use_max_binding_score:
            ligand_bind_str = "max_binding_score"
        else:
            ligand_bind_str = ligand+"_binding_score"

        #Combine the conditions into one mask over the full table. Chaining two boolean selections
        #applied a full-length mask to an already filtered frame and relied on pandas reindexing it.
        negatives_mask = features_all[ligand_bind_str] == 0
        if (filter_max_score_zero):
            negatives_mask = negatives_mask & (features_all["max_binding_score"] != 0)
        ligands_negatives_df[ligand] = features_all.loc[negatives_mask, features_cols]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))

    #Handling the ligand "all_ligands"
    all_ligands_mask = features_all["max_binding_score"] == 0
    ligands_negatives_df["all_ligands"] = features_all.loc[all_ligands_mask, features_cols]
    print("all_ligands non-binding #:"+str(len(ligands_negatives_df["all_ligands"])))

    return ligands_negatives_df

In [5]:
#Create negatives datasets
#FILTER_DOMAIN selects the builder; ABSOLUTE_NEGATIVES is handed over as its use_max_binding_score flag
if FILTER_DOMAIN:
    ligands_negatives_df = filter_to_ligand_binding_domains(bool(ABSOLUTE_NEGATIVES))
else:
    ligands_negatives_df = negatives_by_binding_score(bool(ABSOLUTE_NEGATIVES), FILTER_MAX_SCORE_ZERO)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [6]:
#Minimal binding score for a position to be taken as a positive (binding) example
bind_th = 0.1
ligands_features_df = {}

for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    ligand_binding_df = features_all[features_all[score_col_str] >= bind_th]
    #print() with a single argument behaves the same under Python 2 and 3
    print(ligand+" #: "+str(ligand_binding_df.shape[0]))
    #Leaving just the features columns
    ligands_features_df[ligand] = ligand_binding_df.loc[:,features_cols]

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [7]:
#Union of the positives of all the ligands
all_ligands_features_df = pd.concat([ligands_features_df["dna"], ligands_features_df["dnabase"], ligands_features_df["dnabackbone"], ligands_features_df["rna"], ligands_features_df["rnabase"], 
                                     ligands_features_df["rnabackbone"], ligands_features_df["ion"], ligands_features_df["peptide"], ligands_features_df["metabolite"]])
#NOTE(review): rows are compared by their features values only (the index is ignored),
#so two different positions with identical features are merged as well - confirm this is intended
all_ligands_features_df = all_ligands_features_df.drop_duplicates()
print("all_ligands #: "+str(all_ligands_features_df.shape[0]))
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Models tested (and their hyper-parameters)

In [8]:
#Classifier name -> configured (not yet fitted) estimator
#Regressor variants (RandomForestRegressor / KNeighborsRegressor) are imported but not registered here
classifiers = {
    "Logistic": LogisticRegression(C=0.001, random_state=0),
    "RF": RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0),
    "KNN": KNeighborsClassifier(n_neighbors=100, n_jobs=-1),
    "SVM": SVC(kernel='rbf', probability=True, random_state=0),
    "ADA": AdaBoostClassifier(n_estimators=1000, random_state=0),
}
#Ensembles built on top of the logistic regression model defined above
classifiers["ADA-Log"] = AdaBoostClassifier(base_estimator=classifiers["Logistic"], n_estimators=1000, random_state=0)
classifiers["Bag-Log"] = BaggingClassifier(base_estimator=classifiers["Logistic"], n_estimators=1000, n_jobs=-1, random_state=0)
classifiers["XGB"] = XGBClassifier(
    n_estimators=1000,
    n_jobs=-1,
    random_state=0,
    max_depth=6,
    min_child_weight=0.05,
    colsample_bytree=0.5,
)

### Downsamplers tested

In [9]:
#documentation on techniques: http://contrib.scikit-learn.org/imbalanced-learn/stable/under_sampling.html#cleaning-under-sampling-techniques
#Downsampler name -> sampler; "InstanceHardnessThreshold" maps to a nested dict keyed by classifier name
downsamplers = defaultdict(dict)

##Prototype generation##
downsamplers.update({
    "ClusterCentroids": ClusterCentroids(random_state=0),
})

##Prototype selection##
#Controlled#
downsamplers.update({
    "RandomUnderSampler": RandomUnderSampler(random_state=0),
    "NearMiss3": NearMiss(random_state=0, version=3),
    "NearMiss2": NearMiss(random_state=0, version=2),
    "NearMiss1": NearMiss(random_state=0, version=1),
})

#Cleaning#
downsamplers.update({
    "TomekLinks": TomekLinks(random_state=0),
    "EditedNearestNeighbours": EditedNearestNeighbours(random_state=0),
    "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(random_state=0),
    "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(random_state=0),
})

# Instance hardness threshold#
#One sampler per classifier, each wrapping the estimator registered in `classifiers` under the same name
downsamplers["InstanceHardnessThreshold"].update({
    clf_name: InstanceHardnessThreshold(random_state=0, estimator=classifiers[clf_name])
    for clf_name in ("KNN", "SVM", "RF", "Logistic", "ADA", "ADA-Log", "Bag-Log")
})

### Reading env input for downsampler technique, ligand and classifier

In [10]:
#Each input is read from the environment and falls back to a default when the variable is not set.
#environ.get replaces the bare try/except, which would also have hidden any unrelated error.

#Reading the ligand input
ligand = environ.get('ligand', "dnabase")
print("ligand = "+ligand)

#Reading the downsampler input
downsample_method = environ.get('down', "RandomUnderSampler")
print("downsample_method = "+downsample_method)

#Reading the classifier input
classifier_method = environ.get('classifier', "Logistic")
print("classifier_method = "+classifier_method)

ligand = dnabase
downsample_method = RandomUnderSampler
classifier_method = Logistic


### Create out-of-fold 1st level predictions: one for each test-train split

In [11]:
def train_predict_auc(X_train, y_train, X_test, y_test, downsample_method, pred_dict, pred_idx, auc_dict, auprc_dict, features_pred_dfs):
    """
    Downsample the training data, fit the selected classifier on it and record
    its predictions and scores on the test data.

    The classifier is selected by the notebook-level classifier_method and is
    taken from `classifiers`; the sampler is taken from `downsamplers`.

    X_train, y_train: training features and labels.
    X_test, y_test: features and labels that are predicted and scored.
    downsample_method: key in `downsamplers`, or "NoDown" to skip downsampling.
    pred_dict: dict of lists, extended with "obs", "prob", "fold" and "model" entries, one per test row.
    pred_idx: fold number that is stored in pred_dict["fold"].
    auc_dict, auprc_dict: dicts of lists keyed by classifier name; one score is appended to each.
    features_pred_dfs: dict keyed by classifier name; X_test is appended to the classifier's frame.

    Returns nothing; all the results are accumulated in the dictionaries that were passed in.
    Raises ValueError when downsample_method is not a known downsampler.
    """

    #Downsampling according to "downsample_method"
    if (downsample_method == "NoDown"):
        X_train_sampled = X_train
        y_train_sampled = y_train
    else:
        #downsamplers is a defaultdict: looking up an unknown name would silently create an empty entry
        #and fail later with an unrelated error, so the name is checked first
        if downsample_method not in downsamplers:
            raise ValueError("Unknown downsample_method: "+str(downsample_method))

        if (downsample_method == "InstanceHardnessThreshold"):
            downsampler = downsamplers[downsample_method][classifier_method]
        else:
            downsampler = downsamplers[downsample_method]

        #imbalanced-learn renamed fit_sample to fit_resample; use whichever this version provides
        resample = getattr(downsampler, "fit_resample", None)
        if resample is None:
            resample = downsampler.fit_sample
        X_train_sampled, y_train_sampled = resample(X_train, y_train)

    #fit to training data
    model = classifiers[classifier_method]
    model.fit(X_train_sampled, y_train_sampled)

    #predict for the test data; the second column is the probability of the label 1
    probs = model.predict_proba(X_test)
    positive_probs = probs[:, 1]
    probs_list = list(positive_probs)

    #Save prediction results for plotting
    pred_dict["obs"].extend(y_test)
    pred_dict["prob"].extend(probs_list)
    pred_dict["fold"].extend([pred_idx] * len(probs_list))
    pred_dict["model"].extend([classifier_method] * len(probs_list))

    #Update auc auprc dictionaries
    auc_dict[classifier_method].append(roc_auc_score(y_test, positive_probs))
    precision, recall, _ = precision_recall_curve(y_test, positive_probs)
    auprc_dict[classifier_method].append(auc(recall, precision))
    print("AUC = "+str(auc_dict[classifier_method][-1]))
    print("AUPRC = "+str(auprc_dict[classifier_method][-1]))

    #Update features table (pd.concat instead of DataFrame.append, which newer pandas versions no longer provide)
    features_pred_dfs[classifier_method] = pd.concat([features_pred_dfs[classifier_method], X_test])

In [12]:
def combine_features_predictions(ligand, ordered_features, pred_df, i):
    """
    Put the features rows side by side with the predictions made for them and
    save the combined table as a tab-separated file.

    The classifier is selected by the notebook-level classifier_method.

    ligand: ligand name, used in the output file name.
    ordered_features: dict keyed by classifier name; the classifier's frame holds the
        features rows in the same order as that classifier's rows in pred_df.
    pred_df: predictions table with a "model" column.
    i: zero-based index of the held-out fold; i+1 is used in the output directory and file name.
    """

    features_of_model = ordered_features[classifier_method]

    #Predictions of the current classifier only, as an independent copy so pred_df stays untouched
    is_current_model = pred_df["model"] == classifier_method
    model_pred = pred_df.loc[is_current_model].copy()
    #Give the predictions the features' index so that the two tables align row by row
    model_pred.index = features_of_model.index

    #Creating the combined table
    features_pred = pd.concat([features_of_model, model_pred], axis=1)

    #Saving
    fold_str = str(i+1)
    out_path = curr_dir[0]+"/1st_level_pred/"+fold_str+"/"+ligand+"_"+classifier_method+"_fold"+fold_str+"_features_pred.csv"
    features_pred.to_csv(out_path, sep='\t')

In [18]:
def test_model(ligand_bind_features, ligand_negatives_features, ligand_name, downsample_method, features=None):
    """
    compute 1st level stacking predictions from "out-of-fold" data, K-folds CV

    The data is split to K stratified folds. Each fold in turn is the held-out set:
    every other fold is predicted by a model that is fitted on the remaining K-2 folds
    (internal CV), and the held-out fold is predicted by a model that is fitted on all
    the other K-1 folds. The predictions, AUC and AUPRC values and the predictions
    combined with the features are saved to files under "1st_level_pred/<fold #>/".

    The classifier is selected by the notebook-level classifier_method.

    ligand_bind_features: features of the positive examples (label 1).
    ligand_negatives_features: features of the negative examples (label 0).
    ligand_name: ligand name, used in the names of the output files.
    downsample_method: downsampler name that is passed on to train_predict_auc.
    features: positional selector of the features columns to include;
        None or an empty value includes all the columns.
    """

    #Default: Exclude no features
    if features is None or len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
    features_pred_dfs = dict.fromkeys(classifiers.keys())
    models_req_scaling = ["SVM", "KNN"]

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])

    if (classifier_method in models_req_scaling):
        #Is z-scoring the data needed?
        #NOTE(review): the scaling statistics are computed on the whole table, including the rows that later serve as test folds
        X = pd.DataFrame(scale(X), index=X.index, columns=X.columns)

    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])

    #Create the external CV indices that defines the held-out set
    ext_cv_idx = []
    external_skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=0)
    for ext_train_index, heldout_index in external_skf.split(X, y):
        ext_cv_idx.append({"train": ext_train_index, "test": heldout_index})

    #For each heldout set, create out-of-fold 1st prediction results and save them
    for i in range(K):
        print("ext fold #"+str(i+1))
        ext_train_index = ext_cv_idx[i]["train"]
        heldout_index = ext_cv_idx[i]["test"]
        X_ext_train, X_heldout = X.iloc[ext_train_index,:], X.iloc[heldout_index,:]
        y_ext_train, y_heldout = y[ext_train_index], y[heldout_index]

        #Initialize dictionaries and features df
        pred_dict = defaultdict(list)
        auc_dict = defaultdict(list)
        auprc_dict = defaultdict(list)
        features_pred_dfs[classifier_method] = pd.DataFrame()

        #internal CV on all folds except fold #i
        pred_idx = 1
        for j in range(K):
            print("int fold #"+str(j+1))
            if (j == i):
                continue
            int_test_index = ext_cv_idx[j]["test"]
            #all indices in ext train that aren't in current test
            #(the set makes each membership test constant-time instead of a scan of the whole test fold)
            int_test_lookup = set(int_test_index)
            int_train_index = np.array([x for x in ext_train_index if x not in int_test_lookup])
            X_int_train, X_int_test = X.iloc[int_train_index,:], X.iloc[int_test_index,:]
            y_int_train, y_int_test = y[int_train_index], y[int_test_index]

            #fit to training data: (K-2) folds, and predict for the remaining fold
            train_predict_auc(X_int_train, y_int_train, X_int_test, y_int_test, downsample_method, pred_dict, pred_idx, auc_dict, auprc_dict, features_pred_dfs)
            pred_idx += 1

        #print averages across internal CV
        avg_auc = np.sum(auc_dict[classifier_method])/(K-1)
        print("avg int auc = "+str(avg_auc))

        avg_auprc = np.sum(auprc_dict[classifier_method])/(K-1)
        print("avg int auprc = "+str(avg_auprc))

        #Fit to all K-1 folds to predict 1st level on the heldout
        print("heldout fold #"+str(i+1))
        train_predict_auc(X_ext_train, y_ext_train, X_heldout, y_heldout, downsample_method, pred_dict, pred_idx, auc_dict, auprc_dict, features_pred_dfs)

        #Save auc,auprc,prob to file
        pred_df = pd.DataFrame.from_dict(pred_dict)
        auc_df = pd.DataFrame.from_dict(auc_dict)
        auprc_df = pd.DataFrame.from_dict(auprc_dict)
        #The file names use the ligand_name argument (they used to read the notebook-level `ligand`,
        #which names the wrong files whenever the two differ)
        #NOTE(review): the "_0.1" suffix presumably stands for the binding threshold - confirm
        fold_prefix = curr_dir[0]+"/1st_level_pred/"+str(i+1)+"/"+ligand_name+"_"+classifier_method+"_fold"+str(i+1)
        pred_df.to_csv(fold_prefix+"_0.1.csv", sep='\t')
        auc_df.to_csv(fold_prefix+"_0.1_auc.csv", sep='\t')
        auprc_df.to_csv(fold_prefix+"_0.1_auprc.csv", sep='\t')

        #Save 1st level predictions to file, one file for each heldout
        combine_features_predictions(ligand_name, features_pred_dfs, pred_df, i)

        print("Finished ext fold #"+str(i+1))

### Test model functions

In [19]:
%%time

#Run the 1st level predictions for the ligand, downsampler and classifier that were read from the environment
test_model(ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, downsample_method)

print("Finished ligand "+ligand)

ext fold #1
int fold #1
int fold #2
AUC = 0.671632545932
AUPRC = 0.0826057737086
int fold #3
AUC = 0.65811023622
AUPRC = 0.0672875065179
int fold #4
AUC = 0.598572178478
AUPRC = 0.042870876984
int fold #5
AUC = 0.545742782152
AUPRC = 0.0169680692455
int fold #6
AUC = 0.625954318719
AUPRC = 0.0437931227421
int fold #7
AUC = 0.502614859543
AUPRC = 0.0195409413828
int fold #8
AUC = 0.538241008139
AUPRC = 0.0139406273237
int fold #9
AUC = 0.652118666317
AUPRC = 0.080789884169
int fold #10
AUC = 0.641008138619
AUPRC = 0.0421641357006
avg int auc = 0.60377719268
avg int auprc = 0.0455512153083
heldout fold #1
AUC = 0.486542123411
AUPRC = 0.0365969839255
Finished ext fold #1
ext fold #2
int fold #1
AUC = 0.632051875868
AUPRC = 0.0656197529583
int fold #2
int fold #3
AUC = 0.767805774278
AUPRC = 0.101551066903
int fold #4
AUC = 0.603044619423
AUPRC = 0.0435280392152
int fold #5
AUC = 0.484078740157
AUPRC = 0.0157521266676
int fold #6
AUC = 0.612354948805
AUPRC = 0.0408745120283
int fold #7
AUC

KeyboardInterrupt: 