In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

#Import utils functions
#Use os.getcwd() (imported at the top of this cell) rather than the IPython-only
#`!pwd` shell escape. curr_dir stays a one-element list so that the
#`curr_dir[0]` lookups used throughout the notebook keep working.
curr_dir = [getcwd()]
sys.path.append(curr_dir[0]+"/utils")
from neg_pos_funcs import create_negatives_datasets, create_positives_datasets
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative

from IPython.core.display import HTML
from IPython.display import display
#A bare expression is rendered only when it is the last statement of a cell,
#so the style override has to be displayed explicitly to take effect.
display(HTML("<style>.container { width:100% !important; }</style>"))

#Run configuration: the three flags are passed to create_negatives_datasets
ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False
FILTER_MAX_SCORE_ZERO = False
#Sub-directory name used when building the results output path
out_dir = "mediode_NegLigand_NoFilter"

### Reading the input dataset

In [2]:
#Working directory via os.getcwd() (imported in the first cell) instead of the
#IPython-only `!pwd`; kept as a one-element list so `curr_dir[0]` still works.
curr_dir = [getcwd()]
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_05.11.18.csv"

bind_scores_num = 10
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]

#Features table (tab-separated, first column is used as the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = features_all.columns.tolist()
#removing binding scores and domain name
#(list.remove raises ValueError if an expected column is missing)
for ligand in ligands:
    score_str = ligand+"_binding_score"
    features_cols.remove(score_str)
features_cols.remove("max_binding_score")
features_cols.remove("domain_name")


print("all samples positions #: "+str(features_all.shape[0]))

#NOTE: pickle.load can execute arbitrary code - only load files you trust.

#ligand binding domains dictionary
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
    negatives_dict = pickle.load(handle)

#CV splits dictionary
with open(curr_dir[0]+"/CV_splits/domain_10_splits_dict.pik", 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 38944


#### Dataset of negative examples

In [3]:
#Negative (non-binding) examples; the result is looked up by ligand name later on
ligands_negatives_df = create_negatives_datasets(
    FILTER_DOMAIN,
    ABSOLUTE_NEGATIVES,
    FILTER_MAX_SCORE_ZERO,
    features_all,
    features_cols,
)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191


#### Datasets of positive examples by ligand

In [4]:
#Binding-score threshold handed to create_positives_datasets
bind_th = 0.1
#Positive examples; the result is looked up by ligand name later on
ligands_features_df = create_positives_datasets(
    bind_th,
    features_all,
    features_cols,
)

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


### Reading env input for ligand, CV fold and classifier

In [5]:
#Each setting is read from the environment with an explicit default.
#environ.get replaces the previous bare `except:` blocks, which would also
#have hidden unrelated errors (e.g. KeyboardInterrupt).

#Reading the ligand input
ligand = environ.get('ligand', "dna")
print("ligand = "+ligand)

#Reading the CV fold input (kept as a string; converted with int() where used)
fold = environ.get('fold', "1")
print("fold = "+fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

ligand = dna
fold = 1
classifier_method = XGB


### Models tested (and their hyper-parameters)

In [6]:
#Only the classifier selected by classifier_method is instantiated
classifiers = {}
if (classifier_method == "XGB"):
    #The negatives-to-positives ratio of the chosen ligand is passed as scale_pos_weight
    ligand_pos = ligands_features_df[ligand].shape[0]
    ligand_neg = ligands_negatives_df[ligand].shape[0]
    scale_weight = ligand_neg/float(ligand_pos)
    classifiers["XGB"] = XGBClassifier(n_estimators=1000, n_jobs=-1, random_state=0, max_depth=6, min_child_weight=0.05, colsample_bytree=0.5, scale_pos_weight=scale_weight)
elif (classifier_method == "RF"):
    classifiers["RF"] = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)
elif (classifier_method == "Logistic"):
    classifiers["Logistic"] = LogisticRegression(C=0.1, random_state=0, n_jobs=-1)
elif (classifier_method == "KNN"):
    classifiers["KNN"] = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
elif (classifier_method == "ADA"):
    classifiers["ADA"] = AdaBoostClassifier(n_estimators=1000, random_state=0)
elif (classifier_method == "SVM"):
    classifiers["SVM"] = SVC(kernel='rbf', probability=True, random_state=0)
else:
    #Fail here with a clear message instead of a KeyError on the later
    #classifiers[classifier_method] lookup
    raise ValueError("Unknown classifier_method: "+str(classifier_method)+" (expected one of XGB, RF, Logistic, KNN, ADA, SVM)")

In [14]:
def compute_per_domain_auc(y_test, pred_probs):
    """
    Compute the average per-domain AUC and AUPRC for the test set.

    y_test: DataFrame with a "label" column (1 marks a binding position).
            Each index entry is a string whose part before the last
            underscore is taken as the domain name.
    pred_probs: predicted scores, one per row of y_test, in the same order.

    Domains whose labels contain a single class (no binding positions, or
    only binding positions) are skipped: ROC AUC is undefined for them and
    roc_auc_score raises ValueError on such input.

    Returns the tuple (domain_auc_mean, domain_auprc_mean). Both values are
    NaN when no domain could be scored. y_test itself is not modified.
    """

    scored = y_test.copy(deep=True)
    scored["pred_probs"] = pred_probs
    scored["domain_name"] = [x[:x.rfind("_")] for x in scored.index]

    domain_auc_list = []
    domain_auprc_list = []

    #sort=False keeps the domains in order of first appearance
    for _, domain_df in scored.groupby("domain_name", sort=False):

        #Both classes are required for a meaningful per-domain score
        if domain_df["label"].nunique() < 2:
            continue

        #Compute domain AUC
        domain_auc_list.append(roc_auc_score(domain_df["label"], domain_df["pred_probs"]))
        #Compute domain AUPRC
        precision, recall, _ = precision_recall_curve(domain_df["label"], domain_df["pred_probs"])
        domain_auprc_list.append(auc(recall, precision))

    #Explicit NaN avoids numpy's "mean of empty slice" warning
    if not domain_auc_list:
        return (np.nan, np.nan)

    #Compute the means for the lists
    return (np.mean(domain_auc_list), np.mean(domain_auprc_list))

In [17]:
def test_model_iterative_fixed(pred_dict, auc_dict, auprc_dict, domain_auc_dict, domain_auprc_dict, ligand_bind_features, ligand_negatives_features, ligand_name, features=None):
    """
    Train and evaluate the selected classifier on a single cross-validation fold.

    The classifier (classifier_method / classifiers), the fold number (fold)
    and the CV splits (splits_dict) are read from notebook-level globals.

    pred_dict: dict of lists; "obs", "prob", "fold" and "model" are extended
               with one entry per test sample.
    auc_dict, auprc_dict: dicts of lists keyed by classifier name; the AUC and
               AUPRC of this fold are appended.
    domain_auc_dict, domain_auprc_dict: same, for the per-domain means
               returned by compute_per_domain_auc.
    ligand_bind_features: DataFrame of the positive samples (labelled 1).
    ligand_negatives_features: DataFrame of the negative samples (labelled 0).
    ligand_name: ligand name, used in the progress message.
    features: optional column selector handed to .iloc; None or an empty
              sequence (the default) keeps every column.

    Returns a dict keyed by the entries of classifiers; the selected
    classifier maps to a copy of the test-fold features table.
    """

    #Default: Exclude no features
    if features is None or len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    models_req_scaling = ["SVM", "KNN", "Logistic"]

    classifier = classifier_method
    model = classifiers[classifier]

    #One slot per configured model for the test-fold features table
    features_pred_dfs = dict.fromkeys(classifiers.keys())

    #Create X and y with included features (positives first, then negatives)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])
    y_df = pd.DataFrame({"label": y}, index=X.index)

    #Get the fold indices; fold is 1-based, cv_idx is 0-based
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    k = (int(fold)-1)

    pred_idx = k+1
    print("fold #: "+str(pred_idx))
    test_index = cv_idx[k]["test"]
    train_index = cv_idx[k]["train"]
    X_train, X_test = X.loc[train_index,:], X.loc[test_index,:]
    y_train, y_test = y_df.loc[train_index,:], y_df.loc[test_index,:]

    if (classifier in models_req_scaling):
        #Fit the scaler on the training data only, then apply it to both
        #parts, keeping the original row labels and feature names
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
        X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    #fit to training data (no down-sampling is applied)
    model.fit(X_train, y_train["label"])

    #Column 1 of predict_proba holds the positive-class probability
    probs = model.predict_proba(X_test)
    pos_probs = probs[:, 1]
    n_test = len(pos_probs)

    pred_dict["obs"].extend(y_test["label"])
    pred_dict["prob"].extend(list(pos_probs))
    pred_dict["fold"].extend([pred_idx] * n_test)
    pred_dict["model"].extend([classifier] * n_test)

    #Update auc auprc dictionaries
    auc_dict[classifier].append(roc_auc_score(y_test["label"], pos_probs))
    precision, recall, _ = precision_recall_curve(y_test["label"], pos_probs)
    auprc_dict[classifier].append(auc(recall, precision))

    #Per domain AUC and AUPRC
    (domain_auc_mean, domain_auprc_mean) = compute_per_domain_auc(y_test, pos_probs)
    domain_auc_dict[classifier].append(domain_auc_mean)
    domain_auprc_dict[classifier].append(domain_auprc_mean)

    #Update features table. A copy replaces DataFrame.append on an empty
    #frame, which gave the same table but no longer exists in pandas >= 2.0
    features_pred_dfs[classifier] = X_test.copy()

    print("AUC = "+str(auc_dict[classifier][-1]))
    print("AUPRC = "+str(auprc_dict[classifier][-1]))
    print("domain AUC = "+str(domain_auc_dict[classifier][-1]))
    print("domain AUPRC = "+str(domain_auprc_dict[classifier][-1]))

    #Report the ligand that was passed in, not the notebook-level global
    print("Finished "+ligand_name+" "+classifier+" fold: "+fold)

    return features_pred_dfs

### Test model functions

In [9]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Join the features table of the selected classifier with its prediction
    rows and save the result as a tab-separated file.

    ligand: ligand name, used in the output file name.
    ordered_features: dict mapping a classifier name to its features DataFrame.
    pred_df: predictions DataFrame with a "model" column; it is not modified.

    Only the classifier named by the global classifier_method is processed,
    and nothing is written when the global classifiers dict is empty. The
    output path is built from the globals curr_dir, out_dir and
    downsample_method.
    """

    #Nothing to combine when no classifier was set up
    if not classifiers:
        return

    classifier = classifier_method

    #Boolean-mask selection already returns a new frame; copy() makes that
    #explicit so the re-indexing below cannot touch pred_df
    model_pred = pred_df[pred_df["model"] == classifier].copy()
    model_pred.index = ordered_features[classifier].index

    #Creating the combined table
    features_pred = pd.concat([ordered_features[classifier], model_pred], axis=1)

    #Saving
    features_pred.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/05.11.2018_domain_CV/features_pred_tables/"+ligand+"_"+classifier+"_features_pred.csv", sep='\t')

#### Predict for each ligand separately

In [18]:
%%time

#Initialize dictionary
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
domain_auc_dict = defaultdict(list)
domain_auprc_dict = defaultdict(list)
downsample_method = "NoDown"

ordered_features = test_model_iterative_fixed(pred_dict, auc_dict, auprc_dict, domain_auc_dict, domain_auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)
domain_auc_df = pd.DataFrame.from_dict(domain_auc_dict)
domain_auprc_df = pd.DataFrame.from_dict(domain_auprc_dict)

#Save to file
# pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/05.11.2018_domain_CV/per_fold/"+ligand+"_"+classifier_method+"_fold"+fold+"_10w.csv", sep='\t')
# auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/05.11.2018_domain_CV/per_fold/"+ligand+"_"+classifier_method+"_fold"+fold+"_10w_auc.csv", sep='\t')
# auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/05.11.2018_domain_CV/per_fold/"+ligand+"_"+classifier_method+"_fold"+fold+"_10w_auprc.csv", sep='\t')

#Combine features and pred results to a unified table
#combine_features_predictions(ligand, ordered_features, pred_df)

print "Finished ligand "+ligand

fold #: 1
AUC = 0.823137157107
AUPRC = 0.596506848661
domain AUC = 0.718986050564
domain APRUC = 0.457986428143
Finished dna XGB fold: 1
Finished ligand dna
CPU times: user 16min 43s, sys: 3.15 s, total: 16min 46s
Wall time: 2min 40s
