In [1]:
#%matplotlib inline
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, precision_score, fbeta_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Neural Net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Import utils functions
curr_dir = !pwd
sys.path.append(curr_dir[0]+"/utils")
from prop_threshold_funcs import create_negatives_datasets, create_positives_datasets, create_positives_datasets_combined, create_negatives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
from generate_models_dict import generate_models_dict

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
# Working directory, wrapped in a one-element list so that the `curr_dir[0]`
# lookups used throughout the notebook keep working. `getcwd` is pure Python
# (no shell call), unlike the `!pwd` magic.
curr_dir = [getcwd()]
pfam_version = "31"
datafile_date = "06.20.18"
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"

#flags for creating negatives
zero_prop = True
no_prop = True
all_ligands = False
prec_th = 0.25
folds_num = 5

#Features table (tab-separated, first column is the row index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)

print("all samples positions #: "+str(features_all.shape[0]))

#CV splits dictionary: the "combined" split file is used here.
#The previously used alternative was the per-threshold file
#"domain_<folds_num>_folds_<prec_th>_prec_dict.pik" in the same directory.
splits_path = curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_combined_dna0.5_rna0.25_ion0.75_prec_dict.pik"
# NOTE: pickle.load can execute arbitrary code; only load split files from a trusted source.
with open(splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 42535


#### Remove unimportant features

In [3]:
# The return value of remove_unimportant_features is not used; the length of
# `features_cols` is reported before and after the call to show its effect on
# the list that was passed in.
print("# of features before removal: "+str(len(features_cols)))
remove_unimportant_features(features_all, features_cols)
print("# of features after removal: "+str(len(features_cols)))

# of features before removal: 761
# of features after removal: 753


#### Dataset of negative examples

In [4]:
# Negative (non-binding) examples; the result is later looked up by ligand name.
ligands_negatives_df = create_negatives_datasets_combined(
    zero_prop,
    no_prop,
    features_all,
    features_cols,
    all_ligands
)

dna non-binding #:41680
dnabase non-binding #:42089
dnabackbone non-binding #:41689
dna combined non binding #: 41555
rna non-binding #:41613
rnabase non-binding #:41828
rnabackbone non-binding #:41619
rna combined non binding #: 41401
peptide non-binding #:38794
ion non-binding #:37525
metabolite non-binding #:37463
sm non-binding #:30978


#### Datasets of positive examples by ligand

In [5]:
# Positive (binding) examples; the result is later looked up by ligand name.
ligands_positives_df = create_positives_datasets_combined(
    features_all,
    features_cols,
    all_ligands
)

dna #: 239
dnabase #: 170
dnabackbone #: 244
dna combined #: 353
rna #: 360
rnabase #: 246
rnabackbone #: 346
rna combined #: 468
peptide #: 462
ion #: 350
metabolite #: 504
sm #: 708




### Reading env input for ligand, fold and classifier

In [7]:
# Each run parameter is read from an environment variable and falls back to a
# default when the variable is not set.

#Reading the ligand input
ligand = environ.get('ligand', "dna")
print("ligand = "+ligand)

#Reading the fold input (kept as a string; converted with int() where it is used)
fold = environ.get('fold', "5")
print("fold = "+fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "KNN")
print("classifier_method = "+classifier_method)

ligand = dna
fold = 5
classifier_method = KNN


In [8]:
# Model lookup table; it is indexed below as models_dict[classifier][ligand][fold].
models_dict = generate_models_dict(
    ligand,
    ligands,
    ligands_positives_df,
    ligands_negatives_df,
    folds_num
)

### Models tested (and their hyper-parameters)

In [24]:
# define the network with batch normalization
class Net(nn.Module):
    """Fully connected two-class network with batch normalization and dropout.

    Layer widths are: input -> hidden_units_1 -> hidden_units_2 ->
    hidden_units_2 -> hidden_units_1 -> 2 output logits.

    Parameters
    ----------
    hyperparameters : dict
        Required keys: "hidden_units_1", "hidden_units_2", "batch_size",
        "learning_rate", "beta" (first Adam beta), "weight_decay",
        "epoch_count".
        Optional key: "input_size" (number of input features). When it is
        absent the width is taken from the notebook-level `features_cols`.
    """

    def __init__(self, hyperparameters):
        super(Net, self).__init__()
        hidden_units_1 = hyperparameters["hidden_units_1"]
        hidden_units_2 = hyperparameters["hidden_units_2"]
        # Allow the input width to be given explicitly so the class does not
        # depend on a notebook global; fall back to the original behavior.
        if "input_size" in hyperparameters:
            input_size = hyperparameters["input_size"]
        else:
            input_size = len(features_cols)

        # Layers are created in a fixed order so seeded initialization is reproducible
        self.input = nn.Linear(input_size, hidden_units_1)
        self.hidden1 = nn.Linear(hidden_units_1, hidden_units_2)
        self.hidden1_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden2 = nn.Linear(hidden_units_2, hidden_units_2)
        self.hidden2_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden3 = nn.Linear(hidden_units_2, hidden_units_1)
        self.hidden3_bn = nn.BatchNorm1d(hidden_units_1)
        self.dropout = nn.Dropout(p=0.5)
        self.output = nn.Linear(hidden_units_1,2)

        # Training settings used by fit()
        self.batch_size = hyperparameters["batch_size"]
        self.learning_rate = hyperparameters["learning_rate"]
        self.beta = hyperparameters["beta"]
        self.weight_decay = hyperparameters["weight_decay"]
        self.epoch_count = hyperparameters["epoch_count"]

    def forward(self, x):
        """Return the raw two-column logits (no softmax) for a 2-D float tensor."""
        x = F.rrelu(self.input(x))
        x = self.dropout(F.rrelu(self.hidden1_bn(self.hidden1(x))))
        x = self.dropout(F.rrelu(self.hidden2_bn(self.hidden2(x))))
        x = self.dropout(F.rrelu(self.hidden3_bn(self.hidden3(x))))
        x = self.output(x)
        return x

    def fit(self, train_valid_data, train_valid_labels, weight):
        """Train the network with Adam on stratified mini-batches.

        Parameters
        ----------
        train_valid_data : pandas.DataFrame
            Feature rows.
        train_valid_labels : pandas.Series
            0/1 labels, aligned with `train_valid_data` by index.
        weight : torch.Tensor
            Passed to `nn.BCEWithLogitsLoss(weight=...)`; the loss is computed
            on one-hot encoded labels with two columns.

        Mini-batches are the test folds of a RepeatedStratifiedKFold with
        roughly `rows / batch_size` splits, repeated `epoch_count` times.
        Requires `shuffle` (sklearn.utils) and `RepeatedStratifiedKFold`
        (sklearn.model_selection) to be imported at notebook level.
        """
        # set random seed
        torch.manual_seed(0)
        # predict_proba() switches the module to eval mode; switch back so that
        # dropout and batch-norm statistics are active when (re)fitting.
        self.train()

        trainset = pd.concat([train_valid_data,train_valid_labels],axis=1)
        trainset = shuffle(trainset, random_state = 0)

        # the labels are the last column of the concatenated table
        train_valid_data = trainset.iloc[:, :-1]
        train_valid_labels = trainset.iloc[:, -1]

        # create loss function
        loss = nn.BCEWithLogitsLoss(weight = weight)

        BETA_2 = 0.999
        # Floor division keeps n_splits an integer on Python 3 as well;
        # the splitter needs at least two folds.
        n_batches = max(2, train_valid_data.shape[0] // self.batch_size)

        skf_2 = RepeatedStratifiedKFold(n_splits=n_batches, n_repeats=self.epoch_count, random_state=0)

        # create adam optimizer
        optimizer_2 = optim.Adam(self.parameters(), lr=self.learning_rate, betas=(self.beta, BETA_2),
                                 weight_decay=self.weight_decay)

        for _, batch_idx in skf_2.split(train_valid_data, train_valid_labels):
            # the splitter yields positional indices, hence .iloc for data and labels
            data = torch.Tensor(train_valid_data.iloc[batch_idx, :].values.astype(np.float32))
            # forward pass
            output = self.forward(data)

            # one-hot encode the 0/1 labels into two columns
            label_ids = train_valid_labels.iloc[batch_idx].values.astype(int)
            labels = torch.Tensor(np.eye(2)[label_ids])

            # zero the gradient buffers
            optimizer_2.zero_grad()
            # compute loss and gradients
            loss_output = loss(output, labels)
            loss_output.backward()
            # Does the update
            optimizer_2.step()

    #prediction probabilities array
    def predict_proba(self, X_test):
        """Return a list with the softmax probability of class 1 for each row of `X_test`.

        Puts the module in eval mode; `X_test` is a pandas.DataFrame of features.
        """
        self.eval()
        test = torch.Tensor(X_test.values.astype(np.float32))
        # inference only: no gradient tracking needed
        with torch.no_grad():
            probs = F.softmax(self.forward(test), dim=1)
        return probs[:, 1].tolist()

In [10]:
def compute_per_domain_auc(y_test, pred_probs, domain_pred_dict, pred_idx, classifier):
    """
    Compute per-domain AUC / AUPRC for the test set, and their means.

    Parameters:
        y_test: DataFrame with a "label" column (1 = binding, 0 = non-binding) whose
            index labels have the form "<domain name>_<position>".
        pred_probs: predicted probabilities, one per row of y_test.
        domain_pred_dict: dict of lists, extended in place (keys "obs", "prob",
            "fold", "model", "domain") with the rows of every evaluated domain.
        pred_idx: fold number recorded in domain_pred_dict["fold"].
        classifier: model name recorded in domain_pred_dict["model"].

    Domains that have no binding or no non-binding positions are skipped, since
    AUC / AUPRC are undefined for them.

    Returns a 9-tuple:
        (mean AUC, mean AUPRC, mean AUPRC/positive-fraction ratio,
         per-domain AUC list, per-domain AUPRC list, per-domain ratio list,
         domain names, positives count per domain, negatives count per domain)
    The three means are NaN when no domain could be evaluated.
    """
    scored = y_test.copy(deep=True)
    scored["pred_probs"] = pred_probs
    # Everything before the last "_" of the index label is the domain name
    scored["domain_name"] = [x[:x.rfind("_")] for x in scored.index]

    domain_auc_list = []
    domain_auprc_list = []
    domain_auprc_ratio_list = []
    domain_name_list = []
    domain_pos_num_list = []
    domain_neg_num_list = []

    # sort=False keeps the domains in order of first appearance
    for domain_name, domain_df in scored.groupby("domain_name", sort=False):

        #Count the binding and non-binding positions of this domain
        bind_num = int((domain_df["label"] == 1).sum())
        non_bind_num = int((domain_df["label"] == 0).sum())
        if (bind_num == 0 or non_bind_num == 0):
            #No positions of one of the classes "binding/non-binding" - skipping
            continue

        rows_num = domain_df.shape[0]
        domain_pred_dict["obs"].extend(domain_df["label"])
        domain_pred_dict["prob"].extend(domain_df["pred_probs"])
        domain_pred_dict["fold"].extend([pred_idx] * rows_num)
        domain_pred_dict["model"].extend([classifier] * rows_num)
        domain_pred_dict["domain"].extend([domain_name] * rows_num)

        #Add number of positives and number of negatives
        domain_pos_num_list.append(bind_num)
        domain_neg_num_list.append(non_bind_num)
        #Compute domain AUC
        domain_auc_list.append(roc_auc_score(domain_df["label"], domain_df["pred_probs"]))
        #Compute domain AUPRC
        precision, recall, _ = precision_recall_curve(domain_df["label"], domain_df["pred_probs"])
        domain_auprc = auc(recall, precision)
        domain_auprc_list.append(domain_auprc)
        #Ratio of AUPRC and the fraction of positives in the domain
        pos_frac = bind_num/float(rows_num)
        domain_auprc_ratio_list.append(domain_auprc/float(pos_frac))
        #Add domain name for AUC/AUPRC/Ratio tables
        domain_name_list.append(domain_name)

    #Compute the means for the lists (NaN, without a numpy warning, when empty)
    domain_auc_mean = np.mean(domain_auc_list) if domain_auc_list else np.nan
    domain_auprc_mean = np.mean(domain_auprc_list) if domain_auprc_list else np.nan
    domain_auprc_ratio_mean = np.mean(domain_auprc_ratio_list) if domain_auprc_ratio_list else np.nan

    return (domain_auc_mean, domain_auprc_mean, domain_auprc_ratio_mean, domain_auc_list, domain_auprc_list, domain_auprc_ratio_list, domain_name_list, domain_pos_num_list, domain_neg_num_list)

In [11]:
def area_under_precision_prob_curve(y_true, y_probs):
    """
    Area under the curve of (approximate) precision as a function of the
    probability threshold.

    For each threshold in a fixed list, predictions with probability >= threshold
    are called positive and scored with F-beta at beta=0.001, which is dominated
    by precision. Thresholds that produce no positive call are left out.

    Parameters:
        y_true: true 0/1 labels.
        y_probs: predicted probabilities of the positive class.

    Returns the value of sklearn's `auc` over the (threshold, score) points.
    """
    thresholds = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.01, 0.005, 0.001, 0]
    scores = np.asarray(y_probs)
    used_thresholds = []
    precision_vals = []

    for threshold in thresholds:
        binary_decision = (scores >= threshold).astype(int)
        if not binary_decision.any():
            # no positive prediction at this threshold - precision is undefined
            continue
        # beta is passed by keyword: newer scikit-learn versions reject it positionally
        precision_vals.append(fbeta_score(y_true, binary_decision, beta=0.001))
        used_thresholds.append(threshold)

    return auc(used_thresholds, precision_vals)

In [22]:
def test_model_iterative_fixed(pred_dict, domain_pred_dict, auc_dict, auprc_dict, domain_auc_mean_dict, domain_auprc_mean_dict, domain_auprc_ratio_mean_dict, domain_auc_dict, 
                               domain_auprc_dict, domain_auprc_ratio_dict, prec_prob_dict,
                               ligand_bind_features, ligand_negatives_features, ligand_name, features=[]):
    
    """
    Train and evaluate the selected classifier on a single cross-validation fold.

    Notebook-level settings read by this function: `classifier_method` (which model),
    `fold` (1-based fold number, as a string), `models_dict` (model lookup table),
    `splits_dict` (CV split definition) and, for the "NN" classifier only,
    `hyperparameters`.

    Parameters:
        pred_dict, domain_pred_dict: dicts of lists, extended in place with the
            per-position predictions (overall / only evaluated domains).
        auc_dict, auprc_dict: dicts of lists, one global score appended per call.
        domain_auc_mean_dict, domain_auprc_mean_dict, domain_auprc_ratio_mean_dict:
            dicts of lists, the per-domain means appended per call.
        domain_auc_dict, domain_auprc_dict, domain_auprc_ratio_dict: dicts of lists,
            extended with the individual per-domain scores, names and class counts.
        prec_prob_dict: currently unused.
        ligand_bind_features: DataFrame of positive examples.
        ligand_negatives_features: DataFrame of negative examples.
        ligand_name: ligand key used to pick the model from `models_dict`.
        features: optional boolean/positional column selector; empty means "all columns".
            (The default list is never modified.)

    Returns:
        (features_pred_dfs, model): a dict keyed by model name whose entry for the
        selected classifier is the test-set feature table, and the fitted model.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Features table of the test fold, for each model (only the selected one is filled)
    features_pred_dfs = dict.fromkeys(models_dict.keys())
    
    models_req_scaling = ["SVM", "KNN", "Logistic", "NN"]

    classifier = classifier_method

    #Create X and y with included features: positives first, then negatives
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])
    y_df = pd.DataFrame({"label": y}, index=X.index)
    
    #Get the fold indices (`fold` is 1-based, cv_idx is 0-based)
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    pred_idx = int(fold)
    k = pred_idx - 1
    
    print("fold #: "+str(pred_idx))
    test_index = cv_idx[k]["test"]
    train_index = cv_idx[k]["train"]
    X_train, X_test = X.loc[train_index,:], X.loc[test_index,:]
    y_train, y_test = y_df.loc[train_index,:], y_df.loc[test_index,:]
    
    if (classifier in models_req_scaling):
        cols = X_train.columns
        scaler = StandardScaler() 
        #scale only using the training data
        scaler.fit(X_train) 
        X_train = pd.DataFrame(scaler.transform(X_train))
        # apply same transformation to test data
        X_test = pd.DataFrame(scaler.transform(X_test))
        #Restoring indices after scaling
        X_train.index = train_index 
        X_test.index = test_index 
        #Restoring features names
        X_train.columns = cols
        X_test.columns = cols

    #No down-sampling
    X_train_sampled = X_train
    y_train_sampled = y_train

    #fit to training data
    model = models_dict[classifier][ligand_name][pred_idx]
    if classifier == "NN":
        # NOTE(review): `hyperparameters` is not defined in this function; it must
        # exist at notebook level when the NN classifier is selected - confirm its source.
        weight_mode = hyperparameters["weight"]
        # Class counts of the training fold, used for the "balanced" weighting
        no_pos = int((y_train_sampled["label"] == 1).sum())
        no_neg = int((y_train_sampled["label"] == 0).sum())
        #weight vector for NN: [negative class, positive class]
        if weight_mode == "balanced":
            neg_weight = float(no_pos) / float(no_neg + no_pos)
            pos_weight = 1 - neg_weight
        elif weight_mode == "0.1":
            neg_weight = 10
            pos_weight = 1
        elif weight_mode == "None":
            neg_weight = 1
            pos_weight = 1
        else:
            raise ValueError("Unknown NN weight setting: "+str(weight_mode))

        weight = torch.Tensor([neg_weight, pos_weight])
        model.fit(X_train_sampled, y_train_sampled["label"], weight)
        # the NN returns the positive-class probabilities directly
        pos_probs = np.asarray(model.predict_proba(X_test))
    else:
        model.fit(X_train_sampled, y_train_sampled["label"])
        # second column holds the positive-class probability
        pos_probs = np.asarray(model.predict_proba(X_test))[:, 1]

    probs_list = list(pos_probs)
    pred_dict["obs"].extend(y_test["label"])
    pred_dict["prob"].extend(probs_list)
    pred_dict["fold"].extend([pred_idx] * len(probs_list))
    pred_dict["model"].extend([classifier] * len(probs_list))
    
    #Adding the position number to the table to help with analysis
    pred_dict["idx"].extend(test_index)

    #Update auc auprc dictionaries
    auc_dict[classifier].append(roc_auc_score(y_test["label"], pos_probs))
    precision, recall, _ = precision_recall_curve(y_test["label"], pos_probs)
    auprc_dict[classifier].append(auc(recall, precision))
    #prec_prob_dict[classifier].append(area_under_precision_prob_curve(y_test["label"], pos_probs))
    
    #Compute per domain AUC and AUPRC
    (domain_auc_mean, domain_auprc_mean, domain_auprc_ratio_mean, domain_auc_list, domain_auprc_list, domain_auprc_ratio_list, domain_name_list, domain_pos_num_list, domain_neg_num_list) = compute_per_domain_auc(y_test, pos_probs, domain_pred_dict, pred_idx, classifier)
    
    #Update relevant dictionaries for per-domain folds mean
    domain_auc_mean_dict[classifier].append(domain_auc_mean)
    domain_auprc_mean_dict[classifier].append(domain_auprc_mean)
    domain_auprc_ratio_mean_dict[classifier].append(domain_auprc_ratio_mean)
    
    #Update relevant dictionaries for per-domain individual metrics scores
    for scores_dict, scores_list in ((domain_auc_dict, domain_auc_list),
                                     (domain_auprc_dict, domain_auprc_list),
                                     (domain_auprc_ratio_dict, domain_auprc_ratio_list)):
        scores_dict[classifier].extend(scores_list)
        scores_dict["domain"].extend(domain_name_list)
        scores_dict["pos_num"].extend(domain_pos_num_list)
        scores_dict["neg_num"].extend(domain_neg_num_list)
    
    #Update features table (a single fold is evaluated, so this is just the test rows)
    features_pred_dfs[classifier] = X_test.copy()

    print("AUC = "+str(auc_dict[classifier][-1]))
    print("AUPRC = "+str(auprc_dict[classifier][-1]))
    #print("AU prec prob = "+str(prec_prob_dict[classifier][-1]))
    print("domain AUC mean = "+str(domain_auc_mean_dict[classifier][-1]))
    print("domain AUPRC mean = "+str(domain_auprc_mean_dict[classifier][-1]))
    print("domain AUPRC ratio mean = "+str(domain_auprc_ratio_mean_dict[classifier][-1]))

    print("Finished "+ligand_name+" "+classifier+" fold: "+str(fold))
    
    return (features_pred_dfs, model)

### Test model functions

In [17]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Join the feature rows of the selected classifier (`classifier_method`) with its
    prediction rows and save the combined table as a tab-separated file.

    Parameters:
        ligand: ligand name, used in the output file name.
        ordered_features: dict keyed by model name holding the features table.
        pred_df: predictions table with a "model" column.

    Nothing is written when `models_dict` has no entries.
    """
    predictions = pred_df.copy(deep=True)

    # Same effect as looping once over the models and stopping after the first pass
    if len(models_dict.keys()) == 0:
        return

    classifier = classifier_method
    classifier_features = ordered_features[classifier]

    # Prediction rows of this model, re-indexed to line up with the features table
    model_pred = predictions[predictions["model"] == classifier]
    model_pred.index = classifier_features.index

    #Creating the combined table
    features_pred = pd.concat([classifier_features, model_pred], axis=1)

    #Saving
    out_path = curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/"+datafile_date+"_domain_CV/features_pred_tables/"+ligand+"_"+classifier+"_features_pred.csv"
    features_pred.to_csv(out_path, sep='\t')

#### Predict for each ligand separately

In [23]:
%%time

#Initialize the result dictionaries (filled in place by test_model_iterative_fixed)
pred_dict = defaultdict(list)
domain_pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)
domain_auc_mean_dict = defaultdict(list)
domain_auprc_mean_dict = defaultdict(list)
domain_auprc_ratio_mean_dict = defaultdict(list)
domain_auc_dict = defaultdict(list)
domain_auprc_dict = defaultdict(list)
domain_auprc_ratio_dict = defaultdict(list)
prec_prob_dict = defaultdict(list)
downsample_method = "NoDown"


(ordered_features, model) = test_model_iterative_fixed(pred_dict, domain_pred_dict, auc_dict, auprc_dict, domain_auc_mean_dict, domain_auprc_mean_dict, domain_auprc_ratio_mean_dict, 
                                                       domain_auc_dict, domain_auprc_dict, domain_auprc_ratio_dict, prec_prob_dict, ligands_positives_df[ligand], 
                                                       ligands_negatives_df[ligand], ligand)

pred_df = pd.DataFrame.from_dict(pred_dict)
domain_pred_df = pd.DataFrame.from_dict(domain_pred_dict)
#global metrics dfs
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)
#per domain mean dfs
domain_auc_mean_df = pd.DataFrame.from_dict(domain_auc_mean_dict)
domain_auprc_mean_df = pd.DataFrame.from_dict(domain_auprc_mean_dict)
domain_auprc_ratio_mean_df = pd.DataFrame.from_dict(domain_auprc_ratio_mean_dict)
#per domain dfs
domain_auc_df = pd.DataFrame.from_dict(domain_auc_dict)
domain_auprc_df = pd.DataFrame.from_dict(domain_auprc_dict)
domain_auprc_ratio_df= pd.DataFrame.from_dict(domain_auprc_ratio_dict)

#Save to file
out_dirname = "comb_dna0.5_rna0.25_ion0.75_tuned"

# Common parts of every output path
results_root = curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/"
file_prefix = ligand+"_"+classifier_method+"_fold"+fold+"_"+str(folds_num)+"w"

# Every table is written to both run directories, in this order.
# NOTE(review): the second directory is named after `prec_th` although the CV
# splits loaded above are the "combined" ones - confirm this copy is still wanted.
run_dirs = [
    datafile_date+"_"+str(folds_num)+"f_"+out_dirname,
    datafile_date+"_"+str(folds_num)+"f_"+str(prec_th)+"p",
]

# (table, file name suffix) pairs
result_tables = [
    (pred_df, ""),
    (domain_pred_df, "_d"),
    (auc_df, "_auc"),
    (auprc_df, "_auprc"),
    (domain_auc_mean_df, "_dm_auc"),
    (domain_auprc_mean_df, "_dm_auprc"),
    (domain_auprc_ratio_mean_df, "_dm_auprc_ratio"),
    (domain_auc_df, "_d_auc"),
    (domain_auprc_df, "_d_auprc"),
    (domain_auprc_ratio_df, "_d_auprc_ratio"),
]

for run_dir in run_dirs:
    for result_table, suffix in result_tables:
        result_table.to_csv(results_root+run_dir+"/per_fold/"+file_prefix+suffix+".csv", sep='\t')

#Combine features and pred results to a unified table
#combine_features_predictions(ligand, ordered_features, pred_df)

print("Finished ligand "+ligand)

fold #: 5
AUC = 0.8882790671404037
AUPRC = 0.30978649611448733
domain AUC mean = 0.7653679706470694
domain AUPRC mean = 0.4848963770646927
domain AUPRC ratio mean = 3.4120769086560543
Finished dna KNN fold: 5
Finished ligand dna
CPU times: user 19min 11s, sys: 1.69 s, total: 19min 13s
Wall time: 1min 41s
