In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve, precision_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Neural Net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Import utils functions
# Working directory of the kernel, kept as a one-element list because the rest
# of the notebook reads it as curr_dir[0]. getcwd() (imported above from os)
# replaces the `!pwd` shell escape, which only works inside IPython with a
# POSIX shell available.
curr_dir = [getcwd()]

# Make the project-local helper modules under ./utils importable.
sys.path.append(curr_dir[0]+"/utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
# NOTE(review): wildcard import; the notebook uses the generate_trials_* functions
# from this module. Kept as-is because other names may be relied on elsewhere.
from generate_hyperparameter_trials import *

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
# Working directory of the kernel as a one-element list (the notebook reads
# curr_dir[0] everywhere). getcwd() avoids the IPython-only `!pwd` shell escape.
curr_dir = [getcwd()]
pfam_version = "31"
datafile_date = "06.20.18"
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"

#flags for creating negatives
zero_prop = True
no_prop = True
all_ligands = False
prec_th = 0.25
# Number of cross-validation folds; also part of the CV splits file name below.
folds_num = 5

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
# NOTE(review): the return value is ignored, so this presumably edits
# features_all / features_cols in place -- confirm in prediction_general_funcs.
remove_unimportant_features(features_all, features_cols)

# Single parenthesised argument: prints the same text on Python 2 and Python 3.
print("all samples positions #: "+str(features_all.shape[0]))

#CV splits dictionary
# pickle.load can execute arbitrary code; only point this at a splits file
# produced by this project.
cv_splits_path = curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_"+str(prec_th)+"_prec_dict.pik"
with open(cv_splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 42535


#### Dataset of negative examples

In [3]:
# Build the negative-example tables from the full features table, controlled by the
# zero_prop / no_prop / all_ligands flags set in the data-loading cell.
# The result is looked up by ligand name further down (ligands_negatives_df[ligand]).
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:41680
dnabase non-binding #:42089
dnabackbone non-binding #:41689
dna combined non binding #: 41555
rna non-binding #:41613
rnabase non-binding #:41828
rnabackbone non-binding #:41619
rna combined non binding #: 41401
peptide non-binding #:38794
ion non-binding #:37525
metabolite non-binding #:37463
sm non-binding #:30978


#### Datasets of positive examples by ligand

In [4]:
# Build the positive-example tables from the full features table.
# The result is looked up by ligand name further down (ligands_positives_df[ligand]).
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 239
dnabase #: 170
dnabackbone #: 244
dna combined #: 353
rna #: 360
rnabase #: 246
rnabackbone #: 346
rna combined #: 468
peptide #: 462
ion #: 350
metabolite #: 504
sm #: 708




### Reading env input for ligand, fold and classifier

In [20]:
# Run configuration is taken from environment variables so the notebook can be
# driven from a batch script; every setting falls back to a default.
#
# For the numeric search bounds, each classifier's settings are read as a group:
# if ANY variable of the group is missing (KeyError) or cannot be parsed
# (ValueError), the whole group falls back to its defaults, so user-supplied and
# default bounds are never mixed. Only those two exceptions are caught; a bare
# `except:` would also swallow KeyboardInterrupt/SystemExit and real bugs.

#Reading the ligand input
ligand = environ.get('ligand', "sm")
print("ligand = "+ligand)

#Reading the fold input (kept as a string; converted with int() where it is used)
fold = environ.get('fold', "3")
print("fold = "+fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "SVM")
print("classifier_method = "+classifier_method)

# Upper (_ub) and lower (_lb) bounds of the hyperparameter search ranges,
# for the selected classifier only.
if classifier_method == "NN":
    try:
        learning_rate_ub = int(environ['learning_rate_ub'])
        learning_rate_lb = int(environ['learning_rate_lb'])
        batch_size_ub = int(environ['batch_size_ub'])
        batch_size_lb = int(environ['batch_size_lb'])
        weight_decay_ub = int(environ['weight_decay_ub'])
        weight_decay_lb = int(environ['weight_decay_lb'])
        beta_ub = float(environ['beta_ub'])
        beta_lb = float(environ['beta_lb'])
        hidden_units_1_ub = int(environ['hidden_units_1_ub'])
        hidden_units_1_lb = int(environ['hidden_units_1_lb'])
        hidden_units_2_ub = int(environ['hidden_units_2_ub'])
        hidden_units_2_lb = int(environ['hidden_units_2_lb'])
    except (KeyError, ValueError):
        learning_rate_ub = -2
        learning_rate_lb = -3
        batch_size_ub = 150
        batch_size_lb = 30
        weight_decay_ub = -7
        weight_decay_lb = -17
        beta_ub = 0.95
        beta_lb = 0.85
        hidden_units_1_ub = 300
        hidden_units_1_lb = 50
        hidden_units_2_ub = 800
        hidden_units_2_lb = 350

elif classifier_method == "XGB":
    try:
        n_estimators_ub = int(environ["xgb_n_estimators_ub"])
        n_estimators_lb = int(environ["xgb_n_estimators_lb"])
        max_depth_ub = int(environ["max_depth_ub"])
        max_depth_lb = int(environ["max_depth_lb"])
        min_child_weight_ub = int(environ["min_child_weight_ub"])
        min_child_weight_lb = int(environ["min_child_weight_lb"])
        colsample_bytree_ub = float(environ["colsample_bytree_ub"])
        colsample_bytree_lb = float(environ["colsample_bytree_lb"])
    except (KeyError, ValueError):
        n_estimators_ub = 3
        n_estimators_lb = 1
        max_depth_ub = 5
        max_depth_lb = 2
        min_child_weight_ub = 6
        min_child_weight_lb = 2
        colsample_bytree_ub = 0.6
        colsample_bytree_lb = 0.5

elif classifier_method == "RF":
    try:
        n_estimators_ub = int(environ["rf_n_estimators_ub"])
        n_estimators_lb = int(environ["rf_n_estimators_lb"])
    except (KeyError, ValueError):
        n_estimators_ub = 5
        n_estimators_lb = 2

elif classifier_method == "Logistic":
    try:
        C_ub = int(environ["log_C_ub"])
        C_lb = int(environ["log_C_lb"])
    except (KeyError, ValueError):
        C_ub = 3
        C_lb = 1

elif classifier_method == "KNN":
    try:
        n_neighbors_ub = int(environ["n_neighbors_ub"])
        n_neighbors_lb = int(environ["n_neighbors_lb"])
    except (KeyError, ValueError):
        n_neighbors_ub = 100
        n_neighbors_lb = 5

elif classifier_method == "ADA":
    try:
        n_estimators_ub = int(environ["ada_n_estimators_ub"])
        n_estimators_lb = int(environ["ada_n_estimators_lb"])
    except (KeyError, ValueError):
        n_estimators_ub = 6
        n_estimators_lb = 3

elif classifier_method == "SVM":
    try:
        C_ub = int(environ["sv_C_ub"])
        C_lb = int(environ["sv_C_lb"])
        gamma_ub = int(environ["gamma_ub"])
        gamma_lb = int(environ["gamma_lb"])
    except (KeyError, ValueError):
        C_ub = 4
        C_lb = 2
        gamma_ub = -4
        gamma_lb = -6



        



ligand = sm
fold = 3
classifier_method = SVM


### Generate hyperparameter trials

Choose the hyperparameters to tune and generate trial values for them by random search, as explained in this video: https://www.youtube.com/watch?v=WrICwRrvuIc&index=66&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Use a logarithmic scale when searching over the learning rate and weight decay for the NN, as explained in this video: https://www.youtube.com/watch?v=VUbrW8OK3uo&index=67&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

Utilize nested cross validation to choose between models, as described here: https://stats.stackexchange.com/questions/266225/step-by-step-explanation-of-k-fold-cross-validation-with-grid-search-to-optimise/266229

In [21]:
# Number of hyperparameter combinations to draw for the selected classifier.
no_trials = 1

# Draw the trials with the generator matching classifier_method. Each generator
# receives the search bounds read from the environment in the previous cell and,
# judging by the printed output, returns a list with one dict per trial.
if classifier_method == "NN":
    hyperparameter_trials = generate_trials_NN(no_trials, learning_rate_ub, learning_rate_lb, batch_size_ub, batch_size_lb,weight_decay_ub,
                                               weight_decay_lb, beta_ub, beta_lb, hidden_units_1_ub, hidden_units_1_lb, hidden_units_2_ub,
                                               hidden_units_2_lb)
elif classifier_method == "XGB":
    hyperparameter_trials = generate_trials_XGB(no_trials, n_estimators_ub, n_estimators_lb,
                                                max_depth_ub, max_depth_lb, min_child_weight_ub,
                                                min_child_weight_lb, colsample_bytree_ub, colsample_bytree_lb)
elif classifier_method == "RF":
    hyperparameter_trials = generate_trials_RF(no_trials, n_estimators_ub, n_estimators_lb)
elif classifier_method == "Logistic":
    hyperparameter_trials = generate_trials_Log(no_trials, C_ub, C_lb)
elif classifier_method == "KNN":
    hyperparameter_trials = generate_trials_KNN(no_trials, n_neighbors_ub, n_neighbors_lb)
elif classifier_method == "ADA":
    hyperparameter_trials = generate_trials_ADA(no_trials, n_estimators_ub, n_estimators_lb)
elif classifier_method == "SVM":
    hyperparameter_trials = generate_trials_SVM(no_trials, C_ub, C_lb, gamma_ub, gamma_lb)
else:
    # Without this branch an unknown name would leave hyperparameter_trials
    # undefined -- or, in a live kernel, silently reuse the trials of an earlier run.
    raise ValueError("Unknown classifier_method: "+str(classifier_method))

print(hyperparameter_trials)



[{'C': 1252.0653814999462, 'gamma': 2.693883019285411e-05}]


### Define the Network

Tutorial for Neural Net Architecture: https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

Utilize batch normalization, as explained here: https://www.youtube.com/watch?v=fv1Luwd-LOI&index=69&list=PLBAGcD3siRDguyYYzhVwZ3tLvOyyG5k6K

In [22]:
# define the network with batch normalization
class Net(nn.Module):
    """
    Fully connected network producing two output logits per sample; predict_proba
    reports the softmax probability of output index 1.

    Layer sizes: len(features_cols) -> hidden_units_1 -> hidden_units_2 ->
    hidden_units_2 -> hidden_units_1 -> 2. Every layer except the output is
    followed by ReLU, and the three hidden layers are batch-normalized.

    hyperparameters: dict with the keys "hidden_units_1", "hidden_units_2",
    "batch_size", "learning_rate", "beta" and "weight_decay".
    """
    def __init__(self, hyperparameters):
        hidden_units_1 = hyperparameters["hidden_units_1"]
        hidden_units_2 = hyperparameters["hidden_units_2"]
        super(Net, self).__init__()
        # input size is the number of feature columns (module-level features_cols)
        self.input = nn.Linear(len(features_cols), hidden_units_1)
        self.hidden1 = nn.Linear(hidden_units_1, hidden_units_2)
        self.hidden1_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden2 = nn.Linear(hidden_units_2, hidden_units_2)
        self.hidden2_bn = nn.BatchNorm1d(hidden_units_2)
        self.hidden3 = nn.Linear(hidden_units_2, hidden_units_1)
        self.hidden3_bn = nn.BatchNorm1d(hidden_units_1)
        self.output = nn.Linear(hidden_units_1,2)
        # training settings, consumed by fit()
        self.batch_size = hyperparameters["batch_size"]
        self.learning_rate = hyperparameters["learning_rate"]
        self.beta = hyperparameters["beta"]
        self.weight_decay = hyperparameters["weight_decay"]

    def forward(self, x):
        """Return the raw (pre-softmax) two-column output for the batch x."""
        x = F.relu(self.input(x))
        x = F.relu(self.hidden1_bn(self.hidden1(x)))
        x = F.relu(self.hidden2_bn(self.hidden2(x)))
        x = F.relu(self.hidden3_bn(self.hidden3(x)))
        x = self.output(x)
        return x

    def fit(self, X_train, y_train_label, X_valid, y_valid, weight):
        """
        Train with Adam on stratified mini-batches, scoring the validation set
        (area under the precision-recall curve) after each pass.

        X_train / y_train_label: training features (DataFrame) and labels (Series).
        X_valid / y_valid: validation features and labels used for early stopping.
        weight: per-class weight tensor handed to nn.CrossEntropyLoss.

        Returns (prev_score, best_epoch_count): the validation score of the last
        evaluated pass (0 if none was evaluated) and epoch_count - patience.
        """

        # set random seed for weights and biases
        torch.manual_seed(0)

        # shuffle features and labels together so rows stay aligned
        dataset = pd.concat([X_train,y_train_label],axis=1)
        dataset = shuffle(dataset, random_state = 0)

        X_train = dataset.iloc[:,:dataset.shape[1]-1]
        y_train_label = dataset.iloc[:,dataset.shape[1]-1]

        # create loss function
        loss = nn.CrossEntropyLoss(weight = weight)
        # mini-batching
        batch_size = self.batch_size

        # create adam optimizer for Phase 1
        optimizer_1 = optim.Adam(self.parameters(), lr=self.learning_rate,betas=(self.beta,0.999),
                                 weight_decay = self.weight_decay)
        # Floor division: the split count must be an integer. Plain `/` only
        # produced one under Python 2 integer semantics.
        no_batch_minus_1 = int(X_train.shape[0] // batch_size)

        # Repeated Stratified K Fold to ensure positives are evenly distributed across batches;
        # the "test" part of every split is used as one mini-batch.
        skf_1 = RepeatedStratifiedKFold(n_splits=no_batch_minus_1,n_repeats=5,random_state=0)
        count = 0
        epoch_count = 0
        tol = 0.01
        prev_score = 0
        patience = 0

        for train,test in skf_1.split(X_train,y_train_label):
            data = X_train.iloc[test,:]
            data = torch.Tensor(data.values.astype(np.float32))
            # forward pass
            output = self.forward(data)

            # `test` holds row positions, so select with .iloc; plain [] would
            # do a label lookup if the index happened to be integer-valued.
            labels = y_train_label.iloc[test]
            labels = torch.Tensor(labels.values.astype(np.float32)).long()

            # zero the gradient buffers
            optimizer_1.zero_grad()
            # compute loss and gradients
            loss_output = loss(output,labels)
            loss_output.backward()
            # Does the update
            optimizer_1.step()

            count = count + 1

            # Early Stopping
            # NOTE(review): one repeat of the splitter yields no_batch_minus_1 batches,
            # while this check fires every no_batch_minus_1 + 1 batches -- confirm intent.
            if count == no_batch_minus_1 + 1:
                count = 0
                epoch_count = epoch_count + 1
                probs = self.predict_proba(X_valid)
                precision, recall, _ = precision_recall_curve(y_valid, probs)
                score = auc(recall, precision)
                #score = roc_auc_score(y_valid, probs)
                diff = score - prev_score
                prev_score = score

                if diff < tol:
                    patience = patience + 1
                    if patience >= 4 :
                        break
                else:
                    patience = 0

                # predict_proba() switched the module to eval mode. Switch back
                # before the next batch in BOTH branches: previously this only
                # happened after an improvement, so a non-improving pass made all
                # later batches train with frozen batch-norm statistics.
                self.train()
        best_epoch_count = epoch_count - patience
        return prev_score,best_epoch_count


    #prediction probabilities array
    def predict_proba(self, X_test):
        """
        Return the probability of output index 1 for every row of the DataFrame X_test.

        Side effect: leaves the module in eval mode.
        """
        self.eval()
        #forward pass
        test = torch.Tensor(X_test.values.astype(np.float32))
        output = self.forward(test)
        # dim=1: normalise across the two outputs of each row. This is the dimension
        # the implicit (deprecated) choice resolved to for 2-D input.
        probs = F.softmax(output.data, dim=1)
        return probs[:,1]



### Models tested (and their hyper-parameters)

In [24]:
def generate_model(classifier_method, hyperparameters):
    """
    Build an unfitted model for the requested classifier.

    classifier_method: one of "XGB", "RF", "Logistic", "KNN", "ADA", "SVM", "NN".
    hyperparameters: dict holding the settings the chosen classifier reads
        (e.g. "C" and "gamma" for SVM, "n_estimators" for RF/ADA/XGB).

    Returns the model instance; for "NN" a Net already switched to training mode.
    Raises ValueError for an unknown classifier_method (previously this surfaced
    as an UnboundLocalError on the return statement).
    """
    if (classifier_method == "XGB"):
        # Weight positives by the negative/positive ratio of the module-level
        # `ligand` to compensate for class imbalance.
        ligand_pos = ligands_positives_df[ligand].shape[0]
        ligand_neg = ligands_negatives_df[ligand].shape[0]
        scale_weight = ligand_neg/float(ligand_pos)
        model = XGBClassifier(n_estimators=hyperparameters["n_estimators"], n_jobs=-1, random_state=0, max_depth=hyperparameters["max_depth"], min_child_weight=hyperparameters["min_child_weight"], colsample_bytree=hyperparameters["colsample_bytree"],
                              scale_pos_weight=scale_weight)
    elif (classifier_method == "RF"):
        model = RandomForestClassifier(n_estimators=hyperparameters["n_estimators"], n_jobs=-1, random_state=0)
    elif (classifier_method == "Logistic"):
        model = LogisticRegression(C=hyperparameters["C"], random_state=0, n_jobs=-1)
    elif (classifier_method == "KNN"):
        model = KNeighborsClassifier(n_neighbors=hyperparameters["n_neighbors"], n_jobs=-1)
    elif (classifier_method == "ADA"):
        model = AdaBoostClassifier(n_estimators=hyperparameters["n_estimators"], random_state=0)
    elif (classifier_method == "SVM"):
        # probability=True so that predict_proba is available
        model = SVC(C=hyperparameters["C"], gamma = hyperparameters["gamma"], kernel="rbf", probability=True, random_state=0)
    elif (classifier_method == "NN"):
        # fixed seed so the initial weights are reproducible
        torch.manual_seed(0)
        model = Net(hyperparameters)
        # sets model in training mode because batch normalization behavior in training and testing modes are different
        model = model.train()
    else:
        raise ValueError("Unknown classifier_method: "+str(classifier_method))
    return model

In [25]:
def compute_per_domain_auc(y_test, pred_probs, domain_pred_dict, pred_idx, classifier):
    """
    Compute the average per-domain AUC and AUPRC for the test set.

    y_test: DataFrame with a "label" column (1 = binding, 0 = non-binding); the
        domain of a row is the part of its index string before the last "_".
    pred_probs: predicted probabilities, aligned with the rows of y_test.
    domain_pred_dict: dict of lists, extended in place under the keys "obs", "prob",
        "fold", "model" and "domain" with the rows of every scored domain.
    pred_idx: fold identifier recorded under "fold".
    classifier: model name recorded under "model".

    Domains lacking either class are skipped, since neither metric is defined for them.

    Returns (auc_mean, auprc_mean, auprc_ratio_mean, auc_list, auprc_list,
    auprc_ratio_list, domain_name_list), where the ratio is the domain AUPRC
    divided by its fraction of positives. If no domain could be scored, the
    three means are NaN and the lists are empty.
    """

    y_test_copy = y_test.copy(deep=True)
    y_test_copy["pred_probs"] = pred_probs
    y_test_copy["domain_name"] = [x[:x.rfind("_")] for x in y_test.index]

    domain_auc_list = []
    domain_auprc_list = []
    domain_auprc_ratio_list = []
    domain_name_list = []

    for domain_name in y_test_copy["domain_name"].unique().tolist():

        #Get only the domain positions
        domain_df = y_test_copy[y_test_copy["domain_name"] == domain_name]
        domain_labels = domain_df["label"]
        domain_probs = domain_df["pred_probs"]

        # Count the binding and non-binding positions of this domain. Only the
        # counts are needed; the former int() parsing of every index suffix was
        # unused and could raise for no reason.
        bind_num = int((domain_labels == 1).sum())
        non_bind_num = int((domain_labels == 0).sum())
        if (bind_num == 0 or non_bind_num == 0):
            #No positions of one of the classes "binding/non-binding" - skipping"
            continue

        positions_num = domain_df.shape[0]
        domain_pred_dict["obs"].extend(domain_labels)
        domain_pred_dict["prob"].extend(domain_probs)
        domain_pred_dict["fold"].extend([pred_idx] * positions_num)
        domain_pred_dict["model"].extend([classifier] * positions_num)
        domain_pred_dict["domain"].extend([domain_name] * positions_num)

        #Compute domain AUC
        domain_auc_list.append(roc_auc_score(domain_labels, domain_probs))
        #Compute domain AUPRC
        precision, recall, _ = precision_recall_curve(domain_labels, domain_probs)
        domain_auprc = auc(recall, precision)
        domain_auprc_list.append(domain_auprc)
        #Fraction of positives in the domain
        pos_frac_ratio = bind_num/float(positions_num)
        #Add ratio of AUPRC and positives fraction to list
        domain_auprc_ratio_list.append(domain_auprc/float(pos_frac_ratio))
        #Add domain name for AUC/AUPRC/Ratio tables
        domain_name_list.append(domain_name)

    #Compute the means for the lists
    if domain_name_list:
        domain_auc_mean = np.mean(domain_auc_list)
        domain_auprc_mean = np.mean(domain_auprc_list)
        domain_auprc_ratio_mean = np.mean(domain_auprc_ratio_list)
    else:
        # np.mean([]) also yields NaN but emits a RuntimeWarning; be explicit instead.
        domain_auc_mean = domain_auprc_mean = domain_auprc_ratio_mean = float("nan")

    return (domain_auc_mean, domain_auprc_mean, domain_auprc_ratio_mean, domain_auc_list, domain_auprc_list, domain_auprc_ratio_list, domain_name_list)

### Dealing with model imbalance
Weight Vector: https://towardsdatascience.com/dealing-with-imbalanced-classes-in-machine-learning-d43d6fa19d2 (look at section on "Cost-sensitive Learning")

In [29]:
def test_model_iterative_fixed(hyperparameters_dict,ligand_bind_features, ligand_negatives_features, ligand_name, features=[]):

    """
    Tune the hyperparameters of `classifier_method` for one outer CV fold.

    The outer fold is taken from the module-level `fold`. Every hyperparameter trial
    is scored by inner cross-validation: each of the other folds_num - 1 folds serves
    once as validation set, with the rest of the outer training data used for fitting,
    and the trial score is the mean validation AUPRC.

    hyperparameters_dict: dict of lists, extended in place with every key of every
        trial (including "mean_AUPRC" and, for NN, "epoch_count").
    ligand_bind_features / ligand_negatives_features: feature tables of the positive
        and negative examples.
    ligand_name: not used by the body; file names and messages use the module-level `ligand`.
    features: optional selector of feature columns to keep; empty means keep all.

    Side effects: adds "mean_AUPRC" (and "epoch_count") to the dicts in the module-level
    hyperparameter_trials, and pickles the best trial under best_hyperparameters/.

    Returns a dict mapping the classifier name to an empty DataFrame (no per-position
    predictions are collected by this function).
    """

    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    #Arranging the features table by the CV order, for each model
    features_pred_dfs = {}

    models_req_scaling = ["SVM", "KNN", "Logistic","NN"]

    classifier = classifier_method
    features_pred_dfs[classifier] = pd.DataFrame()

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    y_df = pd.DataFrame(y)
    y_df.index = X.index
    y_df.columns = ["label"]

    #Get the fold indices
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    k = (int(fold)-1)

    pred_idx = k+1
    print("fold #: "+str(pred_idx))
    full_train_index = cv_idx[k]["train"]

    # Every fold except the outer test fold k serves once as validation set.
    # Derived from folds_num rather than hard-coding 4 and 5.
    inner_folds_num = folds_num - 1

    # phase 1: testing on validation set, hyperparameter tuning
    trials_mean_results_list = np.zeros(no_trials)

    for trial in range(no_trials):
        trial_results = np.zeros(inner_folds_num)
        epoch_counts = np.zeros(inner_folds_num, dtype = "int")
        for i in range(inner_folds_num):
            valid_k = (k + 1 + i) % folds_num
            valid_index = cv_idx[valid_k]["test"]

            # Set membership keeps this filter linear; testing each training
            # index against a list was quadratic in the number of positions.
            valid_index_set = set(valid_index)
            train_index = [index for index in full_train_index if index not in valid_index_set]

            X_train, X_valid = X.loc[train_index,:], X.loc[valid_index,:]
            y_train, y_valid = y_df.loc[train_index,:], y_df.loc[valid_index,:]

            if (classifier in models_req_scaling):
                cols = X_train.columns

                # phase 1 scaling with just training data
                scaler_1 = StandardScaler()
                scaler_1.fit(X_train)
                X_train = pd.DataFrame(scaler_1.transform(X_train))
                # apply same transformation to validation data
                X_valid = pd.DataFrame(scaler_1.transform(X_valid))

                #Restoring indices after scaling
                X_train.index = train_index
                X_valid.index = valid_index

                #Restoring features names
                X_train.columns = cols
                X_valid.columns = cols

            #No down-sampling
            X_train_sampled = X_train
            y_train_sampled = y_train

            #fit to training data
            model = generate_model(classifier, hyperparameter_trials[trial])
            if classifier == "NN":
                # Class weights for the loss: each class is weighted by the share of
                # the OTHER class in the full ligand dataset, so the rarer class counts more.
                no_pos = ligand_bind_features.shape[0]
                no_neg = ligand_negatives_features.shape[0]
                neg_weight = float(no_pos) / float(no_neg + no_pos)
                pos_weight = 1 - neg_weight
                weight = torch.Tensor([neg_weight, pos_weight])
                auprc_score,epoch_count = model.fit(X_train_sampled, y_train_sampled["label"],X_valid, y_valid["label"],weight)
                epoch_counts[i] = epoch_count
            else:
                model.fit(X_train_sampled, y_train_sampled["label"])
                # column 1 of predict_proba is the probability of the positive class
                probs_list = [class_probs[1] for class_probs in model.predict_proba(X_valid)]
                precision, recall, _ = precision_recall_curve(y_valid["label"], probs_list)
                auprc_score = auc(recall, precision)

            trial_results[i] = auprc_score

        mean_result = np.mean(trial_results)
        trials_mean_results_list[trial] = mean_result

        if classifier == "NN":
            # most frequent epoch count over the inner folds
            majority_epoch_count = np.bincount(epoch_counts).argmax()
            hyperparameter_trials[trial]["epoch_count"] = majority_epoch_count

        hyperparameter_trials[trial]["mean_AUPRC"] = mean_result

    # extract top performing hyperparameters
    max_index = np.argmax(trials_mean_results_list)
    best_hyperparameters = hyperparameter_trials[max_index]
    print("best_hyperparameters:" + str(best_hyperparameters))

    # the with-block closes the file; no explicit close() needed
    with open(curr_dir[0]+"/best_hyperparameters/"+ligand+"_"+classifier_method+"_best_hyperparameters.pik", 'wb') as handle:
        pickle.dump(best_hyperparameters, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Update dictionary with all hyperparameters
    keys = hyperparameter_trials[0].keys()
    for key in keys:
        for trial in range(no_trials):
            hyperparameters_dict[key].append(hyperparameter_trials[trial][key])

    print("Finished "+ligand+" "+classifier+" fold: "+fold)

    return features_pred_dfs

### Test model functions

In [27]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Join the features table of the current classifier with its predictions and save it.

    ligand: ligand name, used in the output file name.
    ordered_features: dict mapping classifier name to its features DataFrame.
    pred_df: DataFrame of predictions with a "model" column naming the classifier.

    Only the module-level classifier_method is processed. The earlier loop over a
    `classifiers` dict (not defined in this notebook) overrode its loop variable
    with classifier_method and stopped after one pass, so it is removed.

    Writes a tab-separated file under pred_AUC_AUPRC/.../features_pred_tables/.
    """
    classifier = classifier_method

    pred_res = pred_df.copy(deep=True)
    model_pred = pred_res[pred_res["model"] == classifier]
    # Align the predictions with the features table row by row.
    # assumes both have the same number of rows in the same order -- TODO confirm
    model_pred.index = ordered_features[classifier].index

    #Creating the combined table
    features_pred = pd.concat([ordered_features[classifier], model_pred], axis=1)

    #Saving
    features_pred.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/"+datafile_date+"_domain_CV/features_pred_tables/"+ligand+"_"+classifier+"_features_pred.csv", sep='\t')

#### Predict for each ligand separately

In [28]:
# Label of the down-sampling variant; it is one component of the results path.
downsample_method = "NoDown"

# Collects hyperparameter name -> list of values (one per trial); filled in place
# by test_model_iterative_fixed.
hyperparameters_dict = defaultdict(list)
ordered_features = test_model_iterative_fixed(
    hyperparameters_dict,
    ligands_positives_df[ligand],
    ligands_negatives_df[ligand],
    ligand,
)

# One row per trial, best mean validation AUPRC first.
hyperparameters_df = pd.DataFrame.from_dict(hyperparameters_dict).sort_values(by="mean_AUPRC", ascending=False)

# Save the per-fold hyperparameter table.
hyperparameters_csv_path = (
    curr_dir[0]+"/pred_AUC_AUPRC/"+out_dir+"/"+downsample_method+"/"+datafile_date+"_domain_CV/per_fold/"
    +ligand+"_"+classifier_method+"_fold"+fold+"_"+str(folds_num)+"w_hyperparameters.csv"
)
hyperparameters_df.to_csv(hyperparameters_csv_path, sep=',')

# The combine_features_predictions step stays disabled, as before: it needs a
# pred_df of predictions, which this notebook does not build.

print("Finished ligand "+ligand)

fold #: 3
best_hyperparameters:{'C': 1252.0653814999462, 'mean_AUPRC': 0.173900007237034, 'gamma': 2.693883019285411e-05}
Finished sm SVM fold: 3
Finished ligand sm
