### Create out-of-fold (OOF) predictions for each test fold, using the final model hyperparameters

In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#All relative paths below are resolved against the current working directory,
#so the notebook has to be run from the directory it lives in
curr_dir = getcwd()

#Import 10.Prediction utils functions
#(the directory must be appended to sys.path before the imports that follow)
sys.path.append(curr_dir+"/../10.Prediction/utils")
from prop_threshold_funcs import create_positives_datasets_combined, create_negatives_datasets_combined
from prediction_general_funcs import get_features_cols, remove_unimportant_features
from CV_funcs import calc_CV_idx_iterative
from tuning_helper_functions import generate_model

#Import 10.Prediction/stacking utils functions
#(same pattern: extend sys.path first, then import)
sys.path.append(curr_dir+"/../10.Prediction/stacking/utils")
from stacking_funcs import create_stacked_dataset

In [2]:
#Run parameters are read from environment variables so the notebook can be
#executed in batch mode; each one falls back to a default when the variable is
#not set. environ.get is used instead of try / bare except so that only a
#missing variable triggers the default and no unrelated error is swallowed.
#All values are kept as strings (fold numbers are converted with int() where needed).

#Reading the ligand input
ligand = environ.get('ligand', "ion")
print("ligand = "+ligand)

#Reading the ensemble type
ens = environ.get('ens', "MODEL")
print("ens = "+ens)

#Reading the heldout fold
heldout_fold = environ.get('heldout_fold', "5")
print("heldout fold = "+heldout_fold)

#Reading the test fold
test_fold = environ.get('test_fold', "4")
print("test fold = "+test_fold)

#Reading the classifier input
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

ligand = ion
ens = MODEL
heldout fold = 5
test fold = 4
classifier_method = XGB


In [3]:
#Ensemble configuration: which first-level ligands / models feed the stacked
#dataset and whether the original features are kept next to them.
ligands = ["rna", "dna", "ion", "peptide", "sm"]
all_models = ["XGB", "SVM", "RF", "Logistic", "NN"]

#Start from the "all features" setting; each ensemble type below overrides
#only the entry in which it differs and imports its own models-dict generator.
hyperparameters = {"ligands": ligands, "models": all_models, "keep": True}

if ens == "LIGAND":
    #Only the first-level predictions of the current ligand
    from generate_stacking2nd_ligand_features_dict_global_auprc import generate_models_dict
    hyperparameters["ligands"] = [ligand]
elif ens == "PROB":
    #First-level probabilities only, without the original features
    from generate_stacking2nd_just_probs_dict_global_auprc import generate_models_dict
    hyperparameters["keep"] = False
elif ens == "MODEL":
    #Only the XGB first-level models
    from generate_stacking2nd_model_features_dict_global_auprc import generate_models_dict
    hyperparameters["models"] = ["XGB"]
else:
    #Any other value: all ligands, all models, original features kept
    from generate_stacking2nd_all_features_dict_global_auprc import generate_models_dict

In [4]:
#Input data configuration
pfam_version = "31"
datafile_date = "08.06.18"
input_path = curr_dir+"/../10.Prediction/domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"

#Flags handed to the positives / negatives dataset builders, and the number of CV folds
zero_prop, no_prop, all_ligands = True, True, False
folds_num = 5


#Features table (tab-separated, first column is the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
print("all samples positions #: "+str(features_all.shape[0]))

#CV splits dictionary (pickled, stored per Pfam version and number of folds)
splits_path = (curr_dir+"/../10.Prediction/CV_splits/pfam-v"+pfam_version
               +"/domain_"+str(folds_num)+"_folds_combined_dna0.5_rna0.5_ion0.75_prec_dict.pik")
with open(splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 44872


In [5]:
#Drop the unimportant features; update_features_cols=True asks the helper to
#update features_cols as well, so the two counts printed here can differ
print("# of features before removal: "+str(len(features_cols)))
remove_unimportant_features(features_all, features_cols, update_features_cols=True)
print("# of features after removal: "+str(len(features_cols)))

# of features before removal: 761
# of features after removal: 753


In [6]:
#Build the non-binding (negative) datasets; the result is later indexed by
#ligand name (ligands_negatives_df[ligand]). The helper prints the counts shown below.
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697


In [7]:
#Build the binding (positive) datasets; the result is later indexed by
#ligand name (ligands_positives_df[ligand]). The helper prints the counts shown below.
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 206
rnabase #: 118
rnabackbone #: 136
rna combined #: 247
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




In [8]:
#Number of feature columns currently listed in features_cols
no_features = len(features_cols)
#Models dictionary from the generator selected by the ensemble type above;
#it is read further down as models_dict[classifier][ligand][heldout fold number]
models_dict = generate_models_dict(ligand, classifier_method, ligands, ligands_positives_df, ligands_negatives_df, folds_num, no_features)

In [9]:
#First-level predictions of the chosen heldout fold are read from this directory
training_stacking_path = curr_dir+"/../10.Prediction/stacking/1st_level_pred/08.06.18_comb_dna0.5_rna0.5_ion0.75/global_auprc/all_combined/"+heldout_fold+"/"

#Stacked positives / negatives tables for the current ligand, built according
#to the ensemble configuration in hyperparameters
(training_positivies, training_negatives) = create_stacked_dataset(
    training_stacking_path,
    hyperparameters["ligands"],
    hyperparameters["models"],
    ligands_positives_df[ligand],
    ligands_negatives_df[ligand],
    all_models,
    keep_original_features=hyperparameters["keep"],
)

#(features) = 758


In [10]:
def test_model_iterative_fixed(pred_dict, ligand_bind_features, ligand_negatives_features, ligand_name, features=None):
    """
    Fit the pre-configured model on the inner training folds and append its
    predictions for the test fold to pred_dict.

    The samples of the heldout fold (module-level `heldout_fold`) are excluded,
    the samples of the test fold (module-level `test_fold`) are predicted, and
    everything else is used for fitting. The model is taken from the
    module-level `models_dict`, indexed by `classifier_method`, `ligand_name`
    and the heldout fold number. The fold split comes from
    `calc_CV_idx_iterative(X, splits_dict)`.

    Args:
        pred_dict: dict of lists (e.g. defaultdict(list)). The lists under
            "obs", "prob", "fold", "model" and "idx" are extended in place.
        ligand_bind_features: DataFrame of the positive samples (label 1).
        ligand_negatives_features: DataFrame of the negative samples (label 0).
        ligand_name: ligand key used for the model lookup and the log message.
        features: optional column selector handed to .iloc[:, features];
            None or an empty sequence selects all columns.

    Returns:
        None. Results are written into pred_dict.

    Raises:
        ValueError: if the NN model carries a `weight` setting other than
            "balanced", 0.1 or None.
    """

    #Default: Exclude no features
    if features is None or len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    models_req_scaling = ["SVM", "KNN", "Logistic", "NN"]
    classifier = classifier_method

    #Create X and y with included features (positives first, then negatives)
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * ligand_negatives_features.shape[0])
    y_df = pd.DataFrame(y)
    y_df.index = X.index
    y_df.columns = ["label"]

    #Get the heldout fold indices and separate them from the rest of the folds
    #(fold numbers are 1-based strings, cv_idx is 0-based)
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    heldout_k = int(heldout_fold) - 1
    print("heldout fold #: "+str(heldout_k + 1))
    all_train_index = cv_idx[heldout_k]["train"]
    X_all_train = X.loc[all_train_index,:]
    print("all training size = "+str(X_all_train.shape))

    #Get the test fold indices and define the other training folds as the training
    test_k = int(test_fold) - 1
    test_idx = test_k + 1
    print("test fold #: "+str(test_idx))
    test_index = cv_idx[test_k]["test"]
    #remove from "all_training" idx the test indices
    train_index = pd.Index.difference(X_all_train.index, test_index)
    X_train, X_test = X.loc[train_index,:], X.loc[test_index,:]
    y_train, y_test = y_df.loc[train_index,:], y_df.loc[test_index,:]
    print("train size = "+str(X_train.shape))
    print("test size = "+str(X_test.shape))

    if classifier in models_req_scaling:
        #NOTE(review): StandardScaler is not imported in the visible part of this
        #file; this branch needs it (e.g. from sklearn.preprocessing) to run.
        cols = X_train.columns
        scaler = StandardScaler()
        #scale only using the training data
        scaler.fit(X_train)
        #apply the same transformation to train and test, restoring the
        #indices and the features names that the scaler drops
        X_train = pd.DataFrame(scaler.transform(X_train), index=train_index, columns=cols)
        X_test = pd.DataFrame(scaler.transform(X_test), index=test_index, columns=cols)

    #The previous version built row-shuffled copies of the train and test frames
    #here but never used them: fitting and prediction run on the unshuffled
    #frames (no down-sampling either). The copies are dropped. The seeding and
    #the single permutation draw are kept so that NumPy's global random state
    #at fit time is the same as before.
    np.random.seed(0)
    np.random.permutation(X_test.index)

    #Generate model
    model = models_dict[classifier][ligand_name][int(heldout_fold)]

    if classifier == "NN":
        #NOTE(review): torch and curr_device are not imported / defined in the
        #visible part of this file; this branch needs both to run.
        model = model.to(device=curr_device)

        #pos and neg numbers in the training
        no_pos = np.count_nonzero(y_train["label"] == 1)
        no_neg = np.count_nonzero(y_train["label"] == 0)

        #weight vector for NN
        if model.weight == "balanced":
            neg_weight = float(no_pos) / float(no_neg + no_pos)
            pos_weight = 1 - neg_weight
        elif model.weight == 0.1:
            neg_weight = 10
            pos_weight = 1
        elif model.weight is None:
            neg_weight = 1
            pos_weight = 1
        else:
            #Previously this fell through and failed later on an unbound name
            raise ValueError("Unsupported NN weight setting: "+repr(model.weight))

        weight = torch.Tensor([neg_weight, pos_weight]).to(device=curr_device)
        model.fit(X_train, y_train["label"], weight)
        probs_list = model.predict_proba(X_test)

    else:
        model.fit(X_train, y_train["label"])
        #keep the probability of the positive class only (second column)
        probs_list = [row[1] for row in model.predict_proba(X_test)]

    #Append this fold's results; all five lists grow by the same number of rows
    pred_dict["obs"].extend(y_test["label"])
    pred_dict["prob"].extend(probs_list)
    pred_dict["fold"].extend([test_idx] * len(probs_list))
    pred_dict["model"].extend([classifier] * len(probs_list))

    #Adding the position number to the table to help with analysis
    pred_dict["idx"].extend(test_index)

    print("Finished "+ligand_name+" "+classifier+" heldout fold: "+heldout_fold+" test fold: "+test_fold)

In [11]:
#Collect the predictions of this (heldout fold, test fold) run
pred_dict = defaultdict(list)
test_model_iterative_fixed(pred_dict, training_positivies, training_negatives, ligand)

#Save to file (tab-separated, one file per ligand / classifier / test fold)
output_file = curr_dir+"/stacked_oof_predictions/"+heldout_fold+"/"+ligand+"_"+classifier_method+"_test_fold"+test_fold+"_"+str(folds_num)+"w.csv"
pred_df = pd.DataFrame.from_dict(pred_dict)
pred_df.to_csv(output_file, sep='\t')

heldout fold #: 5
all training size = (31655, 758)
test fold #: 4
train size = (23758, 758)
test size = (7897, 758)
Finished ion XGB heldout fold: 5 test fold: 4
