In [1]:
import pandas as pd
import numpy as np
import cPickle as pickle
from collections import defaultdict

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

ABSOLUTE_NEGATIVES = True
FILTER_DOMAIN = False

In [2]:
curr_dir = !pwd
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
#filename = "positions_features_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#lignd binding domains dictionary
with open(curr_dir[0]+"/ligands_negatives_domains_dict.pik", 'rb') as handle:
    negatives_dict = pickle.load(handle)

all samples positions #: 38944


### Create dataset of negatives

In [3]:
def filter_to_ligand_binding_domains(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        ligands_negatives_df[ligand] = pd.DataFrame()
        for domain in negatives_dict[ligand].keys():
            if domain == 'negatives' or domain == 'domains':
                continue
            domain_all = features_all.loc[features_all.loc[:,"domain_name"] == domain,:]
            
            #In case this domain was previously filtered
            if len(domain_all) == 0:
                continue
            
            if (use_max_binding_score):
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,"max_binding_score"] == 0,:]])
            else:
                ligand_bind_str = ligand+"_binding_score"
                ligands_negatives_df[ligand] = pd.concat([ligands_negatives_df[ligand],domain_all.loc[domain_all.loc[:,ligand_bind_str] == 0,:]])
        
    #Handeling the ligand "all_ligands"
    all_ligands_negatives_df = pd.concat([ligands_negatives_df["dna"], ligands_negatives_df["dnabase"], ligands_negatives_df["dnabackbone"], ligands_negatives_df["rna"], ligands_negatives_df["rnabase"], 
                                 ligands_negatives_df["rnabackbone"], ligands_negatives_df["ion"], ligands_negatives_df["peptide"], ligands_negatives_df["metabolite"]])
    all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
    #Filter to just positions with max. binding score = 0
    all_ligands_negatives_df = all_ligands_negatives_df[all_ligands_negatives_df["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = all_ligands_negatives_df
    
    #Leaving just the features columns
    for ligand in ligands_negatives_df.keys():   
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
    
    return ligands_negatives_df

In [4]:
def negatives_by_binding_score(use_max_binding_score):
    
    ligands_negatives_df = {}
    for ligand in ligands:
        
        if use_max_binding_score:
            ligand_bind_str = "max_binding_score"
        else:
            ligand_bind_str = ligand+"_binding_score"
        
        ligands_negatives_df[ligand] = features_all[features_all[ligand_bind_str] == 0]
        ligands_negatives_df[ligand] = ligands_negatives_df[ligand]
        print(ligand+" non-binding #:"+str(len(ligands_negatives_df[ligand])))
        
    #Handeling the ligand "all_ligands"
    ligands_negatives_df["all_ligands"] = features_all[features_all["max_binding_score"] == 0]
    ligands_negatives_df["all_ligands"] = ligands_negatives_df["all_ligands"]
    print("all_ligands non-binding #:"+str(len(ligands_negatives_df["all_ligands"])))
    
    return ligands_negatives_df

In [5]:
#Create negatives datasets
if FILTER_DOMAIN:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = filter_to_ligand_binding_domains(True)
    else:
        ligands_negatives_df = filter_to_ligand_binding_domains(False)
else:
    if ABSOLUTE_NEGATIVES:
        ligands_negatives_df = negatives_by_binding_score(True)
    else:
        ligands_negatives_df = negatives_by_binding_score(False)

dna non-binding #:27191
dnabase non-binding #:27191
dnabackbone non-binding #:27191
rna non-binding #:27191
rnabase non-binding #:27191
rnabackbone non-binding #:27191
peptide non-binding #:27191
ion non-binding #:27191
metabolite non-binding #:27191
all_ligands non-binding #:27191


### Create dataset of positives

In [6]:
bind_th = 0.1
ligands_features_df = {}

for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    ligand_binding_df = features_all[features_all[score_col_str] >= bind_th]
    print ligand+" #: "+str(ligand_binding_df.shape[0])
    ligands_features_df[ligand] = ligand_binding_df

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


In [7]:
all_ligands_features_df = pd.concat([ligands_features_df["dna"], ligands_features_df["dnabase"], ligands_features_df["dnabackbone"], ligands_features_df["rna"], ligands_features_df["rnabase"], 
                                     ligands_features_df["rnabackbone"], ligands_features_df["ion"], ligands_features_df["peptide"], ligands_features_df["metabolite"]])
all_ligands_features_df = all_ligands_features_df.drop_duplicates()
print "all_ligands #: "+str(all_ligands_features_df.shape[0])
ligands_features_df["all_ligands"] = all_ligands_features_df

all_ligands #: 4518


### Models

In [8]:
classifiers = {}
classifiers["RF"] = RandomForestRegressor(n_estimators=1000)

In [9]:
downsamplers = {}
downsamplers["TomekLinks"] = TomekLinks(random_state=0)
downsamplers["RandomUnderSampler"] = RandomUnderSampler(random_state=0)

### Test model

In [10]:
def test_model(pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_negatives_features, ligand_name, downsample_method, features=[]):
    """
    Test different models in 10-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Arranging the features table by the CV order, for each model
    features_pred_dfs = dict.fromkeys(classifiers.keys())
    
    models_req_scaling = ["SVM", "KNN"]
    
    for classifier in classifiers.keys():
        model = classifiers[classifier]
        features_pred_dfs[classifier] = pd.DataFrame()
        
        #Create X and y with included features
        X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
        
        if (classifier in models_req_scaling):
            idx = X.index
            cols = X.columns
            X = pd.DataFrame(scale(X)) #Is z-scoring the data needed?
            X.index = idx #Restoring indices after scaling
            X.columns = cols

        y = X.loc[:,ligand_name+"_binding_score"]
        y = np.array(y)
        
        X=X.loc[:,features_cols]

        binding_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
        pred_idx = 1
        
        #Metrics
        exp_var = []
        mse = []
        r2 = []
        
        for train_index, test_index in binding_skf.split(X, y):
            print "fold #: "+str(pred_idx)
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y[train_index], y[test_index]

            #Down-sample negative examples to have balanced classes
            if (downsample_method == "NoDown"):
                X_train_sampled = X_train
                y_train_sampled = y_train
            else:
                if (downsample_method == "InstanceHardnessThreshold"):
                    downsampler = downsamplers[downsample_method][classifier]
                else:
                    downsampler = downsamplers[downsample_method]

                X_train_sampled, y_train_sampled = downsampler.fit_sample(X_train, y_train)
            
            #fit to training data
            model = classifiers[classifier]
            model.fit(X_train_sampled, y_train_sampled)
            probs_list = []

            #probs = model.predict(X_test)
            #probs_list = probs
            
            pred_list = model.predict(X_test)
            #for l in probs:
            #    probs_list.append(l[1])
                
            pred_dict["obs"].extend(y_test)
            pred_dict["prob"].extend(pred_list)
            fold_list = [pred_idx] * len(pred_list)
            pred_dict["fold"].extend(fold_list)

            model_list = [classifier] * len(pred_list)
            pred_dict["model"].extend(model_list)
            
            #Update metrics
            
            print(y_test)
            print(pred_list)
            
            e = explained_variance_score(y_test, pred_list)
            exp_var.append(e)
            
            m = mean_squared_error(y_test, pred_list)
            mse.append(m)
            
            r = r2_score(y_test, pred_list)
            r2.append(r)
            

            #Update auc auprc dictionaries
            #auc_dict[classifier].append(roc_auc_score(y_test, probs_list))
            #precision, recall, _ = precision_recall_curve(y_test, probs[:, 1])
            
            #auc_dict[classifier].append(roc_auc_score(y_test, probs))
            #precision, recall, _ = precision_recall_curve(y_test, probs)
            
            #auprc_dict[classifier].append(auc(recall, precision))
            
            #Update features table
            features_pred_dfs[classifier] = features_pred_dfs[classifier].append(X_test)
            pred_idx += 1
            
            print "Expected Variance = "+str(e)
            print "Mean Squared Error = "+str(m)
            print "R2 = "+str(r)

        #avg_auc = np.sum(auc_dict[classifier])/10.0
        print "avg expected variance = "+str(np.mean(exp_var))
        
        #avg_auprc = np.sum(auprc_dict[classifier])/10.0
        print "avg mean squared error = "+str(np.mean(mse))
        
        print "avg r2 = "+str(np.mean(r2))
            
        print "Finished "+ligand+" "+classifier
        break
    
    return features_pred_dfs

In [11]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    
    pred_res = pred_df.copy(deep=True)
    for classifier in classifiers.keys():
        model_pred = pred_res[pred_res["model"] == classifier]
        model_pred.index = ordered_features[classifier].index
        
        #Creating the combined table
        features_pred = pd.concat([ordered_features[classifier], model_pred], axis=1)
        
        #Saving
        features_pred.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_features_pred.csv", sep='\t')
        break

### Predict

In [77]:
#%%time
    
for ligand in ligands:
    #Initialize dictionary
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)

    ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, "RandomUnderSampler")

    #pred_df = pd.DataFrame.from_dict(pred_dict)
    #auc_df = pd.DataFrame.from_dict(auc_dict)
    #auprc_df = pd.DataFrame.from_dict(auprc_dict)
    
    #Save to file
    #pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_0.1.csv", sep='\t')
    #auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_0.1_auc.csv", sep='\t')
    #auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_0.1_auprc.csv", sep='\t')

    #Combine features and pred results to a unified table
    #combine_features_predictions(ligand, ordered_features, pred_df)

    print "Finished ligand "+ligand

fold #: 1
[ 1.          0.84982639  0.80034722 ...,  0.          0.          0.        ]
[ 0.48816766  0.3353172   0.39282479 ...,  0.27467717  0.23818384
  0.24843754]
Expected Variance = -0.973831199304
Mean Squared Error = 0.137658710117
R2 = -44.2202883411
fold #: 2
[ 1.  1.  1. ...,  0.  0.  0.]
[ 0.46859666  0.54370867  0.40149737 ...,  0.44545071  0.39287521
  0.38615304]
Expected Variance = -3.63204197205
Mean Squared Error = 0.139464384907
R2 = -125.680303502
fold #: 3
[ 1.          0.19138614  0.13236535 ...,  0.          0.          0.        ]
[ 0.37141439  0.3408487   0.50870043 ...,  0.34036923  0.39589524
  0.31445901]
Expected Variance = -2.42039501918
Mean Squared Error = 0.136672881853
R2 = -90.2347846451
fold #: 4
[ 1.  1.  1. ...,  0.  0.  0.]
[ 0.42345274  0.54915769  0.47700158 ...,  0.30972252  0.32246348
  0.42939614]
Expected Variance = -3.46286029489
Mean Squared Error = 0.139100555252
R2 = -125.349824497
fold #: 5
[ 0.32985562  0.16492781  0.60069444 ...,  0.

[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.21656004  0.26179027  0.20833973 ...,  0.31485786  0.2402221
  0.28037838]
Expected Variance = 0.0
Mean Squared Error = 0.100325814686
R2 = 0.0
fold #: 5
[ 0.36274774  0.27742984  0.28571429 ...,  0.          0.          0.        ]
[ 0.33884767  0.3797692   0.51428408 ...,  0.38971757  0.21923689
  0.32747635]
Expected Variance = -1.10004637748
Mean Squared Error = 0.0984815969377
R2 = -33.2408241252
fold #: 6
[ 0.14285714  0.          0.         ...,  0.          0.          0.        ]
[ 0.3338835   0.33377254  0.29503227 ...,  0.36070928  0.3581846
  0.29710166]
Expected Variance = -479.161680252
Mean Squared Error = 0.0971553416844
R2 = -12952.6263023
fold #: 7
[ 0.14285714  0.          0.         ...,  0.          0.          0.        ]
[ 0.34436022  0.21998166  0.23293886 ...,  0.31685055  0.30613382
  0.23745283]
Expected Variance = -463.400068727
Mean Squared Error = 0.0981975957987
R2 = -13091.5890199
fold #: 8
[ 0.28571429  0.14285714  0.5

[ 0.13712202  1.          1.         ...,  0.          0.          0.        ]
[ 0.28896299  0.23260023  0.35521979 ...,  0.32830154  0.34852912
  0.32772767]
Expected Variance = -4.57360080331
Mean Squared Error = 0.105003939873
R2 = -127.277324401
fold #: 7
[ 0.13712202  1.          0.17892157 ...,  0.          0.          0.        ]
[ 0.26880448  0.37570798  0.39531671 ...,  0.36425774  0.28921223
  0.30634972]
Expected Variance = -4.69545729525
Mean Squared Error = 0.104194438968
R2 = -127.770907805
fold #: 8
[ 0.24604705  0.15886431  0.15710079 ...,  0.          0.          0.        ]
[ 0.20228956  0.2179204   0.21643923 ...,  0.31883761  0.28490929
  0.2167187 ]
Expected Variance = -1.95472179525
Mean Squared Error = 0.103639081632
R2 = -50.8681783576
fold #: 9
[ 0.24604705  0.26197422  0.14669055 ...,  0.          0.          0.        ]
[ 0.20933538  0.21860991  0.22838799 ...,  0.26736506  0.2371057
  0.26944078]
Expected Variance = -0.107219824212
Mean Squared Error = 0.077

In [24]:
def test_model_bin(pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_negatives_features, ligand_name, downsample_method, features=[]):
    """
    Test different models in 10-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Arranging the features table by the CV order, for each model
    features_pred_dfs = dict.fromkeys(classifiers.keys())
    
    models_req_scaling = ["SVM", "KNN"]
    
    for classifier in classifiers.keys():
        model = classifiers[classifier]
        features_pred_dfs[classifier] = pd.DataFrame()
        
        #Create X and y with included features
        X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
        
        if (classifier in models_req_scaling):
            idx = X.index
            cols = X.columns
            X = pd.DataFrame(scale(X)) #Is z-scoring the data needed?
            X.index = idx #Restoring indices after scaling
            X.columns = cols

        y = X.loc[:,ligand_name+"_binding_score"]
        y = np.array(y)
        
        X=X.loc[:,features_cols]

        binding_skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
        pred_idx = 1
        
        for train_index, test_index in binding_skf.split(X, y):
            print "fold #: "+str(pred_idx)
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y[train_index], y[test_index]

            #Down-sample negative examples to have balanced classes
            if (downsample_method == "NoDown"):
                X_train_sampled = X_train
                y_train_sampled = y_train
            else:
                if (downsample_method == "InstanceHardnessThreshold"):
                    downsampler = downsamplers[downsample_method][classifier]
                else:
                    downsampler = downsamplers[downsample_method]

                X_train_sampled, y_train_sampled = downsampler.fit_sample(X_train, y_train)
            
            #fit to training data
            model = classifiers[classifier]
            model.fit(X_train_sampled, y_train_sampled)
            probs_list = []

            #probs = model.predict(X_test)
            #probs_list = probs
            
            pred_list = model.predict(X_test)
            #for l in probs:
            #    probs_list.append(l[1])
                
            pred_dict["obs"].extend(y_test)
            pred_dict["prob"].extend(pred_list)
            fold_list = [pred_idx] * len(pred_list)
            pred_dict["fold"].extend(fold_list)

            model_list = [classifier] * len(pred_list)
            pred_dict["model"].extend(model_list)
            
            #Update metrics
            
            y_test = (y_test >= 0.1).astype(int)
            print(y_test,len(y_test),sum(y_test))
            print(pred_list,len(pred_list))
            
            #Update auc auprc dictionaries
            try:
                auc_dict[classifier].append(roc_auc_score(y_test, pred_list))
                precision, recall, _ = precision_recall_curve(y_test, pred_list)            
                auprc_dict[classifier].append(auc(recall, precision))
            except ValueError: # Some splits are not well formed and have 0 positives after binning
                auc_dict[classifier].append(1)          
                auprc_dict[classifier].append(1)
            
            #Update features table
            features_pred_dfs[classifier] = features_pred_dfs[classifier].append(X_test)
            pred_idx += 1
            
            print "AUC = "+str(auc_dict[classifier][-1])
            print "AUPRC = "+str(auprc_dict[classifier][-1])

        avg_auc = np.sum(auc_dict[classifier])/10.0
        print "avg auc = "+str(avg_auc)
        
        avg_auprc = np.sum(auprc_dict[classifier])/10.0
        print "avg auprc = "+str(avg_auprc)
            
        print "Finished "+ligand+" "+classifier
        break
    
    return features_pred_dfs

### Run as regression and bin to classify

In [25]:
#%%time
    
for ligand in ligands:
    #Initialize dictionary
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)

    ordered_features = test_model_bin(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligands_negatives_df[ligand], ligand, "RandomUnderSampler")

    #pred_df = pd.DataFrame.from_dict(pred_dict)
    #auc_df = pd.DataFrame.from_dict(auc_dict)
    #auprc_df = pd.DataFrame.from_dict(auprc_dict)
    
    #Save to file
    #pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_0.1.csv", sep='\t')
    #auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_0.1_auc.csv", sep='\t')
    #auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/mediode_NegAbs_NoFilter_NoDown_"+ligand+"_RF_0.1_auprc.csv", sep='\t')

    #Combine features and pred results to a unified table
    #combine_features_predictions(ligand, ordered_features, pred_df)

    print "Finished ligand "+ligand

fold #: 1
(array([1, 1, 1, ..., 0, 0, 0]), 2744, 24)
(array([ 0.49240998,  0.34757528,  0.40209025, ...,  0.27909091,
        0.24320631,  0.25120314]), 2744)
AUC = 0.672150735294
AUPRC = 0.0812994395196
fold #: 2
(array([1, 1, 1, ..., 0, 0, 0]), 2722, 3)
(array([ 0.46093181,  0.54054881,  0.40260068, ...,  0.43808406,
        0.38467492,  0.40046758]), 2722)
AUC = 0.881696702219
AUPRC = 0.00903323470093
fold #: 3
(array([1, 1, 1, ..., 0, 0, 0]), 2726, 7)
(array([ 0.3733314 ,  0.35463838,  0.50910947, ...,  0.33310594,
        0.39348583,  0.32216582]), 2726)
AUC = 0.747806441444
AUPRC = 0.0274853260981
fold #: 4
(array([1, 1, 1, ..., 0, 0, 0]), 2722, 3)
(array([ 0.41821198,  0.54928129,  0.50326444, ...,  0.31001826,
        0.3271602 ,  0.4261134 ]), 2722)
AUC = 0.92509501042
AUPRC = 0.0150642724848
fold #: 5
(array([1, 1, 1, ..., 0, 0, 0]), 2764, 45)
(array([ 0.46581501,  0.36538057,  0.28579638, ...,  0.39637789,
        0.32930051,  0.37113544]), 2764)
AUC = 0.591966000572
AUPRC =

AUC = 0.603824935638
AUPRC = 0.00258231446964
avg auc = 0.695417003085
avg auprc = 0.232693453007
Finished rna RF
Finished ligand rna
fold #: 1
(array([1, 1, 1, ..., 0, 0, 0]), 2737, 17)
(array([ 0.30447263,  0.26927352,  0.2509467 , ...,  0.2338962 ,
        0.26613533,  0.23775709]), 2737)
AUC = 0.580514705882
AUPRC = 0.00774637846861
fold #: 2
(array([0, 0, 0, ..., 0, 0, 0]), 2719, 0)
(array([ 0.24277505,  0.22089988,  0.22296096, ...,  0.21533431,
        0.23986007,  0.21029303]), 2719)
AUC = 1
AUPRC = 1
fold #: 3
(array([1, 1, 1, ..., 0, 0, 0]), 2723, 4)
(array([ 0.24538297,  0.34013731,  0.25014099, ...,  0.36213721,
        0.26567749,  0.339731  ]), 2723)
AUC = 0.473887458624
AUPRC = 0.00130514184082
fold #: 4
(array([0, 0, 0, ..., 0, 0, 0]), 2719, 0)
(array([ 0.19455563,  0.3292012 ,  0.20339413, ...,  0.24159699,
        0.20741329,  0.32264759]), 2719)
AUC = 1
AUPRC = 1
fold #: 5
(array([1, 1, 1, ..., 0, 0, 0]), 2749, 30)
(array([ 0.23840745,  0.29399296,  0.29728679, ..., 

(array([1, 1, 1, ..., 0, 0, 0]), 2731, 12)
(array([ 0.34112744,  0.30559867,  0.24196382, ...,  0.30625958,
        0.31584513,  0.31131254]), 2731)
AUC = 0.700655878387
AUPRC = 0.130513600029
avg auc = 0.703902276959
avg auprc = 0.211110351812
Finished ion RF
Finished ligand ion
fold #: 1
(array([1, 1, 1, ..., 0, 0, 0]), 2813, 93)
(array([ 0.2924918 ,  0.29936967,  0.34730406, ...,  0.23632719,
        0.2072403 ,  0.2101702 ]), 2813)
AUC = 0.551221537002
AUPRC = 0.0421862250719
fold #: 2
(array([1, 1, 1, ..., 0, 0, 0]), 2726, 7)
(array([ 0.25739047,  0.23450444,  0.24017232, ...,  0.30929485,
        0.27291918,  0.33090233]), 2726)
AUC = 0.431093364157
AUPRC = 0.00222968129339
fold #: 3
(array([1, 1, 1, ..., 0, 0, 0]), 2751, 32)
(array([ 0.26945436,  0.33262836,  0.3322206 , ...,  0.27210064,
        0.28297932,  0.24680792]), 2751)
AUC = 0.663111438029
AUPRC = 0.0316606459373
fold #: 4
(array([1, 1, 1, ..., 0, 0, 0]), 2724, 5)
(array([ 0.23154515,  0.2952968 ,  0.26832503, ...,  0.