In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

#Classifier imports
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, RidgeClassifier, Lasso
from sklearn.svm import SVC

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale

#Downsamplers imports - prototype generation
from imblearn.under_sampling import ClusterCentroids

#Downsamplers imports - prototype selection - controlled
from imblearn.under_sampling import RandomUnderSampler, NearMiss

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours

#Downsamplers imports - prototype selection - Cleaning techniques - Condensed nearest neighbors and derived algorithms
from imblearn.under_sampling import CondensedNearestNeighbour, OneSidedSelection, NeighbourhoodCleaningRule

#Downsamplers imports - prototype selection - Cleaning techniques
from imblearn.under_sampling import InstanceHardnessThreshold

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

#import random as random
#from sklearn import metrics
#from sklearn.datasets import make_classification
#import statsmodels.api as sm

### Reading the input dataset

In [2]:
curr_dir = !pwd
input_path = curr_dir[0]+"/domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples #: "+str(features_all.shape[0])

all samples #: 38944


#### Dataset of negative examples

In [3]:
#Negative examples: rows whose maximal binding score is exactly 0, feature columns only
is_non_binding = features_all["max_binding_score"] == 0
non_binding_positions = features_all.loc[is_non_binding, features_cols]
print("non_binding #: %d" % len(non_binding_positions))

non_binding #: 27191


#### Datasets of positive examples by ligand

In [4]:
#Minimal binding score for a row to count as a positive example of a ligand
bind_th = 0.1

#ligand name -> feature columns of the rows that pass the threshold for that ligand
ligands_features_df = {}

for ligand in ligands:
    score_col_str = "%s_binding_score" % ligand
    ligand_binding_df = features_all.loc[features_all[score_col_str] >= bind_th]
    ligands_features_df[ligand] = ligand_binding_df.loc[:, features_cols]
    print("%s #: %d" % (ligand, ligand_binding_df.shape[0]))

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


#### Dataset of positive examples - all ligands combined

In [5]:
#Stack the per-ligand positive sets in a fixed order (note: ion comes before peptide here)
#and keep a single copy of rows that appear under more than one ligand
combined_order = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "ion", "peptide", "metabolite"]
per_ligand_frames = [ligands_features_df[lig_name] for lig_name in combined_order]
all_ligands_features_df = pd.concat(per_ligand_frames).drop_duplicates()
print("all_ligands #: %d" % all_ligands_features_df.shape[0])

all_ligands #: 4518


### Models tested (and their hyper-parameters)

In [6]:
#Models compared in the cross-validation, keyed by the short name used in the result tables.
#The stochastic models (random forest; SVC with probability=True, which runs an internal
#cross-validation to calibrate probabilities) are seeded with random_state=0, the same seed
#used for the CV splitter and the downsamplers, so repeated runs give the same scores.
classifiers = {}
classifiers["Logistic"] = LogisticRegression()
classifiers["RF"] = RandomForestRegressor(n_estimators=1000, random_state=0)
classifiers["KNN"] = KNeighborsRegressor(n_neighbors=100)
classifiers["Lasso"] = Lasso(alpha=0.5)
classifiers["Ridge"] = RidgeClassifier(alpha=0.5)
classifiers["SVM"] = SVC(kernel='rbf', probability=True, random_state=0)

### Downsamplers tested

In [7]:
#documentation on techniques: http://contrib.scikit-learn.org/imbalanced-learn/stable/under_sampling.html#cleaning-under-sampling-techniques
#Under-samplers keyed by name, all created with random_state=0
downsamplers = defaultdict(dict)

##Prototype generation##
downsamplers["ClusterCentroids"] = ClusterCentroids(random_state=0)

##Prototype selection##
#Controlled#
downsamplers["RandomUnderSampler"] = RandomUnderSampler(random_state=0)
for nearmiss_version in (3, 2, 1):
    downsamplers["NearMiss%d" % nearmiss_version] = NearMiss(random_state=0, version=nearmiss_version)

#Cleaning#
downsamplers.update({
    "TomekLinks": TomekLinks(random_state=0),
    "EditedNearestNeighbours": EditedNearestNeighbours(random_state=0),
    "RepeatedEditedNearestNeighbours": RepeatedEditedNearestNeighbours(random_state=0),
    "NeighbourhoodCleaningRule": NeighbourhoodCleaningRule(random_state=0),
})

#Instance hardness threshold#
#One sampler per model in `classifiers`; this entry is a dict keyed by the model name,
#not a sampler object itself
downsamplers["InstanceHardnessThreshold"] = {
    estimator_name: InstanceHardnessThreshold(random_state=0, estimator=classifiers[estimator_name])
    for estimator_name in ("KNN", "SVM", "RF", "Lasso", "Ridge", "Logistic")
}

### Test model functions

In [8]:
def test_model(pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_name, downsample_method, features=[]):
    """
    Evaluate every model in `classifiers` with 10-fold stratified cross-validation.

    Positive examples are the rows of `ligand_bind_features`; negative examples are
    the rows of the module-level `non_binding_positions`. In every fold the training
    part is under-sampled with the requested sampler, the model is fitted on the
    sampled data and scored on the untouched test part.

    Parameters
    ----------
    pred_dict : dict of lists (e.g. defaultdict(list))
        Extended in place, one entry per test sample, under the keys
        "obs" (true label), "prob" (model score), "fold" and "model".
    auc_dict, auprc_dict : dict of lists (e.g. defaultdict(list))
        One ROC-AUC / precision-recall-AUC value per fold is appended under the
        classifier name.
    ligand_bind_features : pandas.DataFrame
        Features of the positive examples.
    ligand_name : str
        Label used in the progress messages.
    downsample_method : str
        Key of a sampler object in `downsamplers`.
    features : sequence, optional
        Positional column selector passed to `.iloc`. Empty (the default) or None
        keeps all columns.

    Returns
    -------
    dict
        classifier name -> DataFrame with the test rows of all folds, stacked in
        cross-validation order (the order in which `pred_dict` was extended).

    Raises
    ------
    ValueError
        If `downsample_method` does not map to an object exposing `fit_sample`.
    """
    n_folds = 10
    #Models scored through predict_proba (probability of the positive class)
    proba_models = ("Logistic", "SVM")
    models_req_scaling = ("SVM", "KNN")

    #Resolve the sampler once. `.get` avoids inserting an empty entry into the defaultdict,
    #and the check rejects unknown names as well as keys that hold a nested dict of samplers.
    downsampler = downsamplers.get(downsample_method)
    if not hasattr(downsampler, "fit_sample"):
        raise ValueError("Unknown or unusable downsample method: "+str(downsample_method))

    #Default: Exclude no features
    if features is None or len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    #Positives first, then negatives; y follows the same row order
    X_raw = pd.concat([ligand_bind_features.iloc[:,features], non_binding_positions.iloc[:,features]])
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * non_binding_positions.shape[0])

    #Features table arranged by the CV order, for each model
    features_pred_dfs = {}

    for classifier in classifiers.keys():

        model = classifiers[classifier]

        if classifier in models_req_scaling:
            #NOTE(review): z-scoring is computed on the full table (train and test rows together)
            #before the split - confirm this is intended.
            X = pd.DataFrame(scale(X_raw), index=X_raw.index, columns=X_raw.columns)
        else:
            X = X_raw

        binding_skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
        fold_aucs = []
        fold_auprcs = []
        fold_test_sets = []

        for fold_idx, (train_index, test_index) in enumerate(binding_skf.split(X, y), 1):
            print("fold #: "+str(fold_idx))
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y[train_index], y[test_index]

            #Down-sample the training part only; the test part keeps its original class ratio
            X_train_sampled, y_train_sampled = downsampler.fit_sample(X_train, y_train)

            #fit to training data
            model.fit(X_train_sampled, y_train_sampled)

            #One score per test sample; higher means more likely positive
            if classifier in proba_models:
                scores = model.predict_proba(X_test)[:, 1]
            elif classifier == "Ridge":
                scores = model.decision_function(X_test)
            else:
                scores = model.predict(X_test)

            pred_dict["obs"].extend(y_test)
            pred_dict["prob"].extend(scores)
            pred_dict["fold"].extend([fold_idx] * len(scores))
            pred_dict["model"].extend([classifier] * len(scores))

            fold_auc = roc_auc_score(y_test, scores)
            precision, recall, _ = precision_recall_curve(y_test, scores)
            fold_auprc = auc(recall, precision)

            auc_dict[classifier].append(fold_auc)
            auprc_dict[classifier].append(fold_auprc)
            fold_aucs.append(fold_auc)
            fold_auprcs.append(fold_auprc)

            #Keep the test rows; they are stacked once after the last fold
            fold_test_sets.append(X_test)

        features_pred_dfs[classifier] = pd.concat(fold_test_sets)

        #Averages over the folds of this call only
        avg_auc = np.mean(fold_aucs)
        print("avg auc = "+str(avg_auc))

        avg_auprc = np.mean(fold_auprcs)
        print("avg auprc = "+str(avg_auprc))

        print("Finished "+ligand_name+" "+classifier)

    return features_pred_dfs

In [9]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Write, for every model in `classifiers`, one table joining features and predictions.

    ligand: name used in the output file names.
    ordered_features: mapping of classifier name -> features DataFrame (as returned by test_model).
    pred_df: predictions table with a "model" column naming the classifier of each row.

    For each classifier, the prediction rows are given the index of the matching
    features table, the two are joined column-wise, and the result is saved as a
    tab-separated file. Nothing is returned.
    """
    out_dir = curr_dir[0]+"/pred_AUC_AUPRC/ligand_df/features_pred_tables/"
    predictions = pred_df.copy(deep=True)

    for classifier in classifiers:
        classifier_features = ordered_features[classifier]

        #Predictions of this classifier, re-labelled so they align with the features rows
        classifier_pred = predictions[predictions["model"] == classifier]
        classifier_pred.index = classifier_features.index

        #Creating the combined table and saving it
        combined = pd.concat([classifier_features, classifier_pred], axis=1)
        combined.to_csv(out_dir+ligand+"_"+classifier+"_features_pred.csv", sep='\t')

#### Predict for each ligand separately

In [None]:
%%time

for ligand in ligands:
    ligand = "dnabase"
    downsample_method = "EditedNearestNeighbours"
    #Initialize dictionary
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)
    
    ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligand, downsample_method)
    
    pred_df = pd.DataFrame.from_dict(pred_dict)
    auc_df = pd.DataFrame.from_dict(auc_dict)
    auprc_df = pd.DataFrame.from_dict(auprc_dict)
    
    #Save to file
    #pred_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/ligand_df/01.25.2018/"+ligand+"_0.1.csv", sep='\t')
    #auc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/ligand_df/01.25.2018/"+ligand+"_0.1_auc.csv", sep='\t')
    #auprc_df.to_csv(curr_dir[0]+"/pred_AUC_AUPRC/ligand_df/01.25.2018/"+ligand+"_0.1_auprc.csv", sep='\t')
    
    #Combine features and pred results to a unified table
    #combine_features_predictions(ligand, ordered_features, pred_df)
    
    print "Finished ligand "+ligand
    break

In [14]:
%%time
for train_index, test_index in binding_skf.split(X, y):
    print "fold #: "+str(pred_idx)
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    
    #Down-sample negative examples to have balanced classes
    downsampler = downsamplers[downsample_method]
    X_train_sampled, y_train_sampled = downsampler.fit_sample(X_train, y_train)
    break

fold #: 1
CPU times: user 23min 53s, sys: 1min 19s, total: 25min 12s
Wall time: 3min 51s


#### Classifier for all ligands combined

In [17]:
%%time

pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)

ligand = "all_ligands"
print "Starting all_ligands"
test_model(pred_dict, auc_dict, auprc_dict, all_ligands_features_df, ligand)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
pred_df.to_csv(curr_dir[0]+"/ligand_df/01.25.2018/"+ligand+"_0.1.csv", sep='\t')
auc_df.to_csv(curr_dir[0]+"/ligand_df/01.25.2018/"+ligand+"_0.1_auc.csv", sep='\t')
auprc_df.to_csv(curr_dir[0]+"/ligand_df/01.25.2018/"+ligand+"_0.1_auprc.csv", sep='\t')

Starting all_ligands
avg auc = 0.680981544722
avg auprc = 0.314704292157
Finished all_ligands KNN
avg auc = 0.724265555611
avg auprc = 0.349210885958
Finished all_ligands SVM
avg auc = 0.713613532384
avg auprc = 0.335252507346
Finished all_ligands Ridge
avg auc = 0.770406766222
avg auprc = 0.426651368058
Finished all_ligands RF
avg auc = 0.616142666708
avg auprc = 0.248897348078
Finished all_ligands Logistic
avg auc = 0.611034444117
avg auprc = 0.239598271444
Finished all_ligands Lasso
CPU times: user 4h 58min 14s, sys: 1min 14s, total: 4h 59min 28s
Wall time: 4h 57min 56s
