In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from sklearn import metrics

import matplotlib.pyplot as plt
import random as random
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import zero_one_loss
import sklearn.linear_model
from sklearn import svm
from sklearn.metrics import roc_curve, auc, precision_recall_curve,average_precision_score

from sklearn import linear_model #TODO: more models
from sklearn import tree
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from scipy import sparse

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.metrics import roc_curve
from sklearn.preprocessing import scale

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
curr_dir = !pwd
input_path = curr_dir[0]+"/../9.Features_exploration/binding_df/10/"
filename = "positions_features_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples #: "+str(features_all.shape[0])

#Significantly similar pairs table
significantly_similar_pairs = pd.read_csv(curr_dir[0]+"/../10.Prediction/domains_similarity/sig_pairs_pval0.001_score10.csv", sep='\t', index_col=0)

all samples #: 54833


#### Dataset of negative examples

In [3]:
#Negative examples: positions whose maximal binding score is exactly 0, feature columns only
non_binding_positions = features_all.loc[features_all["max_binding_score"] == 0, features_cols]
print("non_binding #: "+str(len(non_binding_positions)))

non_binding #: 37359


#### Datasets of positive examples by ligand

In [4]:
#Minimal binding score for a position to count as a positive example
bind_th = 0.1

#Ligand name -> features of the positions binding that ligand
ligands_features_df = {}

for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    passes_threshold = features_all[score_col_str] >= bind_th
    ligand_binding_df = features_all[passes_threshold]
    print(ligand+" #: "+str(len(ligand_binding_df)))
    ligands_features_df[ligand] = ligand_binding_df[features_cols]

dna #: 749
dnabase #: 279
dnabackbone #: 625
rna #: 629
rnabase #: 322
rnabackbone #: 458
peptide #: 2173
ion #: 1691
metabolite #: 2109


#### Dataset of positive examples - all ligands combined

In [5]:
#Stacking order of the per-ligand tables (drop_duplicates keeps the first occurrence of a row)
combine_order = ["dna", "dnabase", "dnabackbone", "rna", "rnabase",
                 "rnabackbone", "ion", "peptide", "metabolite"]

stacked_ligands_df = pd.concat([ligands_features_df[ligand_key] for ligand_key in combine_order])

#Rows with identical feature values are kept once
all_ligands_features_df = stacked_ligands_df.drop_duplicates()
print("all_ligands #: "+str(len(all_ligands_features_df)))

all_ligands #: 6486


### Models tested (and their hyper-parameters)

In [6]:
#Model name -> estimator instance; the names are the keys used in all result tables
classifiers = {
    "Logistic": LogisticRegression(),
    "RF": ensemble.RandomForestRegressor(n_estimators=1000),
    "KNN": neighbors.KNeighborsRegressor(n_neighbors=100),
    "Lasso": linear_model.Lasso(alpha=0.5),
    "Ridge": linear_model.RidgeClassifier(alpha=0.5),
    "SVM": svm.SVC(kernel='rbf', probability=True),
}

### Test model functions

In [11]:
def sample(iterator, k):
    """
    Samples k elements from an iterable object (reservoir sampling).

    The input is traversed exactly once and only k items are held at a time.

    :param iterator: an object that is iterable (list, range, generator, iterator, ...)
    :param k: the number of items to sample
    :return: a list of k sampled items
    :raises ValueError: if the input holds fewer than k items
    """
    # iter() lets plain iterables (e.g. lists) work; an iterator is returned unchanged
    stream = iter(iterator)

    # fill the reservoir to start
    result = []
    while len(result) < k:
        try:
            result.append(next(stream))
        except StopIteration:
            # Fail with an explicit error instead of leaking a bare StopIteration
            raise ValueError("cannot sample "+str(k)+" items from an input of only "+str(len(result))+" items")

    # n is the 0-based position of the current item in the stream
    n = k - 1
    for item in stream:
        n += 1
        # randint is inclusive on both ends, so the item is kept with probability k/(n+1)
        s = random.randint(0, n)
        if s < k:
            result[s] = item

    return result

In [8]:
def test_model(pred_dict, auc_dict, auprc_dict, ligand_bind_features, ligand_name, features=None, n_splits=10):
    """
    Test different models in stratified k-fold cross-validation (10 folds by default).

    Rows of ligand_bind_features are labelled 1 and rows of the global
    non_binding_positions table are labelled 0. In every fold the training
    negatives are down-sampled to the number of training positives; the test
    fold is left as produced by the split.

    :param pred_dict: dict of lists; "obs", "prob", "fold" and "model" are extended with one entry per test row
    :param auc_dict: dict of lists; the ROC AUC of every fold is appended under the model name
    :param auprc_dict: dict of lists; the area under the precision-recall curve of every fold is appended under the model name
    :param ligand_bind_features: features DataFrame of the positive examples
    :param ligand_name: name used in the progress messages
    :param features: column selector passed to .iloc; None or an empty sequence keeps all columns
    :param n_splits: number of cross-validation folds
    :return: dict mapping model name to the test rows of all folds, in the order their
             predictions were added to pred_dict (z-scored values for the scaled models)
    """

    #Default: Exclude no features
    if features is None or len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)

    models_req_scaling = ["SVM", "KNN"]
    models_with_proba = ["Logistic", "SVM"]

    #X and y with the included features - identical for all the models, so built once
    X_raw = pd.concat([ligand_bind_features.iloc[:,features], non_binding_positions.iloc[:,features]])
    y = np.array([1] * ligand_bind_features.shape[0] + [0] * non_binding_positions.shape[0])

    #Arranging the features table by the CV order, for each model
    features_pred_dfs = {}

    for classifier in classifiers:

        #The same estimator instance is re-fitted in every fold
        model = classifiers[classifier]

        if classifier in models_req_scaling:
            #NOTE(review): z-scoring uses all the rows, including the rows of the test folds
            X = pd.DataFrame(scale(X_raw), index=X_raw.index, columns=X_raw.columns)
        else:
            X = X_raw

        binding_skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

        #Scores and test rows of this call only (the caller's dicts may already hold entries)
        fold_auc = []
        fold_auprc = []
        test_folds = []

        for pred_idx, (train_index, test_index) in enumerate(binding_skf.split(X, y), 1):
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y[train_index], y[test_index]

            #Down-sample negative examples to have balanced classes
            negative_idx = np.where(y_train == 0)[0].tolist()
            positive_idx = np.where(y_train == 1)[0].tolist()
            sampled_negative_idx = sample(iter(negative_idx), len(positive_idx))
            train_rows = positive_idx + sampled_negative_idx
            X_train_sampled = X_train.iloc[train_rows]
            y_train_sampled = y_train[train_rows]

            #fit to training data
            model.fit(X_train_sampled, y_train_sampled)

            #One score per test row; higher means more likely to be positive
            if classifier in models_with_proba:
                scores = model.predict_proba(X_test)[:, 1]
            elif classifier == "Ridge":
                scores = model.decision_function(X_test)
            else:
                scores = model.predict(X_test)

            pred_dict["obs"].extend(y_test)
            pred_dict["prob"].extend(scores)
            pred_dict["fold"].extend([pred_idx] * len(scores))
            pred_dict["model"].extend([classifier] * len(scores))

            fold_auc.append(metrics.roc_auc_score(y_test, scores))
            precision, recall, _ = precision_recall_curve(y_test, scores)
            fold_auprc.append(auc(recall, precision))

            #Update features table
            test_folds.append(X_test)

        auc_dict[classifier].extend(fold_auc)
        auprc_dict[classifier].extend(fold_auprc)

        #A single concat instead of growing a DataFrame fold by fold
        features_pred_dfs[classifier] = pd.concat(test_folds)

        #Averaging over the folds that were actually run
        avg_auc = np.mean(fold_auc)
        print("avg auc = "+str(avg_auc))

        avg_auprc = np.mean(fold_auprc)
        print("avg auprc = "+str(avg_auprc))

        #Report the name that was passed in, not the notebook-level "ligand" variable
        print("Finished "+ligand_name+" "+classifier)

    return features_pred_dfs

In [9]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    """
    Attach every model's predictions to the feature rows they were made on and save one table per model.

    :param ligand: ligand name, used in the name of the saved files
    :param ordered_features: dict mapping model name to a features DataFrame with one row per
                             prediction of that model, in the same order (the value returned by test_model)
    :param pred_df: predictions DataFrame with a "model" column holding the model name of each row
    """
    out_dir = curr_dir[0]+"/ligand_df/features_pred_tables/"

    for classifier in classifiers:
        model_features = ordered_features[classifier]

        #Boolean indexing yields a new frame, so re-labelling its rows leaves pred_df untouched
        model_pred = pred_df[pred_df["model"] == classifier]
        model_pred.index = model_features.index

        #Creating the combined table: features and predictions side by side
        features_pred = pd.concat([model_features, model_pred], axis=1)

        #Saving
        features_pred.to_csv(out_dir+ligand+"_"+classifier+"_features_pred.csv", sep='\t')

#### Predict for each ligand separately

In [18]:
%%time

for ligand in ligands:

    #Fresh result containers for this ligand (filled in place by test_model)
    pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)

    ordered_features = test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df[ligand], ligand)

    pred_df = pd.DataFrame.from_dict(pred_dict)
    auc_df = pd.DataFrame.from_dict(auc_dict)
    auprc_df = pd.DataFrame.from_dict(auprc_dict)

    #Save to file: one table per result type
    out_prefix = curr_dir[0]+"/ligand_df/01.25.2018/"+ligand
    for result_table, suffix in [(pred_df, "_0.1.csv"), (auc_df, "_0.1_auc.csv"), (auprc_df, "_0.1_auprc.csv")]:
        result_table.to_csv(out_prefix+suffix, sep='\t')

    #Combine features and pred results to a unified table
    combine_features_predictions(ligand, ordered_features, pred_df)

    print("Finished ligand "+ligand)

avg auc = 0.807908379803
avg auprc = 0.0718203826244
Finished rna KNN
avg auc = 0.888809984789
avg auprc = 0.146461714792
Finished rna SVM
avg auc = 0.859261514657
avg auprc = 0.113903489429
Finished rna Ridge
avg auc = 0.955869173778
avg auprc = 0.263549000153
Finished rna RF
avg auc = 0.644756669837
avg auprc = 0.0424191962074
Finished rna Logistic
avg auc = 0.686641066316
avg auprc = 0.0571219575953
Finished rna Lasso
Finished ligand rna
avg auc = 0.818176381624
avg auprc = 0.0890206458134
Finished rnabase KNN
avg auc = 0.890224044294
avg auprc = 0.0838808521701
Finished rnabase SVM
avg auc = 0.82007119179
avg auprc = 0.04633624631
Finished rnabase Ridge
avg auc = 0.930377088449
avg auprc = 0.187844933024
Finished rnabase RF
avg auc = 0.63093895851
avg auprc = 0.0184075512992
Finished rnabase Logistic
avg auc = 0.683376064015
avg auprc = 0.0328920679336
Finished rnabase Lasso
Finished ligand rnabase
avg auc = 0.79495017882
avg auprc = 0.0800132796604
Finished rnabackbone KNN
avg auc

#### Classifier for all ligands - combined

In [17]:
%%time

ligand = "all_ligands"
print("Starting all_ligands")

#Result containers, filled in place by test_model
pred_dict, auc_dict, auprc_dict = defaultdict(list), defaultdict(list), defaultdict(list)
test_model(pred_dict, auc_dict, auprc_dict, all_ligands_features_df, ligand)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file: one table per result type
out_prefix = curr_dir[0]+"/ligand_df/01.25.2018/"+ligand
for result_table, suffix in [(pred_df, "_0.1.csv"), (auc_df, "_0.1_auc.csv"), (auprc_df, "_0.1_auprc.csv")]:
    result_table.to_csv(out_prefix+suffix, sep='\t')

Starting all_ligands
avg auc = 0.680981544722
avg auprc = 0.314704292157
Finished all_ligands KNN
avg auc = 0.724265555611
avg auprc = 0.349210885958
Finished all_ligands SVM
avg auc = 0.713613532384
avg auprc = 0.335252507346
Finished all_ligands Ridge
avg auc = 0.770406766222
avg auprc = 0.426651368058
Finished all_ligands RF
avg auc = 0.616142666708
avg auprc = 0.248897348078
Finished all_ligands Logistic
avg auc = 0.611034444117
avg auprc = 0.239598271444
Finished all_ligands Lasso
CPU times: user 4h 58min 14s, sys: 1min 14s, total: 4h 59min 28s
Wall time: 4h 57min 56s


### Predict without the most conserved positions

In [34]:
%%time

for ligand in ligands:

    #Fresh result containers for this ligand (filled in place by test_model)
    pred_dict = defaultdict(list)
    auc_dict = defaultdict(list)
    auprc_dict = defaultdict(list)

    #Positive examples restricted to the positions whose pfam_prob_max is at most 0.5
    ligands_features_df_not_con = ligands_features_df[ligand][ligands_features_df[ligand]["pfam_prob_max"] <= 0.5]
    print("shape = "+str(ligands_features_df_not_con.shape[0]))

    test_model(pred_dict, auc_dict, auprc_dict, ligands_features_df_not_con, ligand+"_not_con")

    pred_df = pd.DataFrame.from_dict(pred_dict)
    auc_df = pd.DataFrame.from_dict(auc_dict)
    #Single assignment (was a typo'd chained "auprc_df = auprc_df = ...")
    auprc_df = pd.DataFrame.from_dict(auprc_dict)

    #Save to file
    pred_df.to_csv(curr_dir[0]+"/ligand_df/"+ligand+"_not_con_0.1.csv", sep='\t')
    auc_df.to_csv(curr_dir[0]+"/ligand_df/"+ligand+"_not_con_0.1_auc.csv", sep='\t')
    auprc_df.to_csv(curr_dir[0]+"/ligand_df/"+ligand+"_not_con_0.1_auprc.csv", sep='\t')

    print("Finished ligand "+ligand)

shape = 643
hi
avg auc = 0.753515280818
avg auprc = 0.090082693265
Finished dna KNN
avg auc = 0.773948716203
avg auprc = 0.13535452596
Finished dna SVM
avg auc = 0.760963304856
avg auprc = 0.126490168357
Finished dna Ridge
avg auc = 0.762135824511
avg auprc = 0.0979357495207
Finished dna RF
avg auc = 0.692370696979
avg auprc = 0.0844068296964
Finished dna Logistic
avg auc = 0.688980907691
avg auprc = 0.078198742127
Finished dna Lasso
Finished ligand dna
shape = 241
hi
avg auc = 0.728183940079
avg auprc = 0.0795176295513
Finished dnabase KNN
avg auc = 0.761036794722
avg auprc = 0.0572367418724
Finished dnabase SVM
avg auc = 0.659274798277
avg auprc = 0.0478662636617
Finished dnabase Ridge
avg auc = 0.804137166289
avg auprc = 0.0405892994931
Finished dnabase RF
avg auc = 0.689096152335
avg auprc = 0.0658911321308
Finished dnabase Logistic
avg auc = 0.682180494462
avg auprc = 0.0553499999266
Finished dnabase Lasso
Finished ligand dnabase
shape = 535
hi
avg auc = 0.769711754916
avg auprc =

### Predict all ligands together without the most conserved positions

In [None]:
%%time

#Combined positive examples restricted to the positions whose pfam_prob_max is at most 0.5
all_ligands_features_df_not_con = all_ligands_features_df[all_ligands_features_df["pfam_prob_max"] <= 0.5]

#Result containers, filled in place by test_model
pred_dict = defaultdict(list)
auc_dict = defaultdict(list)
auprc_dict = defaultdict(list)

ligand = "all_ligands"
print("shape = "+str(all_ligands_features_df_not_con.shape[0]))

test_model(pred_dict, auc_dict, auprc_dict, all_ligands_features_df_not_con, ligand)

pred_df = pd.DataFrame.from_dict(pred_dict)
auc_df = pd.DataFrame.from_dict(auc_dict)
#Built from this run's results; without this line the auprc file below would be
#written from whatever auprc_df an earlier cell left in the kernel
auprc_df = pd.DataFrame.from_dict(auprc_dict)

#Save to file
pred_df.to_csv(curr_dir[0]+"/ligand_df/"+ligand+"_not_con_0.1.csv", sep='\t')
auc_df.to_csv(curr_dir[0]+"/ligand_df/"+ligand+"_not_con_0.1_auc.csv", sep='\t')
auprc_df.to_csv(curr_dir[0]+"/ligand_df/"+ligand+"_not_con_0.1_auprc.csv", sep='\t')

shape = 5554
hi
avg auc = 0.60893547414
avg auprc = 0.228060714192
Finished all_ligands KNN
avg auc = 0.646094934738
avg auprc = 0.221615568947
Finished all_ligands SVM
avg auc = 0.645545150528