In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ
import sys

#Classifier imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, RandomForestRegressor, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

#ML framework imports
from sklearn.metrics import auc, roc_auc_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV

# Neural Net imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR

#Import utils functions
curr_dir = !pwd
sys.path.append(curr_dir[0]+"/../utils")
from prop_threshold_funcs import create_negatives_datasets, create_positives_datasets, create_positives_datasets_combined, create_negatives_datasets_combined
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols, remove_unimportant_features
from CV_funcs import add_domain_name_from_table_idx, calc_CV_idx_iterative
from generate_models_dict import generate_models_dict

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
curr_dir = !pwd
pfam_version = "31"
datafile_date = "08.06.18"
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
out_dir = "mediode_NegLigand_NoFilter"

#flags for creating negatives
zero_prop = True
no_prop = True
all_ligands = False
prec_th = 0.25
folds_num = 5
ligands = ["dna", "rna", "ion", "peptide", "sm"]

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
remove_unimportant_features(features_all, features_cols)

print "all samples positions #: "+str(features_all.shape[0])

#CV splits dictionary
# with open(curr_dir[0]+"/CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_"+str(prec_th)+"_prec_dict.pik", 'rb') as handle:
#         splits_dict = pickle.load(handle)
with open(curr_dir[0]+"/../CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_combined_dna0.5_rna0.25_ion0.75_prec_dict.pik", 'rb') as handle:
        splits_dict = pickle.load(handle)

all samples positions #: 44872


#### Dataset of negative examples

In [3]:
ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, features_all, features_cols, all_ligands)

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697


#### Datasets of positive examples by ligand

In [4]:
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands)

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 450
rnabase #: 290
rnabackbone #: 306
rna combined #: 531
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




#### Dataset of positive examples - all ligands combined

In [5]:
all_ligands_positives_df = pd.concat([ligands_positives_df["dna"], ligands_positives_df["rna"], ligands_positives_df["ion"], ligands_positives_df["peptide"], ligands_positives_df["sm"]])                                    
all_ligands_positives_df = all_ligands_positives_df.drop_duplicates()
print "all_ligands pos#: "+str(all_ligands_positives_df.shape[0])
ligands_positives_df["all_ligands"] = all_ligands_positives_df

all_ligands pos#: 2322


#### Dataset of negative examples - all ligands combined

In [6]:
all_ligands_negatives_df = pd.concat([ligands_negatives_df["dna"], ligands_negatives_df["rna"], ligands_negatives_df["ion"], ligands_negatives_df["peptide"], ligands_negatives_df["sm"]])                                    
all_ligands_negatives_df = all_ligands_negatives_df.drop_duplicates()
print "all_ligands neg#: "+str(all_ligands_negatives_df.shape[0])
ligands_negatives_df["all_ligands"] = all_ligands_negatives_df

all_ligands neg#: 44855


#### Dataset of all positions examples examples - all ligands combined

In [7]:
all_positions_used_df = pd.concat([ligands_positives_df["all_ligands"], ligands_negatives_df["all_ligands"]])
all_positions_used_df = all_positions_used_df.drop_duplicates()
print "all_ligands #: "+str(all_ligands_negatives_df.shape[0])

all_ligands #: 44855


### Datasets of missing positions (for each ligand: positions that are in the training of other ligands)

In [8]:
ligands_missing_df = {}

for ligand in ligands:
    #All the ligand positions together
    ligand_df = pd.concat([ligands_negatives_df[ligand],  ligands_positives_df[ligand]])
    ligand_df = ligand_df.drop_duplicates()
    
    #Find indices of the missing positions
    idx_diff = pd.Index.difference(all_positions_used_df.index, ligand_df.index)
    
    #Save a table with all the missing positions
    ligands_missing_df[ligand] = all_positions_used_df.loc[idx_diff]
    
    print ligand+" has "+str(ligands_missing_df[ligand].shape[0])+" missing positions"

dna has 585 missing positions
rna has 615 missing positions
ion has 4885 missing positions
peptide has 3325 missing positions
sm has 11344 missing positions


### Reading env input for downsampler technique, ligand and classifier

In [14]:
#Reading the ligand input
try:
    ligand = environ['ligand']
except:
    ligand = "dna"
print "ligand = "+ligand

#Reading the downsampler input
try: 
    fold = environ['fold']
except:
    fold = "1"
print "fold = "+fold

#Reading the classifier input
try: 
    classifier_method = environ['classifier']
except:
    classifier_method = "ADA"
print "classifier_method = "+classifier_method

ligand = dna
fold = 1
classifier_method = ADA


In [15]:
no_features = len(features_cols)
models_dict = generate_models_dict(ligand, ligands, ligands_positives_df, ligands_negatives_df, folds_num, no_features)

### Predicting the missing positions, each time with a different folds training set

In [20]:
def test_model_iterative_fixed(pred_dict, ligand_bind_features, ligand_negatives_features, ligand_missing_df, ligand_name, features=[]):
    
    """
    Test different models in 10-folds cross-validation.
    """
    
    #Default: Exclude no features
    if len(features) == 0:
        features = np.ones([ligand_bind_features.shape[1],]).astype(bool)
        
    #Arranging the features table by the CV order, for each model
    features_pred_dfs = dict.fromkeys(models_dict.keys())
    
    models_req_scaling = ["SVM", "KNN", "Logistic", "NN"]

    classifier = classifier_method
    features_pred_dfs[classifier] = pd.DataFrame()

    #Create X and y with included features
    X = pd.concat([ligand_bind_features.iloc[:,features], ligand_negatives_features.iloc[:,features]])
    y = [1] * ligand_bind_features.shape[0]
    y.extend([0] * ligand_negatives_features.shape[0])
    y = np.array(y)
    y_df = pd.DataFrame(y)
    y_df.index = X.index
    y_df.columns = ["label"]
    
    #Get the fold indices
    cv_idx = calc_CV_idx_iterative(X, splits_dict)
    k = (int(fold)-1)
    
    pred_idx = k+1
    print "fold #: "+str(pred_idx)
    test_index = cv_idx[k]["test"]
    train_index = cv_idx[k]["train"]
    X_train, X_test = X.loc[train_index,:], X.loc[test_index,:]
    y_train, y_test = y_df.loc[train_index,:], y_df.loc[test_index,:]
    
    if (classifier in models_req_scaling):
        cols = X_train.columns
        scaler = StandardScaler() 
        #scale only using the training data
        scaler.fit(X_train) 
        X_train = pd.DataFrame(scaler.transform(X_train))
        # apply same transformation to test data
        X_test = pd.DataFrame(scaler.transform(X_test))
        #Restoring indices after scaling
        X_train.index = train_index 
        X_test.index = test_index 
        #Restoring features names
        X_train.columns = cols
        X_test.columns = cols

    #No down-sampling
    X_train_sampled = X_train
    y_train_sampled = y_train

    #fit to training data
    model = models_dict[classifier][ligand][int(fold)]
    
    #pos and neg numbers in the training
    no_pos = np.count_nonzero(y_train_sampled["label"] == 1)
    no_neg = np.count_nonzero(y_train_sampled["label"] == 0)  
    if classifier == "NN":     
        #weight vector for NN
        if model.weight == "balanced":              
            #weight vector
            neg_weight = float(no_pos) / float(no_neg + no_pos) 
            pos_weight = 1 - neg_weight
        elif model.weight == "0.1":
            neg_weight = 10
            pos_weight = 1
        elif model.weight == "None":
            neg_weight = 1
            pos_weight = 1

        weight = torch.Tensor([neg_weight, pos_weight])
        model.fit(X_train_sampled, y_train_sampled["label"], weight)
        probs_list = model.predict_proba(X_test)
    
    elif classifier == "ADA":
        print "fiting calibrated model"
        calib_model = CalibratedClassifierCV(base_estimator=model)
        calib_model.fit(X_train_sampled, y_train_sampled["label"])
        probs_list = []
        probs = calib_model.predict_proba(ligand_missing_df)
        for l in probs:
            probs_list.append(l[1])
    else:
        
        model.fit(X_train_sampled, y_train_sampled["label"])
        probs_list = []
        probs = model.predict_proba(ligand_missing_df)
        for l in probs:
            probs_list.append(l[1])

    pred_dict["prob"].extend(probs_list)
    fold_list = [pred_idx] * len(probs_list)
    pred_dict["fold"].extend(fold_list)

    model_list = [classifier] * len(probs_list)
    pred_dict["model"].extend(model_list)
    
    #Adding the position number to the table to help with analysis
    pred_dict["idx"].extend(ligand_missing_df.index)
    
    #Update features table
    features_pred_dfs[classifier] = features_pred_dfs[classifier].append(ligand_missing_df)
    pred_idx += 1

    print "Finished "+ligand+" "+classifier+" fold: "+fold
    
    return (features_pred_dfs, model)

In [12]:
def combine_features_predictions(ligand, ordered_features, pred_df):
    
    pred_res = pred_df.copy(deep=True)
    for classifier in classifiers.keys():
        classifier = classifier_method
        model_pred = pred_res[pred_res["model"] == classifier]
        model_pred.index = ordered_features[classifier].index
        
        #Creating the combined table
        features_pred = pd.concat([ordered_features[classifier], model_pred], axis=1)
        
        #Saving
        features_pred.to_csv(curr_dir[0]+"/between_1st_level_pred/features_pred_tables/"+ligand+"_"+classifier+"_features_pred.csv", sep='\t')
        break

#### Predict for each ligand seperatelly - every training set split will predict for the between labels data points

In [16]:
%%time

pred_dict = defaultdict(list)
downsample_method = "NoDown"

(ordered_features, model) = test_model_iterative_fixed(pred_dict, ligands_positives_df[ligand], ligands_negatives_df[ligand], ligands_missing_df[ligand], ligand)

pred_df = pd.DataFrame.from_dict(pred_dict)

#Save to file
out_dirname = "comb_dna0.5_rna0.25_ion0.75"

pred_df.to_csv(curr_dir[0]+"/missing_predictions/"+datafile_date+"_"+out_dirname+"/per_fold/"+ligand+"_"+classifier_method+"_fold"+fold+"_"+str(folds_num)+"w.csv", sep='\t')

print "Finished ligand "+ligand

fold #: 1
fiting calibrated model


TypeError: float() argument must be a string or a number

In [19]:
ligands_missing_df[ligand].shape

(585, 753)

In [21]:
sys.path.insert()

['', '/home/anat/anaconda2/lib/python27.zip', '/home/anat/anaconda2/lib/python2.7', '/home/anat/anaconda2/lib/python2.7/plat-linux2', '/home/anat/anaconda2/lib/python2.7/lib-tk', '/home/anat/anaconda2/lib/python2.7/lib-old', '/home/anat/anaconda2/lib/python2.7/lib-dynload', '/home/anat/.local/lib/python2.7/site-packages', '/home/anat/anaconda2/lib/python2.7/site-packages', '/home/anat/anaconda2/lib/python2.7/site-packages/Sphinx-1.3.5-py2.7.egg', '/home/anat/anaconda2/lib/python2.7/site-packages/torchvision-0.2.1-py2.7.egg', '/home/anat/anaconda2/lib/python2.7/site-packages/IPython/extensions', '/home/anat/.ipython', '/home/anat/Research/ExAC/10.Prediction/stacking/../utils']
