In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
from os import environ, getcwd
import sys

#Make the project's utils directory (../utils relative to the working dir) importable
curr_dir = getcwd()
sys.path.append(curr_dir+"/../utils")
from prop_threshold_funcs import create_positives_datasets_combined, create_negatives_datasets_combined
from prediction_general_funcs import get_features_cols, remove_unimportant_features, test_model_no_performance

#Pick the models-dict module that matches the tuning type.
#The same value is reused further down as part of the output path.
tuning_type = "global_auprc"
#tuning_type = "domain_auc"
if tuning_type != "global_auprc":
    #Any value other than "global_auprc" selects the domain-AUC module
    from generate_models_dict_domain_auc import generate_models_dict, Net
else:
    from generate_models_dict_global_auprc import generate_models_dict, Net

#Stretch the notebook container to the full page width
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
#Identifiers of the input data and of the output directory
pfam_version = "31"
datafile_date = "08.06.18"
input_path = curr_dir+"/../domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
#NOTE(review): this name says rna0.5 while the CV-splits file read below says rna0.25 - confirm which is intended
out_dirname = "comb_dna0.5_rna0.5_ion0.75"

#Flags passed to the positives/negatives dataset builders
zero_prop = True
no_prop = True
all_ligands = False
#NOTE(review): prec_th is not referenced in the visible cells - presumably a precision threshold, confirm it is still needed
prec_th = 0.25
#Number of CV folds and the ligands handled by this notebook
folds_num = 5
ligands = ["dna", "rna", "ion", "peptide", "sm"]

#Features table: tab-separated, first column is used as the index
features_all = pd.read_csv(input_path+filename, index_col=0, sep='\t')
#Features columns names, without the labels (the binding scores)
features_cols = get_features_cols(features_all)
#Return value is ignored; presumably features_all/features_cols are updated in place - verify in utils
remove_unimportant_features(features_all, features_cols, update_features_cols=True)

print("all samples positions #: "+str(features_all.shape[0]))

#CV splits dictionary
#NOTE(review): pickle.load can execute arbitrary code - only load splits files from a trusted source
splits_path = curr_dir+"/../CV_splits/pfam-v"+pfam_version+"/domain_"+str(folds_num)+"_folds_combined_dna0.5_rna0.25_ion0.75_prec_dict.pik"
with open(splits_path, 'rb') as handle:
    splits_dict = pickle.load(handle)

all samples positions #: 44872


#### Dataset of negative examples

In [3]:
#Build the negative (non-binding) tables; the result is indexed by ligand name further down.
#The per-ligand counts shown under this cell appear to be printed by the helper itself.
ligands_negatives_df = create_negatives_datasets_combined(
    zero_prop,
    no_prop,
    features_all,
    features_cols,
    all_ligands,
)

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697


#### Datasets of positive examples by ligand

In [4]:
#Build the positive (binding) tables; the result is indexed by ligand name further down.
#The per-ligand counts shown under this cell appear to be printed by the helper itself.
ligands_positives_df = create_positives_datasets_combined(
    features_all,
    features_cols,
    all_ligands,
)

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 206
rnabase #: 118
rnabackbone #: 136
rna combined #: 247
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




#### Dataset of positive examples - all ligands combined

In [5]:
#Stack the positives of every ligand in `ligands` (the config list, so this cell
#cannot drift from the per-ligand loop that also iterates over `ligands`).
#drop_duplicates() compares row values, so a row shared by several ligands is kept once.
all_ligands_positives_df = pd.concat(
    [ligands_positives_df[ligand_name] for ligand_name in ligands]
).drop_duplicates()
print("all_ligands pos#: "+str(all_ligands_positives_df.shape[0]))
#Store the union under its own key next to the per-ligand tables
ligands_positives_df["all_ligands"] = all_ligands_positives_df

all_ligands pos#: 2055


#### Dataset of negative examples - all ligands combined

In [6]:
#Stack the negatives of every ligand in `ligands` (the config list, so this cell
#cannot drift from the per-ligand loop that also iterates over `ligands`).
#drop_duplicates() compares row values, so a row shared by several ligands is kept once.
all_ligands_negatives_df = pd.concat(
    [ligands_negatives_df[ligand_name] for ligand_name in ligands]
).drop_duplicates()
print("all_ligands neg#: "+str(all_ligands_negatives_df.shape[0]))
#Store the union under its own key next to the per-ligand tables
ligands_negatives_df["all_ligands"] = all_ligands_negatives_df

all_ligands neg#: 44855


#### Dataset of all positions examples - all ligands combined

In [7]:
#Every position used as a positive or as a negative for any ligand, identical rows kept once
all_positions_used_df = pd.concat([
    ligands_positives_df["all_ligands"],
    ligands_negatives_df["all_ligands"],
]).drop_duplicates()
print("all_ligands #: "+str(all_positions_used_df.shape[0]))

all_ligands #: 44860


### Datasets of missing positions (for each ligand: positions that are used in the training data of other ligands but not in its own)

In [8]:
#ligand name -> table of positions used for other ligands but absent from this ligand's own data
ligands_missing_df = {}

for ligand in ligands:
    #Negatives and positives of this ligand in one table, identical rows kept once
    ligand_df = pd.concat([ligands_negatives_df[ligand], ligands_positives_df[ligand]]).drop_duplicates()

    #Index labels present in the overall table but not in this ligand's table
    idx_diff = all_positions_used_df.index.difference(ligand_df.index)

    #Keep the full rows of those missing positions
    ligands_missing_df[ligand] = all_positions_used_df.loc[idx_diff]

    print(ligand+" has "+str(ligands_missing_df[ligand].shape[0])+" missing positions")

dna has 579 missing positions
rna has 893 missing positions
ion has 4879 missing positions
peptide has 3319 missing positions
sm has 11338 missing positions


### Reading env input for ligand, fold and classifier

In [9]:
#Run parameters are taken from environment variables; each falls back to a
#default when its variable is not set. environ.get() is used instead of a
#try/except so that only a missing variable triggers the default and no
#unrelated error is silently swallowed.

#Reading the ligand input
ligand = environ.get('ligand', "dna")
print("ligand = "+ligand)

#Reading the fold input (kept as a string; it is converted with int() where the model is selected)
fold = environ.get('fold', "1")
print("fold = "+fold)

#Reading the classifier input (used as a key into the models dictionary)
classifier_method = environ.get('classifier', "XGB")
print("classifier_method = "+classifier_method)

ligand = dna
fold = 1
classifier_method = XGB


In [10]:
#Number of feature columns, handed to the models factory
no_features = len(features_cols)
#Build the models dictionary; it is indexed below as [classifier][ligand][fold]
models_dict = generate_models_dict(
    ligand,
    classifier_method,
    ligands,
    ligands_positives_df,
    ligands_negatives_df,
    folds_num,
    no_features,
)

### Predicting the missing positions, each time with a different fold's training set

In [11]:
#Passed in empty and converted to a table afterwards - presumably filled in place by the helper
pred_dict = defaultdict(list)
#Model for the selected classifier, ligand and fold (fold is a string, hence int())
model = models_dict[classifier_method][ligand][int(fold)]

ordered_features = test_model_no_performance(
    pred_dict,
    ligands_positives_df[ligand],
    ligands_negatives_df[ligand],
    ligands_missing_df[ligand],
    ligand,
    model,
    classifier_method,
    fold,
)

pred_df = pd.DataFrame.from_dict(pred_dict)

#Save to file (tab-separated). The target directory must already exist - to_csv does not create it.
out_dir = curr_dir+"/missing_predictions/"+datafile_date+"_"+out_dirname+"/"+tuning_type+"/per_fold/"
out_name = ligand+"_"+classifier_method+"_fold"+fold+"_"+str(folds_num)+"w.csv"
pred_df.to_csv(out_dir+out_name, sep='\t')

print("Finished ligand "+ligand)

fold #: 1
Finished dna XGB fold: 1
Finished ligand dna
