In [1]:
#Basic imports
import pickle
import subprocess
import sys
from collections import defaultdict
from os import environ, getcwd, listdir

import numpy as np
import pandas as pd

#Directory the notebook is run from; every relative path below is built on top of it
curr_dir = getcwd()
#14.Final model utils functions
sys.path.append(curr_dir+"/../14.Final_Model/utils")
from final_model_funcs import ligands, all_models_list

#10.Prediction stacking utils functions
#NOTE: create_stacked_dataset is defined again further down in this notebook,
#      and that later definition shadows the one imported here
sys.path.append(curr_dir+"/../10.Prediction/stacking/utils")
from stacking_funcs import create_stacked_dataset

#10.Prediction general utils functions
sys.path.append(curr_dir+"/../10.Prediction/utils")
from prediction_general_funcs import get_features_cols, remove_unimportant_features
from tuning_helper_functions import models_req_scaling

### Functions

In [2]:
def get_pickled_model(ens_model, ens_ligand, stacked=False, ens_dir=""):
    """Load the pickled model saved for a given ligand / classifier pair.

    Model files are named "<ligand>_<model>_<trial>.pik". The trial id is not
    known in advance, so the models directory is listed and the first matching
    file (in sorted filename order) is loaded.

    Args:
        ens_model: classifier name as it appears in the filename (e.g. "XGB").
        ens_ligand: ligand name as it appears in the filename (e.g. "sm").
        stacked: when True, look under "stacked_pik_models/<ens_dir>" instead
            of "pik_models" (both relative to curr_dir).
        ens_dir: sub-directory of "stacked_pik_models"; only used when stacked.

    Returns:
        The unpickled model object.

    Raises:
        IOError: if the directory holds no "<ligand>_<model>_*.pik" file.
        OSError: if the models directory cannot be listed.
    """
    #Get the models directory
    if stacked:
        models_path = curr_dir+"/stacked_pik_models/"+ens_dir
    else:
        models_path = curr_dir+"/pik_models"

    #Find the model filename by listing the directory in Python: no shell is
    #involved, so the ligand/model names are never interpreted as shell or regex syntax
    prefix = ens_ligand+"_"+ens_model+"_"
    suffix = ".pik"
    candidates = sorted(
        name for name in listdir(models_path)
        if name.startswith(prefix) and name.endswith(suffix)
        and len(name) > len(prefix)+len(suffix)
    )
    if not candidates:
        raise IOError("No pickled model matching "+prefix+"*"+suffix+" found in "+models_path)
    model_filename = candidates[0]

    #Read the pickled model
    #NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source
    print("Reading "+model_filename)
    with open(models_path+"/"+model_filename, 'rb') as handle:
        pik_model = pickle.load(handle)

    return pik_model

In [3]:
def predcit_using_pickeled_model(pred_dict, pik_model, classifier_method, data, stacked=False, ens_dir=None):
    """Predict with a loaded model and append the results to pred_dict.

    Args:
        pred_dict: dict of lists (e.g. defaultdict(list)); the data index is
            appended under "idx" and the predicted probabilities under "prob".
        pik_model: a loaded model exposing predict_proba().
        classifier_method: classifier name; when it is listed in
            models_req_scaling the saved scaler is applied to the data first.
        data: DataFrame of features to predict on.
        stacked: when True, the scaler is read from
            "stacked_pik_models/<ens_dir>/scaler.pik" instead of "pik_models/scaler.pik".
        ens_dir: sub-directory of "stacked_pik_models" holding the scaler.
            When None, the notebook-level variable `ens` is used, which keeps
            existing calls working.

    Returns:
        None. pred_dict is modified in place.
    """
    data_index = data.index

    #Scale the data if the classifier requires it
    if classifier_method in models_req_scaling:
        #Locate the saved scaler
        if stacked:
            if ens_dir is None:
                #Backward compatibility: fall back to the notebook-level ensemble name
                ens_dir = ens
            scaler_path = curr_dir+"/stacked_pik_models/"+ens_dir+"/scaler.pik"
        else:
            scaler_path = curr_dir+"/pik_models/scaler.pik"

        #NOTE: pickle.load can execute arbitrary code - only load scaler files from a trusted source
        with open(scaler_path, 'rb') as handle:
            scaler = pickle.load(handle)

        #Apply the saved transformation, restoring the indices and features names
        data = pd.DataFrame(scaler.transform(data), index=data_index, columns=data.columns)

    #Predict using the pickled model
    probs = pik_model.predict_proba(data)
    if classifier_method == "NN":
        #NN output is used as is - presumably already one probability per sample (TODO confirm)
        probs_list = probs
    else:
        #Keep the second column of each row of predict_proba's output
        probs_list = [row[1] for row in probs]

    #Arrange predictions in the output dictionary
    pred_dict["idx"].extend(data_index)
    pred_dict["prob"].extend(probs_list)

In [4]:
def create_stacked_dataset(stacking_path, stacking_ligands, stacking_models, features_data, stacking_filename, keep_original_features=True):
    """Build the 2nd-layer features table from the saved 1st-layer predictions.

    For every (ligand, model) pair the tab-separated file
    "<ligand>_<model>_<stacking_filename>.csv" is read from stacking_path. Each
    file must hold exactly two data columns (the sample idx and its probability).
    The probability columns are joined on "idx" into one table whose columns
    are named "<model>_<ligand>_prob".

    NOTE: this definition shadows the create_stacked_dataset imported from
    stacking_funcs at the top of the notebook.

    Args:
        stacking_path: directory of the 1st-layer prediction files, including
            the trailing path separator.
        stacking_ligands: ligand names to combine.
        stacking_models: model names to combine.
        features_data: DataFrame of the original features, indexed like "idx".
        stacking_filename: suffix shared by all the prediction filenames.
        keep_original_features: when True, features_data columns are appended
            to the probability columns.

    Returns:
        DataFrame indexed by "idx" holding the stacked features.

    Raises:
        ValueError: if stacking_ligands or stacking_models is empty.
    """
    df_stacking_combined = None

    for stack_ligand in stacking_ligands:
        for stack_model in stacking_models:

            #Read the stacking-1st level probs of this ligand and model
            stacking1_filename = stack_ligand+"_"+stack_model+"_"+stacking_filename+".csv"
            stacking1_df = pd.read_csv(stacking_path+stacking1_filename, sep='\t', index_col=0)
            stacking1_df.columns = ["idx", stack_model+"_"+stack_ligand+"_prob"]

            #Add to the combined df table; "idx" stays a plain column so the merge key is unambiguous
            if df_stacking_combined is None:
                df_stacking_combined = stacking1_df
            else:
                df_stacking_combined = pd.merge(df_stacking_combined, stacking1_df, on="idx")

    if df_stacking_combined is None:
        raise ValueError("stacking_ligands and stacking_models must both be non-empty")

    #Moving the merged idx column to the index, so every row keeps its own idx
    #regardless of the rows order in the individual files
    df_stacking_combined = df_stacking_combined.set_index("idx")

    #Adding the original features
    if keep_original_features:
        df_stacking_combined = pd.concat([df_stacking_combined, features_data], axis=1)

    print("#(features) = "+str(df_stacking_combined.shape[1]))

    return df_stacking_combined

### Reading the input flags and determining the run parameters

In [5]:
#Run parameters are taken from environment variables, with a default for each one.
#environ.get is used (instead of try / bare except) so that only an unset variable
#falls back to the default and any other error is not silently swallowed.

#Reading the ligand input
ligand = environ.get('ligand', "sm")
print("ligand = "+ligand)

#Reading the ensemble type
ens = environ.get('ens', "ALL")
print("ens = "+ens)

#Reading stacking layer (kept as a string: it is compared against "1" / "2" below)
layer = environ.get('layer', "2")
print("layer = "+layer)

ligand = sm
ens = ALL
layer = 2


In [6]:
#Determine models and ligands based on ensemble type:
#  "LIGAND" - all the models, only the current ligand
#  "MODEL"  - only XGB, all the ligands
#  "ALL" or any other value - all the models, all the ligands
#all_models_list_change_order = ["SVM", "XGB", "NN", "RF", "Logistic"]
hyperparameters = {}
hyperparameters["models"] = ["XGB"] if ens == "MODEL" else all_models_list
hyperparameters["ligands"] = [ligand] if ens == "LIGAND" else ligands

#Output directory name per ensemble type; unrecognized types fall back to "just_probs"
out_dir = {
    "LIGAND": "ligand_features_probs",
    "MODEL": "model_features",
    "ALL": "all_features_probs",
}.get(ens, "just_probs")
print(hyperparameters)

{'models': ['XGB', 'RF', 'SVM', 'Logistic', 'NN'], 'ligands': ['dna', 'rna', 'ion', 'peptide', 'sm']}


### Get features data

In [7]:
#Load the features table (tab-separated, the first column is used as the index)
filename_number = "fixed"
data_path = curr_dir+"/../9.Features_exploration/features_tables_v32/"
data_filename = "windowed_positions_features_"+filename_number+"_10.24.19.csv"
features_data = pd.read_csv(data_path+data_filename, sep='\t', index_col=0)
print("test samples positions #: "+str(len(features_data)))

test samples positions #: 7102


In [8]:
#Drop the positions of domains with spider problems (skipped for the "fixed" table)
if filename_number != "fixed":
    spider_problems_domains_list = [
        "Ank_2", "Ank_4", "Ank_5", "Asp", "CD45", "Cys_knot", "DENN", "DUF1908",
        "DUF4187", "EF-hand_1", "EF-hand_5", "EF-hand_6", "EF-hand_7", "EF-hand_8",
        "EF-hand_9", "EFhand_Ca_insen", "ELM2", "Exo_endo_phos", "FYVE", "G-patch",
        "G-patch_2", "GRAM", "GSHPx", "IQ_SEC7_PH", "LRR_1", "LRR_12", "LRR_6",
        "LRR_8", "MFS_1", "Myb_DNA-binding", "Myotub-related", "PDZ", "PDZ_6", "PH",
        "PNMA", "PTP_N", "Pkinase", "Pkinase_Tyr", "RNase_T", "RUN", "Rdx", "SBF2",
        "SKICH", "Sec7", "SelP_C", "SelP_N", "SelR", "Sep15_SelM", "T4_deiodinase",
        "TAXi_N", "TIR", "Trefoil", "V-set", "WAP", "Y_phosphatase", "dDENN", "fn3",
        "uDENN", "zf-C2H2", "zf-C2H2_4", "zf-CCCH", "zf-H2C2_5",
    ]
    #Keep only the rows whose domain is not in the list
    features_data = features_data.loc[~features_data["domain_name"].isin(spider_problems_domains_list)]
    print("test samples positions #: "+str(len(features_data)))

In [9]:
#Get list of features that we use
features_cols = get_features_cols(features_data)
print "# of features before removal: "+str(len(features_cols))
#The return value is discarded, yet the count printed below is lower than the one above
#(761 -> 753 in the saved output), so features_cols is apparently updated in place
#when update_features_cols=True - TODO confirm against prediction_general_funcs
remove_unimportant_features(features_data, features_cols, update_features_cols=True)
print "# of features after removal: "+str(len(features_cols))

#Filter data to just these features
#NOTE: features_data is overwritten here, so this cell is not idempotent -
#      reload the data before re-running it
features_data = features_data.loc[:,features_cols]

# of features before removal: 761
# of features after removal: 753


In [10]:
#Report every column (other than "domain_name") that holds at least one NaN
for col in features_data.columns:
    if col != "domain_name":
        #Positions of the NaN values within this column
        nan_idx = np.flatnonzero(np.isnan(features_data[col].tolist()))
        if nan_idx.size > 0:
            print(col+" has NaNs")

### 1st layer predictions

In [11]:
#Load the pickled 1st layer model of every (model, ligand) pair,
#keyed by "<ligand>_<model>"
if layer == "1":
    first_layer_models = {}
    for ens_model in hyperparameters["models"]:
        #NN is skipped here - this should be run on the gpu
        if ens_model != "NN":
            for ens_ligand in hyperparameters["ligands"]:
                key_str = "_".join((ens_ligand, ens_model))
                first_layer_models[key_str] = get_pickled_model(ens_model, ens_ligand)

In [12]:
#Run every loaded 1st layer model on the features table and save
#its predictions to a separate tab-separated file
if layer == "1":
    for ligand_model_key in first_layer_models:

        #Keys look like "<ligand>_<model>", so the second part is the classifier name
        classifier_method = ligand_model_key.split("_")[1]
        pred_dict = defaultdict(list)
        predcit_using_pickeled_model(
            pred_dict,
            first_layer_models[ligand_model_key],
            classifier_method,
            features_data,
        )
        pred_df = pd.DataFrame.from_dict(pred_dict)

        pred_df.to_csv(curr_dir+"/1st_level_pred/"+ligand_model_key+"_"+filename_number+".csv", sep='\t')
        print("Finished predicting using "+ligand_model_key)

### 2nd layer predictions

In [14]:
#Load the stacked (2nd layer) XGB model of the current ligand and ensemble type
if layer == "2":
    second_layer_model = get_pickled_model(
        "XGB",
        ligand,
        stacked=True,
        ens_dir=ens,
    )

Reading sm_XGB_690.pik


In [15]:
#Build the 2nd layer features table out of the saved 1st layer predictions
if layer == "2":
    stacking_path = curr_dir+"/1st_level_pred/"
    stacked_features_df = create_stacked_dataset(
        stacking_path,
        hyperparameters["ligands"],
        hyperparameters["models"],
        features_data,
        filename_number,
    )

#(features) = 778


In [16]:
#Run the stacked model on the 2nd layer features table and save the predictions
if layer == "2":
    pred_dict = defaultdict(list)
    predcit_using_pickeled_model(
        pred_dict,
        second_layer_model,
        "XGB",
        stacked_features_df,
        stacked=True,
    )
    pred_df = pd.DataFrame.from_dict(pred_dict)

    pred_df.to_csv(curr_dir+"/2nd_level_pred/"+ligand+"_"+ens+"_"+filename_number+".csv", sep='\t')
    print("Finished predicting using "+ligand+"_"+ens)

Finished predicting using sm_ALL
