In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import os
import sys

#Expose the prediction utils folder on sys.path so the helper functions that
#classify positives/negatives by position can be imported
curr_dir = os.getcwd()
utils_dir = curr_dir+"/../10.Prediction/utils"
sys.path.append(utils_dir)
from prop_threshold_funcs import (
    create_negatives_datasets_combined,
    create_positives_datasets_combined,
)

In [23]:
#Load the training features table (tab-separated, first column used as the index)
train_datafile_date = "08.06.18"
input_path = curr_dir+"/../10.Prediction/domains_similarity/filtered_features_table/"
filename = "".join(["windowed_positions_features_mediode_filter_", train_datafile_date, ".csv"])
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)

#Independent (deep) copy, so pruning columns later leaves features_all untouched
position_prop = features_all.copy()

In [24]:
#Keep only the label-related columns: a column survives when its name contains
#at least one of these markers; every other column is a feature and is dropped
label_markers = ("prop_th", "propensity", "domain_length", "domain_name")
feature_cols = [col for col in position_prop.columns
                if not any(marker in col for marker in label_markers)]
position_prop = position_prop.drop(feature_cols, axis=1)

In [25]:
#Distinct domain names found in the features table, as a plain Python list
domains_list = pd.unique(features_all["domain_name"]).tolist()

### Option 1: 
Binding: at least one position has a non-zero (positive) propensity. Neutral: every position has a propensity of 0.

In [54]:
#Propensity columns that provide binding evidence for each ligand type.
#The list order fixes the order of the values in every domain's label list.
POSITIVE_PROP_COLS = [
    ("dna", ["dna_propensity", "dnabase_propensity", "dnabackbone_propensity"]),
    ("rna", ["rna_propensity", "rnabase_propensity", "rnabackbone_propensity"]),
    ("ion", ["ion_propensity"]),
    ("peptide", ["peptide_propensity"]),
    ("sm", ["sm_propensity"]),
]


def _positive_prop_labels(domain_table):
    """Label one domain per ligand: 1 if any position has a propensity > 0, else 0.

    Parameters
    ----------
    domain_table : pd.DataFrame
        The rows of a single domain, holding the propensity columns listed
        in POSITIVE_PROP_COLS.

    Returns
    -------
    list of int
        One 0/1 label per ligand, in POSITIVE_PROP_COLS order
        (dna, rna, ion, peptide, sm).
    """
    #An element-wise "> 0" test (instead of the builtin max() over a Series) keeps
    #the result independent of row order when NaNs are present: NaN > 0 is False,
    #so missing values can never hide a positive position.
    return [int((domain_table[prop_cols] > 0).values.any())
            for _, prop_cols in POSITIVE_PROP_COLS]


domain_labels_dict1 = {}

for domain in domains_list:
    domain_table = position_prop[position_prop["domain_name"] == domain]
    domain_labels_dict1[domain] = _positive_prop_labels(domain_table)

In [55]:
#Build the labels table: one row per domain (sorted by domain name), one column per ligand
ligand_columns = ["dna", "rna", "ion", "peptide", "sm"]
domain_labels_df1 = pd.DataFrame.from_dict(domain_labels_dict1, orient="index").sort_index()
domain_labels_df1.columns = ligand_columns

#Write the table as a tab-separated file
positive_prop_out = curr_dir+"/domain_labels/train_domain_labels_positive_prop.csv"
domain_labels_df1.to_csv(positive_prop_out, sep='\t')

### Option 2: 
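Binding: at least one position has a propensity above the binding threshold. Neutral: all positions have a propensity of 0. Domains that meet neither condition are labeled -1.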

In [63]:
#(propensity column, threshold column) pairs that provide binding evidence for each ligand.
#The list order fixes the order of the values in every domain's label list.
#Note: ion is compared against its 0.75 threshold column, all other ligands against 0.5.
THRESHOLD_PROP_COLS = [
    ("dna", [("dna_propensity", "dna_prop_th_0.5"),
             ("dnabase_propensity", "dnabase_prop_th_0.5"),
             ("dnabackbone_propensity", "dnabackbone_prop_th_0.5")]),
    ("rna", [("rna_propensity", "rna_prop_th_0.5"),
             ("rnabase_propensity", "rnabase_prop_th_0.5"),
             ("rnabackbone_propensity", "rnabackbone_prop_th_0.5")]),
    ("ion", [("ion_propensity", "ion_prop_th_0.75")]),
    ("peptide", [("peptide_propensity", "peptide_prop_th_0.5")]),
    ("sm", [("sm_propensity", "sm_prop_th_0.5")]),
]


def _threshold_label(domain_table, prop_th_pairs):
    """Label one domain for one ligand using the binding threshold.

    Parameters
    ----------
    domain_table : pd.DataFrame
        The rows of a single domain.
    prop_th_pairs : list of (str, str)
        (propensity column, threshold column) pairs belonging to the ligand.

    Returns
    -------
    int
        1  (binding) - some position's propensity exceeds its threshold;
        0  (neutral) - no position has a propensity above 0 in any of the columns;
        -1 (neither) - some position is positive, but none exceeds the threshold.
    """
    has_positive_position = False
    for prop_col, th_col in prop_th_pairs:
        #Series.max() skips NaNs, unlike the builtin max() whose result depends on row order
        top_propensity = domain_table[prop_col].max()
        #Read the threshold from the domain's first row BY POSITION. A plain [0] is a
        #label lookup: it raises KeyError on an integer index that lacks label 0 and
        #relies on a deprecated fallback otherwise.
        #Assumes the threshold is constant within a domain - TODO confirm.
        binding_th = domain_table[th_col].iloc[0]
        if top_propensity > binding_th:
            return 1
        if top_propensity > 0:
            has_positive_position = True
    #Neutral means "all positions are 0". Testing max < 0 can never hold for
    #non-negative propensities, so label 0 would be unreachable.
    return -1 if has_positive_position else 0


domain_labels_dict2 = {}

for domain in domains_list:
    domain_table = position_prop[position_prop["domain_name"] == domain]
    domain_labels_dict2[domain] = [_threshold_label(domain_table, prop_th_pairs)
                                   for _, prop_th_pairs in THRESHOLD_PROP_COLS]

In [72]:
#Build the labels table: one row per domain (sorted by domain name), one column per ligand
ligand_columns = ["dna", "rna", "ion", "peptide", "sm"]
domain_labels_df2 = pd.DataFrame.from_dict(domain_labels_dict2, orient="index").sort_index()
domain_labels_df2.columns = ligand_columns

#Write the table as a tab-separated file
th_prop_out = curr_dir+"/domain_labels/train_domain_labels_th_prop.csv"
domain_labels_df2.to_csv(th_prop_out, sep='\t')