In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import os
import sys

#Helper functions to classify positives/negatives by position
# The "../utils" directory (relative to the current working directory) is appended to
# sys.path before the import below; presumably prop_threshold_funcs lives there, so the
# append has to run before the import statement.
# curr_dir is also reused by later cells to build relative input paths.
curr_dir = os.getcwd()
sys.path.append(curr_dir+"/../utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined

### Obtain position labels
##### Domains in training set

In [2]:
# Load the per-position labels of the training-set domains.
# The pre-parsed labels file is used when it exists; otherwise the labels are
# extracted from the full features table and cached for the next run.
datafile_date = "08.06.18"
if os.path.exists("train_position_labels_"+datafile_date+".csv"):
    train_position_labels = pd.read_csv("train_position_labels_"+datafile_date+".csv", sep='\t', index_col=0)
else:
    input_path = curr_dir+"/../domains_similarity/filtered_features_table/"
    filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"
    features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)

    # A column is a label column when its name contains one of these markers;
    # every other column is a feature and is dropped in place.
    label_markers = ("prop_th", "propensity", "domain_length")
    feature_columns = [
        column for column in features_all.columns
        if not any(marker in column for marker in label_markers)
    ]
    features_all.drop(feature_columns, axis=1, inplace=True)

    # Cache the labels-only table so the features table is not parsed again
    features_all.to_csv("train_position_labels_"+datafile_date+".csv", sep='\t')
    train_position_labels = features_all

### Obtain position labels
##### Domains not in training set

In [3]:
# Build (or load from cache) the per-position labels of the domains that are not in
# the training set. Each row is "<domain>_<state>"; for every ligand of the domain the
# row holds the ligand's four propensity thresholds and the propensity of that state.
if not os.path.exists("not_train_position_labels.csv"):
    # NOTE(review): pickle.load can execute arbitrary code from the file; only load
    # pickles produced by this project's own pipeline.
    with open(curr_dir+"/../../7.InteracDome_targetVariable/domains_ligands_propensity_dict.pik", 'rb') as handle:
        raw_labels = pickle.load(handle)

    with open(curr_dir+"/../../13.Process_domains_not_in_training/processed_domains_not_in_pipeline_final_list.pik", 'rb') as handle:
        not_train_list = pickle.load(handle)

    # Collect the values in plain dicts and build the DataFrame once at the end:
    # enlarging a DataFrame one cell at a time with .loc copies data on every new
    # row/column and leaves the columns with object dtype.
    prop_thresholds = ("0.1", "0.25", "0.5", "0.75")
    position_records = {}               # row label -> {column name: value}
    position_order = []                 # row labels in order of first appearance
    column_order = ["domain_length"]    # column names in order of first appearance
    seen_columns = set(column_order)

    for domain in not_train_list:
        for ligand in raw_labels[domain]:
            ligand_labels = raw_labels[domain][ligand]
            states_props = ligand_labels["states_props"]
            for state in states_props:
                position = domain+"_"+str(state)
                if position not in position_records:
                    position_records[position] = {}
                    position_order.append(position)

                # domain_length is rewritten for every ligand, so the last ligand wins
                values = [("domain_length", len(states_props))]
                values.extend(
                    (ligand+"_prop_th_"+threshold, ligand_labels["prop_th_"+threshold])
                    for threshold in prop_thresholds
                )
                values.append((ligand+"_propensity", states_props[state]))

                for column, value in values:
                    if column not in seen_columns:
                        seen_columns.add(column)
                        column_order.append(column)
                    position_records[position][column] = value

    # Positions that lack a ligand get NaN in that ligand's columns
    not_train_position_labels = pd.DataFrame(
        [position_records[position] for position in position_order],
        index=position_order,
        columns=column_order,
    )
    not_train_position_labels.to_csv("not_train_position_labels.csv", sep='\t')
else:
    not_train_position_labels = pd.read_csv("not_train_position_labels.csv", sep='\t', index_col=0)

### Aggregate positives and negatives across domains

In [4]:
def create_labels(position_labels):
    """Aggregate per-position ligand labels into one row per domain.

    Parameters
    ----------
    position_labels : pandas.DataFrame
        One row per position, indexed by "<domain name>_<position>". Must contain a
        "domain_length" column and a "<ligand>_propensity" column for every ligand
        returned by create_negatives_datasets_combined.

    Returns
    -------
    pandas.DataFrame
        Indexed by domain name (the row label without its last "_"-separated part),
        in order of first appearance. Columns are "length", then for each ligand
        "<ligand>_number_neg", "<ligand>_number_pos" and "<ligand>_max" (largest
        propensity seen, never below 0), followed by for each ligand
        "<ligand>_label" (1.0 if the domain has at least one positive position,
        else 0.0), "<ligand>_frac_neg" and "<ligand>_frac_pos" (counts divided by
        "length"). When position_labels has no rows, an empty frame with only the
        first group of columns is returned.
    """
    #Flags for creating negatives
    # NOTE(review): the meaning of these flags is defined by prop_threshold_funcs - confirm there.
    zero_prop = True
    no_prop = True
    all_ligands = False

    ligands_negatives_df = create_negatives_datasets_combined(zero_prop, no_prop, position_labels, [], all_ligands)
    ligands_positives_df = create_positives_datasets_combined(position_labels, [], all_ligands)

    #Column layout of the per-domain counters
    ligands = list(ligands_negatives_df)
    col_names = ["length"]
    for ligand in ligands:
        col_names.extend([ligand+"_number_neg", ligand+"_number_pos", ligand+"_max"])

    #Look up each index once instead of once per position
    negative_positions = {ligand: ligands_negatives_df[ligand].index for ligand in ligands}
    positive_positions = {ligand: ligands_positives_df[ligand].index for ligand in ligands_positives_df}

    #Accumulate in plain dicts and build the DataFrame once at the end; growing a
    #DataFrame one row at a time copies the whole frame on every new domain.
    domain_stats = {}
    domain_order = []
    for pos, row in position_labels.iterrows():
        #Extract domain name and check if already added
        domain_name = "_".join(pos.split("_")[:-1])
        if domain_name not in domain_stats:
            stats = dict.fromkeys(col_names, 0.0)
            stats["length"] = row["domain_length"]
            domain_stats[domain_name] = stats
            domain_order.append(domain_name)
        stats = domain_stats[domain_name]

        #Update counts
        for ligand in ligands:
            if pos in negative_positions[ligand]:
                stats[ligand+"_number_neg"] += 1
        for ligand in positive_positions:
            if pos in positive_positions[ligand]:
                stats[ligand+"_number_pos"] += 1

        #Max labels (max keeps the running value when the propensity is NaN)
        for ligand in ligands:
            stats[ligand+"_max"] = max(stats[ligand+"_max"], row[ligand+"_propensity"])

    if not domain_order:
        return pd.DataFrame(columns=col_names)

    domain_labels = pd.DataFrame(
        [domain_stats[domain_name] for domain_name in domain_order],
        index=domain_order,
        columns=col_names,
    )

    # Add binary label and fractions pos/neg
    for ligand in ligands:
        number_pos = domain_labels[ligand+"_number_pos"]
        domain_labels[ligand+"_label"] = (number_pos > 0).astype(float)
        domain_labels[ligand+"_frac_neg"] = domain_labels[ligand+"_number_neg"] / domain_labels["length"]
        domain_labels[ligand+"_frac_pos"] = number_pos / domain_labels["length"]

    return domain_labels

In [5]:
# Aggregate the training-set position labels per domain and save them (tab-separated)
train_labels_path = "train_domain_labels_"+datafile_date+".csv"
train_domain_labels = create_labels(train_position_labels)
train_domain_labels.to_csv(path_or_buf=train_labels_path, sep='\t')

# Same aggregation for the domains that are not in the training set
not_train_labels_path = "not_train_domain_labels.csv"
not_train_domain_labels = create_labels(not_train_position_labels)
not_train_domain_labels.to_csv(path_or_buf=not_train_labels_path, sep='\t')

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697
dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 206
rnabase #: 118
rnabackbone #: 136
rna combined #: 247
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825


  ligands_positives_dna[ligand] = features_all[features_all[prop_th_str] != -1][features_all[score_col_str] >= features_all[prop_th_str]]
  ligands_positives_rna[ligand] = features_all[features_all[prop_th_str] != -1][features_all[score_col_str] >= features_all[prop_th_str]]
  ligand_binding_df = features_all[features_all[prop_th_str] != -1][features_all[score_col_str] >= features_all[prop_th_str]]


dna non-binding #:8865
dnabase non-binding #:8144
dnabackbone non-binding #:9026
dna combined non binding #: 6528
rna non-binding #:11714
rnabase non-binding #:12318
rnabackbone non-binding #:12243
rna combined non binding #: 9908
peptide non-binding #:14723
ion non-binding #:71250
metabolite non-binding #:60911
druglike non-binding #:86749
sm non-binding #:91887
dna #: 572
dnabase #: 185
dnabackbone #: 448
dna combined #: 632
rna #: 2845
rnabase #: 838
rnabackbone #: 2520
rna combined #: 2900
peptide #: 516
ion #: 611
metabolite #: 1692
druglike #: 3203
sm #: 3366
