In [2]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import os
import sys

#Helper functions to classify positives/negatives by position
#The helper module is imported from "../10.Prediction/utils", resolved against the current
#working directory, so that folder is appended to sys.path before the import below.
#NOTE(review): because the path depends on os.getcwd(), the notebook presumably has to be
#started from its own folder for the import to succeed - confirm.
curr_dir = os.getcwd()
sys.path.append(curr_dir+"/../10.Prediction/utils")
from prop_threshold_funcs import create_negatives_datasets_combined, create_positives_datasets_combined

### Obtain position labels

In [4]:
def read_labels(input_path, filename):
    """Read a tab-separated features table and keep only its label columns.

    A column is kept when its name contains "prop_th", "propensity" or
    "domain_length"; every other column is dropped.

    Args:
        input_path: Path prefix of the table. It is concatenated with
            `filename` as-is, so it has to end with a path separator.
        filename: Name of the tab-separated file. Its first column is used
            as the index of the returned frame.

    Returns:
        A new DataFrame holding the kept columns in their original order.
    """
    features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)

    #Select the label columns in one step instead of deleting feature columns one by one
    label_markers = ("prop_th", "propensity", "domain_length")
    label_cols = [col for col in features_all.columns
                  if any(marker in col for marker in label_markers)]

    #Copy so the caller gets an independent frame rather than a slice of the full table
    return features_all.loc[:, label_cols].copy()

In [6]:
#Use the pre-parsed labels file when it exists; otherwise extract the labels from the
#features table and save them so the next run can skip the extraction

#Domains >10 instances
train_datafile_date = "08.06.18"
train_labels_cache = "train_position_labels_"+train_datafile_date+".csv"
if os.path.exists(train_labels_cache):
    train_position_labels = pd.read_csv(train_labels_cache, sep='\t', index_col=0)
else:
    input_path = curr_dir+"/../10.Prediction/domains_similarity/filtered_features_table/"
    filename = "windowed_positions_features_mediode_filter_"+train_datafile_date+".csv"
    train_position_labels = read_labels(input_path, filename)

    train_position_labels.to_csv(train_labels_cache, sep='\t')

#Domains <10 instances
not_train_datafile_date = "01.22.19"
not_train_labels_cache = "not_train_position_labels_"+not_train_datafile_date+".csv"
if os.path.exists(not_train_labels_cache):
    not_train_position_labels = pd.read_csv(not_train_labels_cache, sep='\t', index_col=0)
else:
    input_path = curr_dir+"/../9.Features_exploration/features_tables_v31/"
    filename = "windowed_positions_features_less_than_10_"+not_train_datafile_date+".csv"
    not_train_position_labels = read_labels(input_path, filename)

    not_train_position_labels.to_csv(not_train_labels_cache, sep='\t')

### Aggregate positives and negatives across domains

In [9]:
def create_labels(position_labels):
    """Aggregate per-position labels into one binary label per domain and ligand.

    The index entries of position_labels are strings; the domain name of a
    position is its index entry without the last "_"-separated token.

    Returns a DataFrame indexed by domain name with one "<ligand>_label"
    column per ligand of the negatives collection. A cell is set to 1 when at
    least one position of the domain is found in the index of that ligand's
    positives, and stays 0 otherwise.
    """
    #Settings forwarded to the helper functions
    #NOTE(review): the meaning of these flags is defined in prop_threshold_funcs - confirm there
    zero_prop, no_prop, all_ligands = True, True, False

    negatives_by_ligand = create_negatives_datasets_combined(zero_prop, no_prop, position_labels, [], all_ligands)
    positives_by_ligand = create_positives_datasets_combined(position_labels, [], all_ligands)

    #Empty frame with one label column per ligand; a row is added the first time a domain is seen
    domain_labels = pd.DataFrame(columns=[ligand+"_label" for ligand in negatives_by_ligand])

    for pos in position_labels.index:
        #Strip the trailing "_<token>" to get the domain this position belongs to
        domain_name = pos.rpartition("_")[0]
        if domain_name not in domain_labels.index:
            #New domain: start with every label at zero
            domain_labels.loc[domain_name] = np.zeros(len(domain_labels.columns))

        #One positive position is enough to flag the whole domain for that ligand
        for ligand in positives_by_ligand:
            if pos in positives_by_ligand[ligand].index:
                domain_labels.loc[domain_name,ligand+"_label"] = 1

    return domain_labels

In [4]:
def create_features(position_labels):
    """Summarise per-position labels into per-domain counts, maxima and fractions.

    The domain name of a position is its index entry without the last
    "_"-separated token. The returned DataFrame is indexed by domain name and
    holds:
      - "length": the "domain_length" value of the first position seen for the domain
      - "<ligand>_number_neg" / "<ligand>_number_pos": how many positions of the
        domain are found in the index of that ligand's negatives / positives
      - "<ligand>_max": running maximum (starting at 0) of "<ligand>_propensity"
      - "<ligand>_label": 1 when the positive count is above 0, otherwise 0
      - "<ligand>_frac_neg" / "<ligand>_frac_pos": the counts divided by "length"
    """
    #Settings forwarded to the helper functions
    #NOTE(review): the meaning of these flags is defined in prop_threshold_funcs - confirm there
    zero_prop, no_prop, all_ligands = True, True, False

    negatives_by_ligand = create_negatives_datasets_combined(zero_prop, no_prop, position_labels, [], all_ligands)
    positives_by_ligand = create_positives_datasets_combined(position_labels, [], all_ligands)

    #Column layout: "length" followed by a (negatives count, positives count, max) triple per ligand
    col_names = ["length"]
    for ligand in negatives_by_ligand:
        col_names.extend([ligand+"_number_neg", ligand+"_number_pos", ligand+"_max"])
    domain_labels = pd.DataFrame(columns=col_names)

    for pos,row in position_labels.iterrows():
        #Strip the trailing "_<token>" to get the domain this position belongs to
        domain_name = pos.rpartition("_")[0]
        if domain_name not in domain_labels.index:
            #New domain: zero every counter, then record the domain length
            domain_labels.loc[domain_name] = np.zeros(len(domain_labels.columns))
            domain_labels.loc[domain_name,"length"] = row["domain_length"]

        #Count the position as a negative for every ligand that lists it
        for ligand in negatives_by_ligand:
            if pos in negatives_by_ligand[ligand].index:
                neg_col = ligand+"_number_neg"
                domain_labels.loc[domain_name,neg_col] = domain_labels.loc[domain_name,neg_col] + 1
        #Count the position as a positive for every ligand that lists it
        for ligand in positives_by_ligand:
            if pos in positives_by_ligand[ligand].index:
                pos_col = ligand+"_number_pos"
                domain_labels.loc[domain_name,pos_col] = domain_labels.loc[domain_name,pos_col] + 1

        #Keep the largest propensity seen so far for each ligand
        for ligand in negatives_by_ligand:
            max_col = ligand+"_max"
            domain_labels.loc[domain_name,max_col] = max(domain_labels.loc[domain_name,max_col], row[ligand+"_propensity"])

    #Second pass over the finished counts: binary label and pos/neg fractions per domain
    for domain in domain_labels.index:
        for ligand in negatives_by_ligand:
            has_positive = domain_labels.loc[domain,ligand+"_number_pos"] > 0
            domain_labels.loc[domain,ligand+"_label"] = 1 if has_positive else 0
            domain_labels.loc[domain,ligand+"_frac_neg"] = domain_labels.loc[domain,ligand+"_number_neg"] / domain_labels.loc[domain,"length"]
            domain_labels.loc[domain,ligand+"_frac_pos"] = domain_labels.loc[domain,ligand+"_number_pos"] / domain_labels.loc[domain,"length"]

    return domain_labels

In [10]:
#Aggregate the position labels of each domain set into domain-level labels and
#store every table as a tab-separated file named after its data file date
train_domain_labels = create_labels(train_position_labels)
train_domain_labels_file = "train_domain_labels_"+train_datafile_date+".csv"
train_domain_labels.to_csv(train_domain_labels_file, sep='\t')

not_train_domain_labels = create_labels(not_train_position_labels)
not_train_domain_labels_file = "not_train_domain_labels_"+not_train_datafile_date+".csv"
not_train_domain_labels.to_csv(not_train_domain_labels_file, sep='\t')

dna non-binding #:43886
dnabase non-binding #:44418
dnabackbone non-binding #:44021
dna combined non binding #: 43884
rna non-binding #:43727
rnabase non-binding #:44154
rnabackbone non-binding #:43944
rna combined non binding #: 43720
peptide non-binding #:41105
ion non-binding #:39630
metabolite non-binding #:39638
druglike non-binding #:35018
sm non-binding #:32697
dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 206
rnabase #: 118
rnabackbone #: 136
rna combined #: 247
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825
dna non-binding #:147900
dnabase non-binding #:149821
dnabackbone non-binding #:148214
dna combined non binding #: 147893
rna non-binding #:144984
rnabase non-binding #:147626
rnabackbone non-binding #:145510
rna combined non binding #: 144977
peptide non-binding #:148527
ion non-binding #:142566
metabolite non-binding #:140418
druglike non-binding #:130078
sm non-binding #:126372
dna #: 569
dnabase #: 184
dnabackbone #: 445


In [24]:
#Size check of the aggregated table: (number of domains, number of ligand label columns)
not_train_domain_labels.shape

(823, 7)

In [23]:
#Load the domain labels stored in the file without a date suffix
#NOTE(review): presumably kept to compare against the newly created labels - confirm
old_labels = pd.read_csv("not_train_domain_labels.csv", sep='\t', index_col=0)

#Display the metabolite label column on its own
old_labels[["metabolite_label"]]

Unnamed: 0,metabolite_label
14-3-3,0.0
2-Hacid_dh,1.0
2OG-FeII_Oxy_2,1.0
3HCDH,0.0
4HBT,1.0
5-FTHF_cyc-lig,1.0
5_nucleotid_C,1.0
7TM_GPCR_Srv,0.0
7TM_GPCR_Srw,0.0
AAA_29,1.0


## Old - Use inferred binding scores
### Obtain position labels
##### Domains not in training set

In [3]:
#Reuse the saved labels table when present; otherwise rebuild it from the pickled
#propensity dictionary, one row per "<domain>_<state>"
old_position_labels_file = "not_train_position_labels.csv"
if os.path.exists(old_position_labels_file):
    not_train_position_labels = pd.read_csv(old_position_labels_file, sep='\t', index_col=0)
else:
    #NOTE(review): pickle.load can run arbitrary code while loading - only use trusted project files
    with open(curr_dir+"/../../7.InteracDome_targetVariable/domains_ligands_propensity_dict.pik", 'rb') as handle:
        raw_labels = pickle.load(handle)

    with open(curr_dir+"/../../13.Process_domains_not_in_training/processed_domains_not_in_pipeline_final_list.pik", 'rb') as handle:
        not_train_list = pickle.load(handle)

    #Per-ligand threshold entries copied onto every row of that ligand
    threshold_keys = ("prop_th_0.1", "prop_th_0.25", "prop_th_0.5", "prop_th_0.75")

    not_train_position_labels = pd.DataFrame(columns=["domain_length"])
    for domain in not_train_list:
        for ligand in raw_labels[domain]:
            ligand_info = raw_labels[domain][ligand]
            states_props = ligand_info["states_props"]
            for state in states_props:
                row_key = domain+"_"+str(state)
                #Domain length is taken as the number of states listed for this ligand
                not_train_position_labels.loc[row_key,"domain_length"] = len(states_props)
                for threshold_key in threshold_keys:
                    not_train_position_labels.loc[row_key,ligand+"_"+threshold_key] = ligand_info[threshold_key]
                not_train_position_labels.loc[row_key,ligand+"_propensity"] = states_props[state]
    not_train_position_labels.to_csv(old_position_labels_file, sep='\t')