In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import os
import sys

In [2]:
def create_features(input_path, ligand):
    """Summarise the scores of one ligand into one feature row per domain.

    Reads the tab-separated table ``input_path + "/" + ligand + "_5w.csv"``
    (first column used as the index). The table must provide an ``idx``
    column of string identifiers and a numeric ``prob`` column. Each row is
    assigned to a domain whose name is ``idx`` with its last
    ``_``-separated token removed (an ``idx`` without any ``_`` maps to the
    empty string).

    Parameters
    ----------
    input_path : str
        Directory that contains ``<ligand>_5w.csv``.
    ligand : str
        Ligand name; selects the file and prefixes every output column.

    Returns
    -------
    pandas.DataFrame
        One row per domain, in order of first appearance in the file, with
        an unnamed index and float columns:

        * ``<ligand>_max`` - largest ``prob`` in the domain, floored at 0.
        * ``<ligand>_thresh_<t>`` - fraction of the domain's rows with
          ``prob > t`` for each t in 0.001, 0.01, 0.1, 0.25, 0.5, 0.75.
        * ``<ligand>_zthresh_above_<z>`` - fraction of the domain's rows
          whose z-score (computed over the whole file with the population
          standard deviation, ddof=0) exceeds z for each z in 0.5, 1, 2, 3.

        A table without rows yields an empty frame with the same columns.

    Raises
    ------
    KeyError
        If the ``prob`` or ``idx`` column is missing.
    """
    thresholds = [0.001, 0.01, 0.1, 0.25, 0.5, 0.75]
    zscores_thresholds = [0.5, 1, 2, 3]

    #Output column names, in output order
    max_col = ligand+"_max"
    thresh_cols = [ligand+"_thresh_"+str(t) for t in thresholds]
    zthresh_cols = [ligand+"_zthresh_above_"+str(z) for z in zscores_thresholds]

    #Raw scores
    predicted_scores = pd.read_csv(input_path+"/"+ligand+"_5w.csv",index_col=0,sep='\t')
    #Positional index so the masks and group keys below align row by row,
    #even if the index read from the file contains duplicates
    predicted_scores = predicted_scores.reset_index(drop=True)
    probs = predicted_scores["prob"]

    #Nothing to aggregate: return the empty feature table
    if len(predicted_scores) == 0:
        return pd.DataFrame(columns=[max_col]+thresh_cols+zthresh_cols)

    #Domain name = idx without its last "_"-separated token
    domain_names = predicted_scores["idx"].map(lambda name: "_".join(name.split("_")[:-1]))

    #Values for z-scoring (NaN scores are skipped; population std, ddof=0)
    ligand_mean = probs.mean()
    ligand_std = probs.std(ddof=0)
    #A zero std gives NaN/inf z-scores; NaN never exceeds a threshold
    zscores = (probs - ligand_mean) / ligand_std

    #One 0/1 indicator column per threshold; a NaN score compares False,
    #so it counts towards the denominator but never towards the numerator
    indicators = {}
    for col, t in zip(thresh_cols, thresholds):
        indicators[col] = probs > t
    for col, z in zip(zthresh_cols, zscores_thresholds):
        indicators[col] = zscores > z
    indicators = pd.DataFrame(indicators, columns=thresh_cols+zthresh_cols).astype(float)

    #Mean of a 0/1 indicator within a domain == fraction of rows above the
    #threshold; sort=False keeps domains in order of first appearance
    domain_labels = indicators.groupby(domain_names, sort=False).mean()

    #Max labels: the running maximum used to start at 0, so all-NaN or
    #all-negative domains report 0
    max_scores = probs.groupby(domain_names, sort=False).max().fillna(0).clip(lower=0)
    domain_labels.insert(0, max_col, max_scores)

    #The group key is named "idx"; the result index carries no name
    domain_labels.index.name = None
    return domain_labels

In [3]:
#Build the per-domain feature table for every ligand and save it.

#Ligands that each contribute their own group of feature columns
ligands = ["dna","rna","ion","peptide","sm"]

#Directory passed to create_features, resolved relative to the current working directory
input_path = os.getcwd()+"/../10.Prediction/stacking/2nd_level_pred/08.06.18_dna0.5_rna0.5_ion0.75/global_auprc/model_features"

#Start from an empty frame and attach each ligand's columns side by side;
#rows are matched on the domain-name index
domain_labels = pd.DataFrame()
for ligand in ligands:
    ligand_features = create_features(input_path, ligand)
    domain_labels = pd.concat([domain_labels, ligand_features], axis=1)

#Write the combined table as a tab-separated file in the working directory
domain_labels.to_csv("domain_features.csv", sep='\t')