In [1]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import os
import sys
import math

In [2]:
# Calculates a normalized Shannon entropy (from Miller et al, 2015)
def entropy(a):
    """Return the Shannon entropy of the weights in `a`, divided by log(len(a)).

    `a` is rescaled to sum to 1 before the entropy is computed, so only the
    relative sizes of its entries matter. Entries that are zero or NaN after
    rescaling contribute nothing to the sum, but still count towards len(a)
    in the normalization factor.

    Parameters:
        a: sequence (list / 1-D array) of numeric weights.

    Returns:
        0 when `a` has fewer than two entries or sums to zero, otherwise
        -sum(p * log(p)) / log(len(a)) over the rescaled entries p.
    """
    #Fewer than two values: min entropy - all the change is in (at most) one
    #value. Also keeps log(len(a)) below away from log(0) and log(1) == 0.
    if len(a) <= 1:
        return 0

    total = float(sum(a))
    if total == 0:
        return 0 #Nothing to distribute; avoids a 0/0 rescaling below

    probs = np.asarray(a) / total
    raw_entropy = 0

    for val in probs:
        if (val == 0 or np.isnan(val)):
            continue
        raw_entropy += val * math.log(val)

    entropy_adj = -raw_entropy / math.log(len(probs)) #To account for different size input

    return entropy_adj

In [3]:
def create_ligand_features(input_path, ligand):
    """Summarize the per-position scores of one ligand into per-domain features.

    Reads the tab-separated file input_path + "/" + ligand + "_5w.csv" (first
    column is used as the index). The file must provide an "idx" column whose
    values look like "<domain name>_<position>" and a "prob" column with the
    score of that position.

    Parameters:
        input_path: directory containing the scores file.
        ligand: ligand name; used for the file name and as column prefix.

    Returns:
        DataFrame indexed by domain name (order of first appearance in the
        file) with these columns, each prefixed by `ligand`:
          _max                highest score among the domain's positions
          _entropy            entropy() of the domain's scores
          _thresh_<p>         fraction of the domain's positions scoring above
                              the p-th percentile of all scores in the file
                              (p = 25, 50, 75)
          _zthresh_above_<z>  fraction of the domain's positions whose z-score
                              (w.r.t. all scores in the file) exceeds z
                              (z = 0.5, 1, 2, 3)
    """
    #Raw scores
    predicted_scores = pd.read_csv(input_path+"/"+ligand+"_5w.csv",index_col=0,sep='\t')
    probs = predicted_scores["prob"]

    #Thresholds
    percentiles = [25, 50, 75]
    per_thresholds = np.percentile(probs, percentiles)
    zscores_thresholds = [0.5, 1, 2, 3]

    #Values for z-scoring
    ligand_mean = np.mean(probs)
    ligand_std = np.std(probs)
    zscores = (probs - ligand_mean) / ligand_std

    #Domain name = everything before the last "_" of idx (empty if there is no "_")
    domains = np.array([name.rpartition("_")[0] for name in predicted_scores["idx"]], dtype=object)

    def per_domain(values):
        #Group positionally by domain; sort=False keeps first-appearance order
        return values.groupby(domains, sort=False)

    def domain_entropy(domain_probs):
        #entropy() rescales its input by the sum, so this L2 scaling only
        #affects rounding; it is kept so values match earlier feature tables
        raw = domain_probs.values
        return entropy(raw / np.linalg.norm(raw))

    scores_by_domain = per_domain(probs)
    features = [
        (ligand+"_max", scores_by_domain.max()),
        (ligand+"_entropy", scores_by_domain.apply(domain_entropy)),
    ]

    #Fraction of positions above each threshold = mean of the 0/1 indicator
    for p, threshold in zip(percentiles, per_thresholds):
        above = (probs > threshold).astype(float)
        features.append((ligand+"_thresh_"+str(p), per_domain(above).mean()))
    for z in zscores_thresholds:
        above = (zscores > z).astype(float)
        features.append((ligand+"_zthresh_above_"+str(z), per_domain(above).mean()))

    #keys= fixes both the column names and their order
    domain_labels = pd.concat([col for _, col in features], axis=1,
                              keys=[name for name, _ in features])
    return domain_labels

In [4]:
def create_constant_features(input_path, filename, domain_labels):
    """Add ligand-independent per-domain features to `domain_labels`.

    Reads the tab-separated features table at input_path + filename (first
    column is used as the index). For every domain in domain_labels.index,
    the rows whose "domain_name" equals that domain are reduced to one value
    per feature (first value, mean, max or non-zero count, see the table
    below) and written into new columns of `domain_labels`.

    Parameters:
        input_path: directory prefix; concatenated with `filename` as-is, so
            it must end with a path separator.
        filename: name of the features table file.
        domain_labels: DataFrame indexed by domain name; modified in place.

    Returns:
        The same `domain_labels` object, with the feature columns added.

    Raises:
        IndexError: if a domain has no rows in the features table.
    """
    #All domains are in training set, so use that features table
    features_table = pd.read_csv(input_path+filename, index_col=0, sep='\t')

    def first(values):
        return values[0]

    #(output column, source column, reduction over the domain's rows)
    feature_specs = [
        #Length
        ("domain_length", "domain_length", first),
        ("avg_protein_len", "prot_avg_length", np.mean),
        #Conservation
        ("avg_phastCons", "whole_domain_phastCons_avg", first),
        ("avg_phyloP", "whole_domain_phyloP_avg", first),
        #Genomic variation
        ("avg_maf", "avg_maf_all", np.mean),
        ("avg_blosum", "blosum_avg", np.mean),
        #Physiochemical
        ("avg_hindex", "hindex_avg", np.mean),
        #NOTE(review): "positve" is misspelled; left as-is because consumers
        #of the output file may rely on this column name - confirm before renaming
        ("avg_positve_cnt", "aa_ref_charge_positive_count", np.mean),
        ("avg_solvent_acc", "solvent_acc_avg", np.mean),
        ("avg_helix_prob", "helix_prob_avg", np.mean),
        ("avg_sheet_prob", "sheet_prob_avg", np.mean),
        ("avg_turn_prob", "turn_prob_avg", np.mean),
        #Pfam
        ("max_pfam_C", "pfam_prob_C", np.max),
        ("max_pfam_H", "pfam_prob_H", np.max),
        ("max_pfam_K", "pfam_prob_K", np.max),
        ("max_pfam_R", "pfam_prob_R", np.max),
        ("pfam_prob_max", "pfam_prob_max", np.max),
        ("cnt_pfam_conserved", "is_pfam_conserved", np.count_nonzero),
    ]

    #Compute features
    for idx in domain_labels.index:
        matches = features_table.loc[features_table["domain_name"] == idx, :]
        #Should never happen; fail with the domain name instead of an
        #anonymous out-of-bounds error from the first lookup below
        if len(matches) == 0:
            raise IndexError("Domain not found in features table: "+str(idx))
        for out_col, src_col, reduce_fn in feature_specs:
            domain_labels.loc[idx, out_col] = reduce_fn(matches[src_col].values)

    return domain_labels

In [5]:
%%time
#Create features for each of the relevant ligands
ligands = ["dna","rna","ion","peptide","sm"]
ligands_chosen_models = {"dna": "LIGAND", "rna": "MODEL", "ion": "MODEL", "peptide": "MODEL", "sm": "ALL"}
curr_dir = os.getcwd()
input_path = curr_dir+"/../10.Prediction/stacking/2nd_level_pred/08.06.18_dna0.5_rna0.5_ion0.75/global_auprc/"
domain_labels = pd.DataFrame()

for ligand in ligands:
    chosen_model = ligands_chosen_models[ligand]
    if (chosen_model == "LIGAND"):
        curr_input_path = input_path+"ligand_features_probs/"
    elif (chosen_model == "MODEL"):
        curr_input_path = input_path+"model_features/"
    elif (chosen_model == "ALL"):
        curr_input_path = input_path+"all_features_probs/"
    else:
        curr_input_path = input_path+"just_probs/"
    domain_labels = pd.concat([domain_labels, create_ligand_features(curr_input_path, ligand)], axis=1)
    
#Add features constant across ligands
input_path = curr_dir+"/../10.Prediction/domains_similarity/filtered_features_table/"
date = "08.06.18"
filename = "windowed_positions_features_mediode_filter_"+date+".csv"

domain_labels = create_constant_features(input_path, filename, domain_labels)
    
#Save to file
domain_labels.to_csv("domain_features.csv", sep='\t')