## Calculate fraction of binding positions in the X top columns by each method
01/18/2018: A performance measurement based on the fraction of binding positions found among the "top X" ranked columns was used in two papers:

1) Capra and Singh, 2007: https://www.ncbi.nlm.nih.gov/pubmed/17519246

2) Kai Wang and Ram Samudrala, 2006: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-385

In [74]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

#### read input

In [103]:
curr_dir = !pwd
filename = "positions_ortho-para_01.16.18.csv"
ortho_para_df = pd.read_csv(curr_dir[0]+"/"+filename, sep='\t', index_col=0)

ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite", "max"] 

methods = ["instances_individuals_change_ratio", "aa_ref_overlap_individuals_change_ratio", "jsd_100way_instances_major_ratio", "jsd_100way_aa_not_used_ratio", "jsd_mul_aa_ref_SE",
          "jsd_SE_diff_ratio", "SE_jsd_diff_ratio", "jsds_ratio", "jsds_subtraction"]

BIND_TH = 0.1
top_columns = [10, 20, 30]

In [13]:
#Display the column names of the input table
ortho_para_df.columns

Index([u'pfam_prob_max', u'is_pfam_conserved', u'instances_change_frac',
       u'aa_ref_overlap', u'aa_ref_SE', u'aa_ref_jsd',
       u'instances_individuals_change_ratio',
       u'aa_ref_overlap_individuals_change_ratio', u'avg_jsd_100way_blosum',
       u'jsd_100way_instances_major_ratio', u'jsd_100way_aa_not_used_ratio',
       u'jsd_mul_aa_ref_SE', u'jsd_SE_diff_ratio', u'SE_jsd_diff_ratio',
       u'jsds_ratio', u'jsds_subtraction', u'dna_binding_score',
       u'rna_binding_score', u'dnabase_binding_score',
       u'rnabase_binding_score', u'dnabackbone_binding_score',
       u'rnabackbone_binding_score', u'peptide_binding_score',
       u'ion_binding_score', u'metabolite_binding_score', u'max_binding_score',
       u'avg_maf_all', u'domain_name'],
      dtype='object')

In [6]:
#Distinct domain names, in order of first appearance in the table
domain_names = pd.unique(ortho_para_df["domain_name"])

In [109]:
#For every X, domain, ligand and ranking method, count how many of the domain's
#binding positions (binding score > BIND_TH) fall among the method's X
#top-ranked positions, and normalize by the best achievable count.
#
#Results are accumulated column-wise (column name -> list of values):
#  top_frac_dict                - one row per evaluated (X, domain, ligand, method)
#  skipped_domains_X_dict       - (domain, X) pairs skipped because of the domain length
#  skipped_domains_no_bind_dict - (domain, ligand) pairs with no binding position;
#                                 recorded once for every X at which the domain is evaluated
top_frac_dict = defaultdict(list)
skipped_domains_X_dict = defaultdict(list)
skipped_domains_no_bind_dict = defaultdict(list)

for X in top_columns:

    for domain in domain_names:
        domain_table = ortho_para_df[ortho_para_df["domain_name"] == domain]
        domain_len = domain_table.shape[0]

        #Calculating for X top columns only if X is at least 10% of the domain
        #and less than half of the domain
        if X >= 0.5 * domain_len or X < 0.1 * domain_len:
            skipped_domains_X_dict["domain"].append(domain)
            skipped_domains_X_dict["domain_len"].append(domain_len)
            skipped_domains_X_dict["X"].append(X)
            continue

        #Row labels are parsed as "<domain name>" + one separator character +
        #an integer position number; prefix_len skips the name and separator
        prefix_len = len(domain) + 1

        #The ranking depends only on the domain, the method and X, so it is
        #computed once per domain instead of once per ligand.
        #sort_values replaces Series.order, which was removed from pandas.
        method_topX_sets = {}
        for method in methods:
            method_ranking = domain_table[method].sort_values(ascending=False)
            method_topX = method_ranking[0:X].index
            method_topX_sets[method] = set(int(x[prefix_len:]) for x in method_topX)

        for ligand in ligands:
            #Get indices of binding positions
            ligand_str = ligand+"_binding_score"
            bind_list = domain_table[domain_table[ligand_str] > BIND_TH].index
            bind_idx = [int(x[prefix_len:]) for x in bind_list]
            bind_num = len(bind_idx)

            #Calculating only if there is at least 1 binding position
            if bind_num == 0:
                skipped_domains_no_bind_dict["domain"].append(domain)
                skipped_domains_no_bind_dict["ligand"].append(ligand)
                continue

            for method in methods:
                method_topX_idx = method_topX_sets[method]
                #Builtin sum keeps the count an int even when there are no hits
                #(np.sum over an empty list returns the float 0.0)
                hits_num = sum(1 for x in bind_idx if x in method_topX_idx)
                #Normalizing for bind_num < X: at most min(X, bind_num) hits are possible
                hits_frac = hits_num/float(min(X, bind_num))

                #Saving data to the dict
                top_frac_dict["domain"].append(domain)
                top_frac_dict["domain_len"].append(domain_len)
                top_frac_dict["ligand"].append(ligand)
                top_frac_dict["method"].append(method)
                top_frac_dict["X"].append(X)
                top_frac_dict["bind_num"].append(bind_num)
                top_frac_dict["hits_num"].append(hits_num)
                top_frac_dict["hits_frac"].append(hits_frac)

        print("Finished domain "+domain)
    print("Finished top columns "+str(X))

Finished domain 2OG-FeII_Oxy_3
Finished domain AMP-binding_C
Finished domain ANAPC3
Finished domain APOBEC_C
Finished domain Acyl-CoA_dh_M
Finished domain Ank
Finished domain Ank_2
Finished domain Ank_3
Finished domain Ank_4
Finished domain Ank_5
Finished domain Annexin
Finished domain Arm
Finished domain BEN
Finished domain BIR
Finished domain BRCT
Finished domain BTB_2
Finished domain Bcl-2
Finished domain Bromodomain
Finished domain C1-set
Finished domain C1_1
Finished domain C2-set_2
Finished domain CAP_GLY
Finished domain CBFD_NFYB_HMF
Finished domain CBS
Finished domain CSD
Finished domain Cadherin
Finished domain Cadherin_2
Finished domain Calx-beta
Finished domain Cation_ATPase
Finished domain Chromo
Finished domain Clathrin_propel
Finished domain Crystall
Finished domain Cyt-b5
Finished domain Disintegrin
Finished domain EF-hand_1
Finished domain EF-hand_5
Finished domain EF-hand_6
Finished domain EF-hand_7
Finished domain EF-hand_8
Finished domain EGF
Finished domain EGF_2
Fi



In [110]:
#Saving to data frames
#Turn each column-wise accumulator (column name -> list of values) into a table
top_frac_df = pd.DataFrame(top_frac_dict)
skipped_domains_X_df = pd.DataFrame(skipped_domains_X_dict)
skipped_domains_no_bind_df = pd.DataFrame(skipped_domains_no_bind_dict)

In [117]:
#Export to .csv tables
#Write every result table as a tab-separated file under topX_res_tables/
export_targets = [
    (top_frac_df, "/topX_res_tables/top_frac.csv"),
    (skipped_domains_X_df, "/topX_res_tables/skipped_domains_X.csv"),
    (skipped_domains_no_bind_df, "/topX_res_tables/skipped_domains_no_bind.csv"),
]
for table, rel_path in export_targets:
    table.to_csv(curr_dir[0]+rel_path, sep='\t')

In [118]:
#Dimensions (rows, columns) of the results table
top_frac_df.shape

(22491, 8)

In [112]:
#Dimensions (rows, columns) of the table of (domain, X) pairs skipped for domain length
skipped_domains_X_df.shape

(478, 3)

In [113]:
#Dimensions (rows, columns) of the table of (domain, ligand) pairs with no binding position
skipped_domains_no_bind_df.shape

(5231, 2)