In [1]:
#Basic imports
import os
import pickle
import random
import sys
from collections import defaultdict

import numpy as np
import pandas as pd

from IPython.core.display import HTML

#Stretch the notebook container to the full width of the browser window
full_width_css = "<style>.container { width:100% !important; }</style>"
HTML(full_width_css)

### Reading the input dataset

In [2]:
#Working directory of the notebook. Kept as a one-element list because the
#other cells read it as curr_dir[0].
curr_dir = [os.getcwd()]
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#Number of trailing columns that are sliced off below when picking the feature columns
bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print("all samples positions #: "+str(features_all.shape[0]))

#ligand binding domains dictionary
#NOTE: unpickling can execute arbitrary code - only load a file that was produced by this project
with open(curr_dir[0]+"/../ligands_negatives_domains_dict.pik", 'rb') as handle:
    domains_bind_dict = pickle.load(handle)

#Seeding both generators so that the random tie-breaking of the split is reproducible
random.seed(0)
np.random.seed(0)

all samples positions #: 38944


#### Datasets of positive examples by ligand

In [3]:
#Minimal binding score for a position to be counted as a positive example
bind_th = 0.1

#Number of positive positions for each ligand
ligands_pos_dict = {}
for ligand in ligands:
    score_col_str = ligand+"_binding_score"
    #Counting on the boolean mask directly, no need to copy the matching rows
    pos_num = int((features_all[score_col_str] >= bind_th).sum())
    print(ligand+" #: "+str(pos_num))
    ligands_pos_dict[ligand] = pos_num

#Ligands ordered from the fewest positives to the most, ties are broken by the ligand name
ligand_sort_by_pos_num = sorted(ligands_pos_dict, key=lambda lig: (ligands_pos_dict[lig], lig))

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


## Create a "smart" CV split

#### Helper functions 

In [4]:
def domain_ligands_counts(domain, group_stats, pos_th=None):
    """Adding the domain ligands pos and neg counts to group_stats.

    domain:      value of the "domain_name" column selecting the rows to count.
    group_stats: dict holding a "<ligand>_pos" and a "<ligand>_neg" counter for
                 every ligand in the global list ligands; updated in place.
    pos_th:      minimal binding score of a positive position. When omitted, the
                 notebook-wide bind_th is used, so the counts always agree with
                 the positives definition of the rest of the notebook.

    A position is a positive when its score is >= pos_th and a negative when its
    score is exactly 0; scores in between are counted as neither.
    Reads the global features_all table. Returns nothing.
    """
    if pos_th is None:
        pos_th = bind_th

    curr_domain_table = features_all[features_all["domain_name"] == domain]

    for ligand in ligands:
        scores = curr_domain_table[ligand+"_binding_score"]
        group_stats[ligand+"_pos"] += np.count_nonzero(scores >= pos_th)
        group_stats[ligand+"_neg"] += np.count_nonzero(scores == 0)

In [5]:
def calc_next_group(prev_group_num, domain, ligands_included, splits_dict, stats_keys, K):
    """Returning the optimal next group number that has the lowest counts of the ligands in the list ligands_included.

    prev_group_num:   group number that was assigned last (used by the fallback).
    domain:           not used; kept so that existing calls stay valid.
    ligands_included: ligands of the domain that is about to be assigned.
    splits_dict:      group number -> group record; only the "<ligand>_pos"
                      counters under the "ligands_pos_neg" key are read.
    stats_keys:       not used; kept so that existing calls stay valid.
    K:                number of groups.

    The relevant ligands are examined in the order of the global list
    ligand_sort_by_pos_num. The first ligand whose positives count is not tied
    across all K groups decides: one of the groups holding the minimal count is
    picked at random. When every relevant ligand is tied across all the groups,
    the group following prev_group_num is returned, wrapping from K back to 1.
    """
    #Fixing the groups order explicitly instead of relying on the dict iteration order
    group_nums = sorted(splits_dict.keys())

    #Finding the smallest number of positives, going from the smallest ligand up.
    for sorted_ligand in ligand_sort_by_pos_num:
        if (sorted_ligand not in ligands_included):
            continue
        ligand_str = sorted_ligand+"_pos"
        pos_counts = [splits_dict[group_num]["ligands_pos_neg"][ligand_str] for group_num in group_nums]
        min_count = min(pos_counts)
        min_groups = [group_num for group_num, cnt in zip(group_nums, pos_counts) if cnt == min_count]
        #All the groups are tied (e.g. all 0s), so this ligand can't tell them apart
        if (len(min_groups) == K):
            continue
        #Randomly choose one of the groups with the minimal count for the current ligand
        return random.choice(min_groups)

    #If all relevant ligands were tied - simply continue to the following group
    next_group_num = prev_group_num + 1
    if (next_group_num > K):
        next_group_num = 1

    return next_group_num

#### Count how the ligand combinations are distributed among the domains

In [6]:
domains_list = features_all["domain_name"].unique().tolist()

#Ligands combination -> list of the domains that bind exactly this combination.
#The key is "no_ligands", "only_<ligand>", or the ligands names each followed by "$" (e.g. "dna$dnabase$")
ligands_type_dict = defaultdict(list)

for domain in domains_list:
    #Membership is tested on the dict itself, which avoids building a keys list for every lookup
    curr_ligands_for_domain = [ligand for ligand in ligands if domain in domains_bind_dict[ligand]]

    if (len(curr_ligands_for_domain) == 0):
        ligand_str = "no_ligands"
    elif (len(curr_ligands_for_domain) == 1):
        ligand_str = "only_"+curr_ligands_for_domain[0]
    else:
        ligand_str = "".join(ligand+"$" for ligand in curr_ligands_for_domain)

    ligands_type_dict[ligand_str].append(domain)

### Create the K-fold groups

In [7]:
K = 10
stats_keys = [l+"_pos" for l in ligands]
stats_keys.extend([l+"_neg" for l in ligands])

#Init the splits dict: one record for each group, keyed by the group number 1..K
splits_dict = defaultdict(dict)
for group_num in range(1,K+1):
    splits_dict[group_num]["num"] = 0
    splits_dict[group_num]["domains"] = []
    splits_dict[group_num]["ligands_cnt"] = dict.fromkeys(ligands, 0)
    splits_dict[group_num]["ligands_pos_neg"] = dict.fromkeys(stats_keys, 0)

#Domains that are still waiting for a group
domains_left = domains_list[:]
next_group_to_assign = 0

def assign_domain(domain, group_num, ligands_included):
    "Adding the domain to the group group_num and updating all the counters of that group"
    group = splits_dict[group_num]
    group["domains"].append(domain)
    group["num"] += 1
    for ligand in ligands_included:
        group["ligands_cnt"][ligand] += 1
    domain_ligands_counts(domain, group["ligands_pos_neg"])
    domains_left.remove(domain)

#Assign the domains without any ligand, going over the groups in a round-robin
curr_domains_group = ligands_type_dict["no_ligands"]
for domain in curr_domains_group:
    next_group_to_assign += 1
    if (next_group_to_assign > K):
        next_group_to_assign = 1
    assign_domain(domain, next_group_to_assign, [])

#Assign the domains that have just one ligand
for ligand in ligands:
    curr_domains_group = ligands_type_dict["only_"+ligand]
    for domain in curr_domains_group:
        next_group_to_assign = calc_next_group(next_group_to_assign, domain, [ligand], splits_dict, stats_keys, K)
        assign_domain(domain, next_group_to_assign, [ligand])

#Assign the domains that have several ligands, going over the combinations in a sorted order
sorted_group_names = sorted(ligands_type_dict.keys())
for ligands_comb in sorted_group_names:
    if (ligands_comb.startswith("only") or ligands_comb == "no_ligands"):
        continue
    curr_domains_group = ligands_type_dict[ligands_comb]
    #The combination ends with "$", so the split leaves an empty piece that is dropped here
    ligands_included = [name for name in ligands_comb.split("$") if name]
    for domain in curr_domains_group:
        next_group_to_assign = calc_next_group(next_group_to_assign, domain, ligands_included, splits_dict, stats_keys, K)
        assign_domain(domain, next_group_to_assign, ligands_included)

#Every domain belongs to exactly one ligands combination, so nothing should be left
assert not domains_left, "domains that were not assigned to any group: "+str(domains_left)

#### Print the table of domain counts per ligand and group

In [8]:
#Group number -> per-ligand domains counter of that group; the groups become the table columns
ligands_cnt_dict = dict(
    (group_num, splits_dict[group_num]["ligands_cnt"]) for group_num in splits_dict.keys()
)
ligands_cnt_df = pd.DataFrame.from_dict(ligands_cnt_dict)
ligands_cnt_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
dna,4,5,3,4,4,2,3,3,4,3
dnabackbone,4,4,3,4,4,2,3,3,4,3
dnabase,3,4,3,3,3,2,3,2,3,3
ion,17,20,20,19,22,24,11,17,20,20
metabolite,11,11,13,12,12,21,6,12,15,13
peptide,10,13,9,9,13,12,9,6,10,10
rna,2,5,2,2,2,4,2,1,3,2
rnabackbone,2,5,2,2,2,3,2,1,3,2
rnabase,2,5,2,1,2,3,2,1,3,2


### Count the number of positives and negatives in each group

In [9]:
stats_keys = [l+"_pos" for l in ligands]
stats_keys.extend([l+"_neg" for l in ligands])

#Recounting the positives and negatives of every group from its final domains list,
#using the same helper that kept the running tallies while the groups were built
group_stats_dict = {}

for group_num in range(1,K+1):
    group_stats = dict.fromkeys(stats_keys, 0)
    for domain in splits_dict[group_num]["domains"]:
        domain_ligands_counts(domain, group_stats)
    #The fresh recount has to match the tallies that were accumulated during the assignment
    assert group_stats == splits_dict[group_num]["ligands_pos_neg"], "counts mismatch in group "+str(group_num)
    group_stats_dict[group_num] = group_stats

group_stats_df = pd.DataFrame.from_dict(group_stats_dict)
group_stats_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
dna_neg,4730,3333,3521,4517,3689,5201,1952,3439,4007,3706
dna_pos,46,44,59,76,60,30,53,43,56,34
dnabackbone_neg,4747,3343,3529,4522,3712,5209,1963,3446,4022,3710
dnabackbone_pos,36,35,48,72,45,26,38,33,47,28
dnabase_neg,4781,3374,3575,4645,3739,5219,1996,3468,4050,3730
dnabase_pos,19,18,19,13,24,16,25,24,20,15
ion_neg,4386,2942,3273,4265,3431,4449,1799,2980,3623,3340
ion_pos,120,114,105,112,105,104,108,106,110,109
metabolite_neg,4191,3126,3273,4276,3344,4416,1823,2971,3597,2954
metabolite_pos,153,112,187,168,131,224,81,135,181,153


#### Saving the 10 folds selection

In [10]:
#Writing the groups assignment as a pickle file in the working directory
splits_pickle_path = curr_dir[0]+'/domain_10_splits_dict.pik'
with open(splits_pickle_path, 'wb') as handle:
    pickle.dump(splits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
#Writing the per-group positives/negatives table as a tab-separated file
group_stats_path = curr_dir[0]+"/domain_group_stats_df.csv"
group_stats_df.to_csv(group_stats_path, sep='\t')