In [1]:
#Basic imports
import os
import pickle
import random
import sys
from collections import defaultdict

import numpy as np
import pandas as pd

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
# Working directory of the notebook. Kept as a one-element list because the
# rest of the notebook indexes it as curr_dir[0] (the shape `!pwd` returned).
# os.getcwd() works outside IPython and on non-POSIX systems as well.
curr_dir = [os.getcwd()]
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

# Number of trailing columns that are sliced off when selecting the feature columns
bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print("all samples positions #: "+str(features_all.shape[0]))

#ligand binding domains dictionary
# NOTE(review): pickle.load can execute arbitrary code - only load pickles from a trusted source.
with open(curr_dir[0]+"/../ligands_negatives_domains_dict.pik", 'rb') as handle:
    domains_bind_dict = pickle.load(handle)

# Seed both RNGs so the randomized tie-breaking of the CV split is reproducible
random.seed(0)
np.random.seed(0)

all samples positions #: 38944


#### Datasets of positive examples by ligand

In [3]:
# Minimal binding score for a position to count as a positive example
bind_th = 0.1

# Number of positive positions per ligand
ligands_pos_dict = {}
for ligand in ligands:
    is_binding = features_all[ligand+"_binding_score"] >= bind_th
    pos_num = len(features_all[is_binding])
    print(ligand+" #: "+str(pos_num))
    ligands_pos_dict[ligand] = pos_num

# Ligand names ordered from fewest to most positives (ties broken by name)
ligand_sort_by_pos_num = sorted(ligands_pos_dict, key=lambda name: (ligands_pos_dict[name], name))

dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


## Create a "smart" CV split

#### Helper functions 

In [4]:
def domain_ligands_counts(domain, group_stats, bind_th=0.1):
    """Add the positive/negative position counts of one domain to group_stats.

    Reads the rows of the global features_all table whose "domain_name" equals
    `domain` and, for every ligand in the global `ligands` list, updates
    group_stats in place:

      * group_stats[ligand+"_pos"] grows by the number of positions whose
        "<ligand>_binding_score" is >= bind_th
      * group_stats[ligand+"_neg"] grows by the number of positions whose
        score is exactly 0

    Positions with a score strictly between 0 and bind_th are counted as neither.

    Args:
        domain: value to match against the "domain_name" column.
        group_stats: dict that already holds a numeric entry for every
            "<ligand>_pos" and "<ligand>_neg" key; modified in place.
        bind_th: minimal binding score of a positive position (default 0.1,
            the value that used to be hard-coded here).

    Returns:
        None.
    """
    curr_domain_table = features_all[features_all["domain_name"] == domain]

    for ligand in ligands:
        scores = curr_domain_table[ligand+"_binding_score"]
        group_stats[ligand+"_pos"] += np.count_nonzero(scores >= bind_th)
        group_stats[ligand+"_neg"] += np.count_nonzero(scores == 0)

In [5]:
def domain_add_positions(domain, positions_list):
    """Extend positions_list (in place) with the row index labels of one domain.

    The rows are taken from the global features_all table, selecting those
    whose "domain_name" equals `domain`. Nothing is returned.
    """
    in_domain = features_all["domain_name"] == domain
    domain_positions = features_all[in_domain].index.tolist()
    positions_list.extend(domain_positions)

In [6]:
def update_pos_left(domains_left_list, stats_keys):
    """Count the positions that are still left to distribute to the CV groups.

    Builds a new dict with one zero-initialised entry per key in stats_keys and
    accumulates into it the counts of every domain in domains_left_list
    (via domain_ligands_counts). The inputs are not modified.

    Returns:
        The dict of accumulated counts.
    """
    remaining_counts = {stats_key: 0 for stats_key in stats_keys}

    for remaining_domain in domains_left_list:
        domain_ligands_counts(remaining_domain, remaining_counts)

    return remaining_counts

In [7]:
def calc_next_group(prev_group_num, domain, labels_included, splits_dict, positions_left, stats_keys, K):
    """Return the number of the CV group the next domain should be assigned to.

    The labels are visited from the one with the fewest positions left to the
    one with the most (ties broken by label name). The first visited label that
    belongs to labels_included and whose counts are NOT equal across all K
    groups decides: one of the groups holding the minimal count of that label
    is picked at random (random.choice). If every relevant label is tied across
    all groups, the groups are filled round-robin: prev_group_num + 1, wrapping
    back to 1 after K.

    Args:
        prev_group_num: group number returned by the previous call (0 before
            the first call).
        domain: unused; kept for interface compatibility.
        labels_included: labels (e.g. "dna_pos") present in the current domain.
        splits_dict: maps group number (1..K) to a dict whose
            "ligands_pos_neg" entry maps label -> count already in the group.
        positions_left: maps label -> number of positions not yet assigned.
        stats_keys: unused; kept for interface compatibility.
        K: number of CV groups.

    Returns:
        The chosen group number, between 1 and K.
    """
    # Visit the groups in ascending group-number order, so that position i of
    # every counts list is guaranteed to describe group_nums[i] regardless of
    # the dict's own iteration order.
    group_nums = sorted(splits_dict.keys())

    #Calculate the number of positions in the groups for the relevant labels
    group_pos_counts = {}
    for label in labels_included:
        group_pos_counts[label] = [splits_dict[group_num]["ligands_pos_neg"][label]
                                   for group_num in group_nums]

    #sorting the labels by number of positions left (from smallest)
    labels_sort_by_num_left = sorted(positions_left, key=lambda label: (positions_left[label], label))

    for sorted_label in labels_sort_by_num_left:
        #skip labels not included in the domain
        if sorted_label not in labels_included:
            continue
        #Get all CV groups where the current label has its minimal count
        counts = np.asarray(group_pos_counts[sorted_label])
        curr_min_idx = np.where(counts == counts.min())[0].tolist()
        #A label that is tied across all K groups (e.g. all 0s) can't discriminate
        if len(curr_min_idx) == K:
            continue
        #Randomly choose one of the minimal groups for the current label
        return group_nums[random.choice(curr_min_idx)]

    #All relevant labels were tied: just assign to the next CV group
    next_group_num = prev_group_num + 1
    if next_group_num > K:
        next_group_num = 1

    return next_group_num

##### Count the distribution of ligand labels among the domains

In [8]:
# Every ligand contributes a "<ligand>_pos" and a "<ligand>_neg" label
ligand_labels = [l+"_pos" for l in ligands] + [l+"_neg" for l in ligands]
domains_list = features_all["domain_name"].unique().tolist()

# Maps a "$"-terminated concatenation of labels to the domains having exactly those labels
ligands_type_dict = defaultdict(list)

for domain in domains_list:

    #Get pos/neg counts for the domain
    ligands_dict = dict.fromkeys(ligand_labels, 0)
    domain_ligands_counts(domain, ligands_dict)

    #Keep the labels with non-zero count, in the dict's iteration order
    curr_labels_for_domain = [ligand_label for ligand_label in ligands_dict.keys()
                              if ligands_dict[ligand_label] > 0]

    #The combination key of the domain: every present label followed by "$"
    ligand_str = "".join(ligand_label+"$" for ligand_label in curr_labels_for_domain)

    ligands_type_dict[ligand_str].append(domain)

##### Create a prioritized list of label combinations

In [9]:
def prioritize_labels(labels_to_prioritize, labels_sort_by_num):
    """Pick the label combination that should be handled next.

    The labels in labels_sort_by_num are visited in order. For each one, the
    candidates are narrowed down to the combinations that contain that label
    (when no candidate contains it, the candidates stay as they are). As soon
    as a single candidate is left it is returned. If all labels were visited
    and several candidates remain, one of them is chosen at random.

    A combination "contains" a label only if the label is one of its
    "$"-separated tokens, so a label never matches inside a longer label name.

    Args:
        labels_to_prioritize: non-empty list of label combinations, each a
            string of labels every one of which is followed by "$".
            The list itself is not modified.
        labels_sort_by_num: labels ordered from highest to lowest priority.

    Returns:
        One element of labels_to_prioritize.
    """
    candidates = list(labels_to_prioritize)

    for sorted_label in labels_sort_by_num:
        containing = [comb for comb in candidates if sorted_label in comb.split("$")]
        #Keep just the best from this round; nobody matching means everybody is tied
        if containing:
            candidates = containing
        #one best label
        if len(candidates) == 1:
            return candidates[0]

    #The labels can't be distinguished? choose one randomly
    print("chose the next_label randomly")
    return random.choice(candidates)

In [10]:
def _append_by_priority(pending, labels_sort_by_num, ordered):
    "Append the combinations of pending to ordered, best first (pending keeps at most one element)."
    while len(pending) > 1:
        next_label = prioritize_labels(pending, labels_sort_by_num)
        ordered.append(next_label)
        pending.remove(next_label)
    #zero or one combination left: nothing to rank
    ordered.extend(pending)


#Sort labels by positions number
def prioritize_label_combinations(domains_list, ligand_labels, label_combinations):
    """Order label combinations, breaking ties in favour of the scarcest labels.

    label_combinations is expected to be ordered by decreasing number of labels
    (number of "$" characters). Consecutive combinations are collected into a
    bucket until a combination with fewer labels shows up; each bucket is then
    ordered with prioritize_labels, using the labels sorted by the number of
    positions left in domains_list (smallest first, ties broken by label name).

    Args:
        domains_list: domains that have not been assigned to a CV group yet.
        ligand_labels: all the "<ligand>_pos" / "<ligand>_neg" labels.
        label_combinations: list of "$"-terminated label combinations.
            It is not modified.

    Returns:
        A new list with the same combinations in priority order
        (an empty list for empty input).
    """
    #Calculating updated sorting of the labels
    positions_left = update_pos_left(domains_list, ligand_labels)
    labels_sort_by_num = sorted(positions_left, key=lambda label: (positions_left[label], label))

    #break ties in ligands combinations sorting: prioritize labels with fewer positions
    priority_label_combinations = []

    labels_to_prioritize = []
    last_labels_num = 0
    for curr_label_comb in label_combinations:

        curr_labels_num = curr_label_comb.count("$")

        if curr_labels_num < last_labels_num:
            #a smaller combination starts a new bucket: flush the aggregated one
            _append_by_priority(labels_to_prioritize, labels_sort_by_num, priority_label_combinations)
            labels_to_prioritize = [curr_label_comb]
        else:
            labels_to_prioritize.append(curr_label_comb)

        last_labels_num = curr_labels_num

    #prioritize the labels left at the end
    _append_by_priority(labels_to_prioritize, labels_sort_by_num, priority_label_combinations)

    return priority_label_combinations

### Create the K-folds groups

In [11]:
# Number of CV groups (folds) to create
K = 3
#Init the splits dict: one entry per CV group, keyed by the group number 1..K
splits_dict = defaultdict(dict)
for group_num in range(1,K+1):
    # "num": number of domains assigned to the group
    splits_dict[group_num]["num"] = 0
    # "domains": names of the domains assigned to the group
    splits_dict[group_num]["domains"] = []
    # "positions": features_all index labels of all the rows of those domains
    splits_dict[group_num]["positions"] = []
    # "ligands_cnt": per ligand, number of assigned domains that carry its "_pos" label
    splits_dict[group_num]["ligands_cnt"] = dict.fromkeys(ligands, 0)
    # "ligands_pos_neg": per label, number of positions accumulated in the group
    splits_dict[group_num]["ligands_pos_neg"] = dict.fromkeys(ligand_labels, 0)
    
# Work on a copy: domains are removed from domains_left as they get assigned
domains_left = domains_list[:]
positions_left = update_pos_left(domains_left, ligand_labels)
# 0 is not a valid group number; the round-robin fallback of calc_next_group turns it into group 1
next_group_to_assign = 0

#Sort labels combinations based on labels number (most labels first)
# NOTE: relies on Python 2, where dict.keys() returns a list that can be sorted in place
priority_label_combinations = ligands_type_dict.keys()
priority_label_combinations.sort(key=lambda x: x.count('$'), reverse=True)


#Assign the domains to the CV groups, one label combination at a time
while (len(priority_label_combinations) != 0):
    # Re-rank the remaining combinations against the domains that are still unassigned
    priority_label_combinations = prioritize_label_combinations(domains_left, ligand_labels, priority_label_combinations)
    label_comb = priority_label_combinations[0]
    
    curr_domains_group = ligands_type_dict[label_comb]
    labels_included = label_comb.split("$")
    labels_included.remove("") #remove the last element (empty string after the trailing "$")
    
    #For each domain in current ligands comb: add to the CV splits
    for domain in curr_domains_group:
        
        next_group_to_assign = calc_next_group(next_group_to_assign, domain, labels_included, splits_dict, positions_left, ligand_labels, K)
        splits_dict[next_group_to_assign]["domains"].append(domain)
        splits_dict[next_group_to_assign]["num"] += 1
        # Count the domain once for every ligand it has a "_pos" label for
        for label in labels_included:
            if (label.find("pos") != -1):
                ligand = label[:label.find("_")]
                splits_dict[next_group_to_assign]["ligands_cnt"][ligand] += 1
        domain_ligands_counts(domain,  splits_dict[next_group_to_assign]["ligands_pos_neg"])
        domain_add_positions(domain, splits_dict[next_group_to_assign]["positions"])
        domains_left.remove(domain)
        # Recount what is left over all the unassigned domains
        positions_left = update_pos_left(domains_left, ligand_labels)
        
        print "Finished domain "+domain
    print "finished label_comb: "+label_comb
    priority_label_combinations.remove(label_comb)

Finished domain Ank_2
Finished domain Metallophos
finished label_comb: dnabase_neg$rnabase_pos$metabolite_pos$rna_pos$dnabase_pos$rna_neg$rnabackbone_pos$metabolite_neg$ion_neg$peptide_neg$dnabackbone_neg$dnabackbone_pos$rnabase_neg$peptide_pos$rnabackbone_neg$dna_pos$ion_pos$dna_neg$
Finished domain RRM_1
finished label_comb: dnabase_neg$rnabase_pos$rna_pos$dnabase_pos$rna_neg$rnabackbone_pos$metabolite_neg$ion_neg$peptide_neg$dnabackbone_neg$dnabackbone_pos$rnabase_neg$peptide_pos$rnabackbone_neg$dna_pos$ion_pos$dna_neg$
Finished domain RNase_T
finished label_comb: dnabase_neg$rnabase_pos$metabolite_pos$rna_pos$dnabase_pos$rna_neg$rnabackbone_pos$metabolite_neg$ion_neg$peptide_neg$dnabackbone_neg$dnabackbone_pos$rnabase_neg$rnabackbone_neg$dna_pos$ion_pos$dna_neg$
Finished domain CSD
Finished domain zf-CCHC
finished label_comb: dnabase_neg$rnabase_pos$rna_pos$dnabase_pos$rna_neg$rnabackbone_pos$metabolite_neg$ion_neg$peptide_neg$dnabackbone_neg$dnabackbone_pos$rnabase_neg$rnabackbone

Finished domain NAD_binding_1
Finished domain NDK
Finished domain PH
Finished domain PI-PLC-X
Finished domain PI3_PI4_kinase
Finished domain Peptidase_M14
Finished domain Pyr_redox_2
Finished domain Rhodanese
Finished domain SNF
Finished domain Serum_albumin
Finished domain Sulfatase
Finished domain Sushi
Finished domain TTL
Finished domain adh_short
finished label_comb: dnabase_neg$metabolite_pos$rna_neg$metabolite_neg$ion_neg$peptide_neg$dnabackbone_neg$rnabase_neg$rnabackbone_neg$ion_pos$dna_neg$
Finished domain AAA_8
Finished domain ADH_zinc_N
Finished domain AMP-binding_C
Finished domain CRAL_TRIO
Finished domain Cation_ATPase
Finished domain F420_oxidored
Finished domain Fascin
Finished domain GAF
Finished domain Glyco_transf_29
Finished domain IL8
Finished domain IQ
Finished domain Lig_chan
Finished domain Macro
Finished domain MreB_Mbl
Finished domain Oxysterol_BP
Finished domain PA
Finished domain Pkinase_C
Finished domain START
Finished domain Somatomedin_B
Finished domain TS

#### Print the table of domain counts per group

In [12]:
# Per-ligand domain counts of every CV group, one column per group
ligands_cnt_dict = {group_num: splits_dict[group_num]["ligands_cnt"]
                    for group_num in splits_dict.keys()}
ligands_cnt_df = pd.DataFrame.from_dict(ligands_cnt_dict)
ligands_cnt_df

Unnamed: 0,1,2,3
dna,12,10,13
dnabackbone,11,10,13
dnabase,11,9,9
ion,59,62,69
metabolite,44,38,44
peptide,33,35,33
rna,8,8,9
rnabackbone,8,7,9
rnabase,8,7,8


### Count the number of positives and negatives in each group

In [13]:
# One "<ligand>_pos" and one "<ligand>_neg" counter per ligand
stats_keys = [l+"_pos" for l in ligands]
stats_keys.extend([l+"_neg" for l in ligands])

# Maps group number -> {label: number of positions in the group}
group_stats_dict = {}

for group_num in range(1,K+1):
    group_stats = dict.fromkeys(stats_keys, 0)
    # Recount from the assigned domains with the shared helper, instead of
    # repeating its counting logic here
    for domain in splits_dict[group_num]["domains"]:
        domain_ligands_counts(domain, group_stats)

    group_stats_dict[group_num] = group_stats
group_stats_df = pd.DataFrame.from_dict(group_stats_dict)
group_stats_df

Unnamed: 0,1,2,3
dna_neg,12390,12935,12770
dna_pos,185,151,165
dnabackbone_neg,12435,12967,12801
dnabackbone_pos,145,126,137
dnabase_neg,12574,13060,12943
dnabase_pos,65,62,66
ion_neg,11545,11503,11440
ion_pos,369,362,362
metabolite_neg,11175,11309,11487
metabolite_pos,513,504,508


#### Saving the K-fold selection (K = 3)

In [14]:
# Persist the CV split. The number of folds in the file name is taken from K,
# so the name stays correct when K is changed (K = 3 gives domain_3_splits_dict.pik).
with open(curr_dir[0]+('/domain_%d_splits_dict.pik' % K), 'wb') as handle:
    pickle.dump(splits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Tab-separated per-group statistics. The number of folds in the file name is
# taken from K (K = 3 gives domain_3_group_stats_df.csv).
group_stats_df.to_csv(curr_dir[0]+("/domain_%d_group_stats_df.csv" % K), sep='\t')