## Domain-based split into folds - my chosen algorithm

In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import random
import sys

#Import utils functions
curr_dir = !pwd
sys.path.append(curr_dir[0]+"/../utils")
from prediction_general_funcs import ligands, score_cols_suffix, get_features_cols
from prop_threshold_funcs import create_negatives_datasets, create_positives_datasets, create_positives_datasets_combined, create_negatives_datasets_combined

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
#Working directory of the notebook, captured through the IPython shell escape;
#curr_dir[0] is used as the base of every input/output path below
curr_dir = !pwd
#pfam_version selects the "pfam-v<version>" output directory used when saving results
pfam_version = "31"
#datafile_date is part of the input file name
datafile_date = "08.06.18"
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "windowed_positions_features_mediode_filter_"+datafile_date+".csv"

#flags for creating negatives and positives:
#zero_prop and no_prop are forwarded to create_negatives_datasets_combined,
#all_ligands is forwarded to both the positives and negatives dataset builders,
#prec_th only appears in the names of some of the output files
zero_prop = True
no_prop = True
all_ligands = False
prec_th = 0.75

#Features table (the file is tab-separated despite the .csv extension; first column is the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = get_features_cols(features_all)

print "all samples positions #: "+str(features_all.shape[0])

#Fix the seeds so the random tie-breaking done with random.choice later is reproducible
#NOTE(review): no direct np.random use is visible in this notebook - presumably seeded for the utils; confirm
random.seed(0)
np.random.seed(0)

all samples positions #: 44872


### Remove all features columns to speed up running time

In [3]:
#Drop every feature column in a single in-place call (speeds up the rest of the notebook),
#then reset the columns list so the dataset builders receive no feature columns
features_all.drop(features_cols, axis=1, inplace=True)
features_cols = []

#### Datasets of positive examples by ligand

In [4]:
#Positive examples table per ligand, built from the whole dataset
ligands_positives_df = create_positives_datasets_combined(features_all, features_cols, all_ligands, True)
#From here on "ligands" is the set of keys returned by the builder (overrides the imported name)
ligands = ligands_positives_df.keys()

#Number of positive rows per ligand
ligands_pos_dict = dict((name, ligands_positives_df[name].shape[0]) for name in ligands)

#Ligand names ordered by ascending number of positives, ties broken by name
ligand_sort_by_pos_num = sorted(ligands_pos_dict, key=lambda name: (ligands_pos_dict[name], name))

dna #: 369
dnabase #: 161
dnabackbone #: 245
dna combined #: 397
rna #: 450
rnabase #: 290
rnabackbone #: 306
rna combined #: 531
peptide #: 436
ion #: 351
metabolite #: 522
druglike #: 763
sm #: 825




## Create a "smart" CV split

#### Helper functions 

In [5]:
def domain_ligands_counts(domain, group_stats):
    """Add the positive/negative row counts of one domain to group_stats in place.

    For every ligand in the global ``ligands``, the number of rows in the domain's
    positives table is added to ``group_stats[ligand+"_pos"]`` and the number of
    rows in its negatives table to ``group_stats[ligand+"_neg"]``. The tables are
    built from the rows of the global ``features_all`` whose "domain_name" equals
    ``domain``. Nothing is returned.
    """

    domain_rows = features_all[features_all["domain_name"] == domain]
    pos_by_ligand = create_positives_datasets_combined(domain_rows, features_cols, all_ligands, False)
    neg_by_ligand = create_negatives_datasets_combined(zero_prop, no_prop, domain_rows, features_cols, all_ligands, False)

    for ligand in ligands:
        for suffix, tables in (("_pos", pos_by_ligand), ("_neg", neg_by_ligand)):
            group_stats[ligand+suffix] += tables[ligand].shape[0]

In [6]:
def domain_add_positions(domain, positions_list):
    """Extend positions_list in place with the index labels of all rows of one domain.

    The rows are those of the global ``features_all`` whose "domain_name" equals
    ``domain``. Nothing is returned.
    """

    is_domain_row = features_all["domain_name"] == domain
    positions_list.extend(features_all[is_domain_row].index.tolist())

In [7]:
def update_pos_left(domains_left_list, stats_keys):
    """Return the total counts, per key of stats_keys, summed over the given domains.

    A fresh dict with every key of ``stats_keys`` set to 0 is created and
    ``domain_ligands_counts`` is applied to it once per domain in
    ``domains_left_list`` (the domains not yet distributed to the CV groups).
    """

    remaining = {key: 0 for key in stats_keys}
    for domain_name in domains_left_list:
        domain_ligands_counts(domain_name, remaining)
    return remaining

In [8]:
def calc_next_group(prev_group_num, domain, labels_included, splits_dict, positions_left, stats_keys, K):
    """Return the number of the CV group that should receive the next domain.

    Labels are visited from the one with the fewest positions left to distribute
    (ties broken by label name). The first visited label that is part of
    ``labels_included`` and for which the K groups are not all tied at the minimum
    decides: the group holding the lowest count of that label is returned, picking
    one at random (``random.choice``) when several groups share that lowest count.
    If no label decides, the groups are simply cycled: ``prev_group_num + 1``,
    wrapping back to 1 after ``K``.

    Args:
        prev_group_num: group number returned by the previous call (0 before the first).
        domain: unused; kept so existing callers keep working.
        labels_included: labels (e.g. "dna_pos") present in the domain being assigned.
        splits_dict: maps group number -> record whose "ligands_pos_neg" entry maps
            label -> count already assigned to that group.
        positions_left: maps label -> count still to be distributed.
        stats_keys: unused; kept so existing callers keep working.
        K: number of CV groups.

    Returns:
        The chosen group number (a key of ``splits_dict``, or the round-robin value).
    """

    #Fix the group order explicitly so that list index i always refers to group_nums[i],
    #instead of depending on the iteration order of the dict
    group_nums = sorted(splits_dict)

    #Per relevant label: the counts already assigned to each group, in group order
    group_pos_counts = {}
    for label in labels_included:
        group_pos_counts[label] = [splits_dict[group_num]["ligands_pos_neg"][label] for group_num in group_nums]

    #sorting the labels by number of positions left (from smallest), ties by label name
    labels_sort_by_num_left = [label for label, _ in sorted(positions_left.items(), key=lambda kv: (kv[1], kv[0]))]

    next_group_num = 0
    for sorted_label in labels_sort_by_num_left:
        #skip labels not included in the domain
        if sorted_label not in labels_included:
            continue
        #Get all CV groups where the current label has the minimal number of positions
        counts = group_pos_counts[sorted_label]
        lowest = min(counts)
        curr_min_idx = [idx for idx, count in enumerate(counts) if count == lowest]
        #All groups tied (e.g. all still 0): this label cannot discriminate, try the next one
        if len(curr_min_idx) == K:
            continue
        #Randomly choose one of the minimal groups for the current label
        next_group_num = group_nums[random.choice(curr_min_idx)]
        break

    #No label could decide: just assign to the next CV group in round-robin order
    if next_group_num == 0:
        next_group_num = prev_group_num + 1
        if next_group_num > K:
            next_group_num = 1

    return next_group_num

##### Count the different ligands distributions amongst domains

In [9]:
#All count labels: "<ligand>_pos" for every ligand followed by "<ligand>_neg" for every ligand
ligand_labels = [l+"_pos" for l in ligands] + [l+"_neg" for l in ligands]
domains_list = features_all["domain_name"].unique().tolist()
#Maps a label-combination string ("label1$label2$...") to the domains having exactly those labels
ligands_type_dict = defaultdict(list)

for domain in domains_list:

    #Get pos/neg counts for the domain
    ligands_dict = dict.fromkeys(ligand_labels, 0)
    domain_ligands_counts(domain, ligands_dict)

    #Keep the labels with a non-zero count, in the iteration order of the counts dict
    curr_labels_for_domain = [ligand_label for ligand_label in ligands_dict if ligands_dict[ligand_label] > 0]

    #Build the combination key: every label followed by "$"
    ligand_str = "".join(ligand_label+"$" for ligand_label in curr_labels_for_domain)

    ligands_type_dict[ligand_str].append(domain)

##### Create a prioritized list of label combinations

In [10]:
def prioritize_labels(labels_to_prioritize, labels_sort_by_num):
    """Choose which of several candidate label combinations should come first.

    The labels of ``labels_sort_by_num`` are visited in order. At each step the
    candidates whose text contains the current label are preferred; if none contain
    it, all current candidates are kept. As soon as exactly one candidate remains
    preferred it is returned. If the labels run out while several candidates are
    still tied, a message is printed and one of them is picked with ``random.choice``.

    Args:
        labels_to_prioritize: non-empty list of candidate combination strings.
            The list itself is not modified.
        labels_sort_by_num: labels in priority order (highest priority first).

    Returns:
        The selected combination string.

    Raises:
        ValueError: if ``labels_to_prioritize`` is empty.
    """

    if not labels_to_prioritize:
        raise ValueError("labels_to_prioritize must not be empty")

    candidates = labels_to_prioritize
    next_label = ""
    for sorted_label in labels_sort_by_num:

        #Candidates containing the current label win this round; if no candidate
        #contains it the round is a full tie and every candidate stays
        containing = [label for label in candidates if sorted_label in label]
        best_labels = containing if containing else candidates

        if len(best_labels) == 1:
            #one best label
            next_label = best_labels[0]
            break

        #Still tied: continue to the next label with just the best from this round
        candidates = best_labels

    #The labels can't be distinguished? choose one randomly
    if next_label == "":
        print("chose the next_label randomly")
        next_label = random.choice(candidates)

    return next_label

In [11]:
#Sort labels by positions number
def prioritize_label_combinations(domains_list, ligand_labels, label_combinations):
    """Reorder label combinations so that ties in label count are broken by scarcity.

    ``label_combinations`` is scanned in order and cut into consecutive runs; a new
    run starts whenever the number of labels in a combination (its count of "$")
    drops below that of the previous one. Inside each run the combinations are
    emitted one at a time using ``prioritize_labels``, which prefers combinations
    containing the labels with the fewest positions left in ``domains_list``.
    The order of the runs themselves is preserved.

    Args:
        domains_list: domains not yet assigned, used to count the positions left per label.
        ligand_labels: all count labels ("<ligand>_pos" / "<ligand>_neg").
        label_combinations: combination strings, expected sorted by descending label count.
            The list itself is not modified.

    Returns:
        A new list with the same combinations in priority order
        (an empty list when ``label_combinations`` is empty).
    """

    #Nothing to prioritize (previously this ended in an IndexError)
    if not label_combinations:
        return []

    #Calculating updated sorting of the labels: fewest positions left first, ties by label name
    positions_left = update_pos_left(domains_list, ligand_labels)
    labels_sort_by_num = [label for label, _ in sorted(positions_left.items(), key=lambda kv: (kv[1], kv[0]))]

    priority_label_combinations = []

    def _flush(pending):
        #Emit all combinations of one run, best first; the last one left needs no comparison
        while len(pending) > 1:
            next_label = prioritize_labels(pending, labels_sort_by_num)
            priority_label_combinations.append(next_label)
            pending.remove(next_label)
        priority_label_combinations.append(pending[0])

    #break ties in ligands combinations sorting: prioritize labels with fewer positions
    labels_to_prioritize = []
    last_labels_num = 0
    for curr_label_comb in label_combinations:

        curr_labels_num = curr_label_comb.count("$")

        if curr_labels_num < last_labels_num:
            #The label count dropped: the aggregated run is complete, start a new one
            _flush(labels_to_prioritize)
            labels_to_prioritize = [curr_label_comb]
        else:
            labels_to_prioritize.append(curr_label_comb)

        last_labels_num = curr_labels_num

    #prioritize the labels left at the end
    _flush(labels_to_prioritize)

    return priority_label_combinations

### Create the K-folds groups

In [12]:
%%time
K = 5
#Init the splits dict
splits_dict = defaultdict(dict)
for group_num in range(1,K+1):
    splits_dict[group_num]["num"] = 0
    splits_dict[group_num]["domains"] = []
    splits_dict[group_num]["positions"] = []
    splits_dict[group_num]["ligands_cnt"] = dict.fromkeys(ligands, 0)
    splits_dict[group_num]["ligands_pos_neg"] = dict.fromkeys(ligand_labels, 0)
    
domains_left = domains_list[:]
positions_left = update_pos_left(domains_left, ligand_labels)
next_group_to_assign = 0

#Sort labels combinations based on labels number
priority_label_combinations = ligands_type_dict.keys()
priority_label_combinations.sort(key=lambda x: x.count('$'), reverse=True)


#Assign the domains that has several ligands
while (len(priority_label_combinations) != 0):
    priority_label_combinations = prioritize_label_combinations(domains_left, ligand_labels, priority_label_combinations)
    label_comb = priority_label_combinations[0]
    
    curr_domains_group = ligands_type_dict[label_comb]
    labels_included = label_comb.split("$")
    labels_included.remove("") #remove the last element
    
    #For each domain in current ligands comb: add to the CV splits
    for domain in curr_domains_group:
        
        next_group_to_assign = calc_next_group(next_group_to_assign, domain, labels_included, splits_dict, positions_left, ligand_labels, K)
        splits_dict[next_group_to_assign]["domains"].append(domain)
        splits_dict[next_group_to_assign]["num"] += 1
        for label in labels_included:
            if (label.find("pos") != -1):
                ligand = label[:label.find("_")]
                splits_dict[next_group_to_assign]["ligands_cnt"][ligand] += 1
        domain_ligands_counts(domain,  splits_dict[next_group_to_assign]["ligands_pos_neg"])
        domain_add_positions(domain, splits_dict[next_group_to_assign]["positions"])
        domains_left.remove(domain)
        positions_left = update_pos_left(domains_left, ligand_labels)
        
        print "Finished domain "+domain
    print "finished label_comb: "+label_comb
    priority_label_combinations.remove(label_comb)

Finished domain NTP_transf_2
Finished domain RNase_T
finished label_comb: dna_pos$rna_pos$rna_neg$metabolite_neg$ion_neg$peptide_neg$druglike_pos$sm_neg$druglike_neg$sm_pos$metabolite_pos$ion_pos$dna_neg$
Finished domain Helicase_C
finished label_comb: dna_pos$rna_pos$rna_neg$metabolite_neg$ion_neg$peptide_neg$druglike_pos$sm_neg$peptide_pos$druglike_neg$sm_pos$metabolite_pos$dna_neg$
Finished domain Exo_endo_phos
finished label_comb: dna_pos$rna_neg$metabolite_neg$ion_neg$peptide_neg$druglike_pos$sm_neg$druglike_neg$sm_pos$metabolite_pos$ion_pos$dna_neg$
Finished domain Lectin_C
Finished domain Peptidase_C1
Finished domain Peptidase_M14
finished label_comb: rna_neg$metabolite_neg$ion_neg$peptide_neg$druglike_pos$sm_neg$peptide_pos$druglike_neg$sm_pos$metabolite_pos$ion_pos$dna_neg$
Finished domain tRNA-synt_1
finished label_comb: rna_pos$rna_neg$metabolite_neg$ion_neg$peptide_neg$druglike_pos$sm_neg$druglike_neg$sm_pos$metabolite_pos$ion_pos$dna_neg$
Finished domain RRM_1
finished lab

Finished domain 7TM_GPCR_Srsx
Finished domain ANAPC3
Finished domain Adap_comp_sub
Finished domain Arm
Finished domain BTB
Finished domain CAP_GLY
Finished domain Clathrin_propel
Finished domain Cyclin_N
Finished domain FERM_M
Finished domain FHA
Finished domain Filamin
Finished domain Hist_deacetyl
Finished domain Hormone_recep
Finished domain IRS
Finished domain Lipocalin
Finished domain MHC_II_alpha
Finished domain MIT
Finished domain PCI
Finished domain PID
Finished domain PTCB-BRCT
Finished domain Peptidase_C14
Finished domain Phospholip_A2_1
Finished domain Rad60-SLD
Finished domain Recep_L_domain
Finished domain SH3_1
Finished domain SH3_9
Finished domain SPRY
Finished domain Spin-Ssty
Finished domain TPR_10
Finished domain UQ_con
Finished domain fn1
Finished domain fn3
Finished domain ubiquitin
finished label_comb: rna_neg$metabolite_neg$ion_neg$peptide_neg$sm_neg$peptide_pos$druglike_neg$dna_neg$
Finished domain WD40
finished label_comb: rna_pos$rna_neg$metabolite_neg$ion_neg$

In [13]:
#Pickle the full splits dict under the "combined_dna0.5_rna0.25_ion0.75" file name
splits_combined_path = curr_dir[0]+"/pfam-v"+pfam_version+"/domain_"+str(K)+"_folds_combined_dna0.5_rna0.25_ion0.75_prec_dict.pik"
with open(splits_combined_path, 'wb') as handle:
    pickle.dump(splits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [12]:
#Pickle the full splits dict under a file name that carries the prec_th value
splits_prec_path = curr_dir[0]+"/pfam-v"+pfam_version+"/domain_"+str(K)+"_folds_"+str(prec_th)+"_prec_dict.pik"
with open(splits_prec_path, 'wb') as handle:
    pickle.dump(splits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

#### Print domains counts results table

In [14]:
#One column per CV group, one row per ligand: number of assigned domains with positives for that ligand
ligands_cnt_dict = dict((group_num, splits_dict[group_num]["ligands_cnt"]) for group_num in splits_dict.keys())
ligands_cnt_df = pd.DataFrame.from_dict(ligands_cnt_dict)
ligands_cnt_df

Unnamed: 0,1,2,3,4,5
dna,6,9,7,5,6
druglike,25,27,22,18,25
ion,18,17,15,21,20
metabolite,14,21,14,14,17
peptide,16,10,18,15,13
rna,5,5,6,7,3
sm,25,30,26,21,30


In [15]:
#Write the domains count table (tab-separated) under the "combined_dna0.5_rna0.25_ion0.75" file name
domains_cnt_combined_path = curr_dir[0]+"/pfam-v"+pfam_version+"/domain_"+str(K)+"_folds_combined_dna0.5_rna0.25_ion0.75_prec_domains_cnt_df.csv"
ligands_cnt_df.to_csv(domains_cnt_combined_path, sep='\t')

In [14]:
#Write the domains count table (tab-separated) under a file name that carries the prec_th value
domains_cnt_prec_path = curr_dir[0]+"/pfam-v"+pfam_version+"/domain_"+str(K)+"_folds_"+str(prec_th)+"_prec_domains_cnt_df.csv"
ligands_cnt_df.to_csv(domains_cnt_prec_path, sep='\t')

### Count the number of positives negatives in each group

In [16]:
#Count labels: "<ligand>_pos" for every ligand followed by "<ligand>_neg" for every ligand
stats_keys = [l+"_pos" for l in ligands] + [l+"_neg" for l in ligands]

#Recount the positives/negatives of every CV group from the domains assigned to it
group_stats_dict = {}
for group_num in range(1,K+1):
    group_stats = dict.fromkeys(stats_keys, 0)
    for domain in splits_dict[group_num]["domains"]:
        domain_ligands_counts(domain, group_stats)
    group_stats_dict[group_num] = group_stats

#One column per CV group, one row per label
group_stats_df = pd.DataFrame.from_dict(group_stats_dict)
group_stats_df

Unnamed: 0,1,2,3,4,5
dna_neg,9037,8894,8153,8761,9039
dna_pos,70,78,88,77,84
druglike_neg,6926,7068,6981,7027,7016
druglike_pos,158,147,157,150,151
ion_neg,8018,8029,7466,7929,8188
ion_pos,74,72,67,67,71
metabolite_neg,8111,8165,7600,7739,8023
metabolite_pos,98,97,112,113,102
peptide_neg,8223,8269,7619,8235,8759
peptide_pos,86,91,85,87,87


In [17]:
#Write the per-group pos/neg counts table (tab-separated) under the "combined_dna0.5_rna0.25_ion0.75" file name
group_stats_combined_path = curr_dir[0]+"/pfam-v"+pfam_version+"/domain_"+str(K)+"_folds_combined_dna0.5_rna0.25_ion0.75_prec_group_stats_df.csv"
group_stats_df.to_csv(group_stats_combined_path, sep='\t')

In [16]:
#Write the per-group pos/neg counts table (tab-separated) under a file name that carries the prec_th value
group_stats_prec_path = curr_dir[0]+"/pfam-v"+pfam_version+"/domain_"+str(K)+"_folds_"+str(prec_th)+"_prec_group_stats_df.csv"
group_stats_df.to_csv(group_stats_prec_path, sep='\t')