In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import random
import sys

from sklearn import utils

#Import utils functions
sys.path.append('/home/anat/Research/ExAC/10.Prediction/utils')
from neg_pos_funcs import create_negatives_datasets, create_positives_datasets

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
#Working directory of the notebook, captured from the shell.
#The IPython `!` capture yields a list of output lines, hence the curr_dir[0] used everywhere below
curr_dir = !pwd
#Folder and file name of the tab-separated features table
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

#Number of trailing columns that are sliced off when selecting the feature columns below
#NOTE(review): `ligands` lists 9 names, so there is presumably one extra score column at the end - confirm against the table
bind_scores_num = 10

#Features table (first file column becomes the index)
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
#Ligand names; each one has a "<ligand>_binding_score" column that the cells below read
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#ligand binding domains dictionary
#NOTE: unpickling can execute arbitrary code - only load this file from a trusted location
with open(curr_dir[0]+"/../ligands_negatives_domains_dict.pik", 'rb') as handle:
    domains_bind_dict = pickle.load(handle)

#Seed both Python's and NumPy's global generators so the random tie-breaking in the split is reproducible
random.seed(0)
np.random.seed(0)
#Flags handed to create_negatives_datasets further down; their exact effect is defined in neg_pos_funcs
ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False
FILTER_MAX_SCORE_ZERO = False

all samples positions #: 38944


## Implementing "Iterative stratification"
From the paper: Sechidis, Tsoumakas & Vlahavas, "On the Stratification of Multi-Label Data", ECML PKDD 2011.

##### Algorithm input

In [3]:
#Names of the binding-score (label) columns, one per ligand and in the order of `ligands`
labels_columns = [ligand_name+"_binding_score" for ligand_name in ligands]

In [4]:
#Keep only the label columns and shuffle the rows with a fixed seed,
#so the order is random but identical on every run
D = utils.shuffle(features_all.loc[:,labels_columns], random_state=0)

In [5]:
#Label set: a "<ligand>_pos" and a "<ligand>_neg" label for every ligand (all positives first, then all negatives)
L = [l+"_pos" for l in ligands]
L.extend([l+"_neg" for l in ligands])
#Number of folds
k = 10
#Desired proportion of the examples in each fold: equal shares derived from k.
#(A hard-coded 0.1 only sums to 1 when k == 10; 1.0/k gives the same value for k = 10.)
R = [1.0/k for fold in range(k)]

##### Calculating the desired number of examples, $c_j$, at each subset $S_j$

In [6]:
#Desired number of examples per fold: total number of examples times the fold's proportion
C_all = [D.shape[0]*R[j] for j in range(k)]

##### Calculating the desired number of examples of each label at each subset

In [7]:
#Per-ligand negatives, built by the project-local helper imported from neg_pos_funcs
#(the three flags are the constants set in the input cell; what they filter is defined inside the helper)
ligands_negatives_df = create_negatives_datasets(FILTER_DOMAIN, ABSOLUTE_NEGATIVES, FILTER_MAX_SCORE_ZERO, features_all, features_cols)
#Binding-score threshold: passed to the positives helper here and used by the cells below as the "positive" cutoff (score >= bind_th)
bind_th = 0.1
#Per-ligand positives for that threshold.
#Both results are indexed by ligand name below and .shape[0] is read from each entry -
#presumably dicts of DataFrames; verify in neg_pos_funcs
ligands_features_df = create_positives_datasets(bind_th, features_all, features_cols)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191
dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


In [8]:
#Desired number of examples of every label in every fold
C_labels = {}
for label in L:
    #Size of this label's example pool in the initial set
    (ligand, pos_neg) = label.split("_")
    source_tables = ligands_features_df if pos_neg == "pos" else ligands_negatives_df
    D_i = source_tables[ligand].shape[0]
    #Share of that pool each fold should receive
    C_labels[label] = [D_i*R[j] for j in range(k)]

##### Iterative process

In [9]:
def position_ligands_counts(pos, group_stats):
    """Add one position's ligand labels to a fold's running label counts.

    Looks up row `pos` of the global `features_all` table and, for every
    ligand in the global `ligands` list, increments
    group_stats["<ligand>_pos"] when the ligand's binding score is at least
    the global `bind_th`, and group_stats["<ligand>_neg"] when the score is
    exactly 0. A score strictly between 0 and `bind_th` counts as neither.

    pos: index label of the position in `features_all`.
    group_stats: dict with a counter for every "<ligand>_pos" and
        "<ligand>_neg" key; it is updated in place.
    Returns None.
    """

    curr_pos_rec = features_all.loc[pos,:]

    for ligand in ligands:
        score = curr_pos_rec[ligand+"_binding_score"]
        #Use the same global threshold as calc_labels_numbers (instead of a literal 0.1),
        #so the per-fold statistics and the stratification always agree on what a positive is
        if (score >= bind_th):
            group_stats[ligand+"_pos"] += 1
        if (score == 0):
            group_stats[ligand+"_neg"] += 1

In [10]:
def calc_labels_numbers(D,L):
    """Count, for every label in L, the rows of D that carry that label.

    A label is "<ligand>_pos" or "<ligand>_neg" and is evaluated on the
    "<ligand>_binding_score" column of D: "pos" counts the rows whose score
    is at least the global `bind_th`, any other suffix counts the rows whose
    score is exactly 0.

    Returns a list of counts in the same order as L.
    """

    def _rows_with(label):
        (ligand, pos_neg) = label.split("_")
        scores = D[ligand+"_binding_score"]
        if (pos_neg == "pos"):
            selected = scores >= bind_th
        else:
            selected = scores == 0
        return len(D[selected])

    return [_rows_with(label) for label in L]

In [11]:
#Init splits_dict: for every fold (numbered 1..k) its size, its member positions and its per-label counts
splits_dict = defaultdict(dict)
for group_num in range(1,k+1):
    splits_dict[group_num]["num"] = 0
    splits_dict[group_num]["positions"] = []
    splits_dict[group_num]["ligands_pos_neg"] = dict.fromkeys(L, 0)


#Work on an independent copy of D. D[:] is only a slice of D, and dropping rows
#from a slice in place is what triggers pandas' SettingWithCopyWarning.
D_left = D.copy()
while (len(D_left) > 0):
    labels_cnt = np.array(calc_labels_numbers(D_left, L))

    if (not labels_cnt.any()):
        #None of the remaining positions carries any label (no score is >= bind_th or == 0),
        #so there is no label to pick and np.min over the empty selection below would raise ValueError.
        #Hand the leftovers out one by one to the fold that still wants the most examples overall.
        for index in D_left.index:
            m_pos = int(np.argmax(C_all))
            group_num = m_pos + 1
            splits_dict[group_num]["positions"].append(index)
            splits_dict[group_num]["num"] += 1
            position_ligands_counts(index, splits_dict[group_num]["ligands_pos_neg"])
            C_all[m_pos] -= 1
        D_left = D_left.drop(D_left.index)
        print("Finished unlabeled positions")
        break

    #Find the label with the fewest (but at least one) remaining examples.
    #Ties are resolved deterministically: the first such label in the order of L is taken
    #(the paper breaks these ties randomly; this notebook does not).
    min_label_pos = np.where(labels_cnt == np.min(labels_cnt[np.nonzero(labels_cnt)]))[0]
    min_label = L[min_label_pos[0]]

    #Select the remaining examples of min label (boolean indexing returns a new frame,
    #so it is unaffected by the changes made to the fold bookkeeping below)
    (ligand, pos_neg) = min_label.split("_")
    ligand_str = ligand+"_binding_score"
    if (pos_neg == "pos"):
        D_left_min_label = D_left[D_left[ligand_str] >= bind_th]
    else:
        D_left_min_label = D_left[D_left[ligand_str] == 0]

    for index, row in D_left_min_label.iterrows():
        #Find the subset(s) with the largest number of desired examples for this label,
        #breaking ties by considering the largest number of desired examples overall,
        #breaking further ties randomly
        label_demand = np.asarray(C_labels[min_label])
        M_idx = np.where(label_demand == np.max(label_demand))[0]
        if (len(M_idx) == 1):
            m_pos = M_idx[0] #There is only one subset that needs this label the most
        else:
            #Amongst the M_idx candidates, look for the one with the highest total number of desired examples
            M_C_total = np.asarray([C_all[j] for j in M_idx])
            M_C_idx = np.where(M_C_total == np.max(M_C_total))[0]
            if (len(M_C_idx) == 1):
                m_pos = M_idx[M_C_idx[0]] #There is only one subset that needs more examples the most
            else:
                #A plain list keeps random.choice working on every Python version
                #(it tests the sequence's truth value on some versions, which fails for arrays)
                m_pos = M_idx[random.choice(M_C_idx.tolist())]

        #Add the position to the chosen subset
        group_num = m_pos +1
        splits_dict[group_num]["positions"].append(index)
        splits_dict[group_num]["num"] += 1
        position_ligands_counts(index,  splits_dict[group_num]["ligands_pos_neg"])

        #Update desired number of examples for each label of this position
        #(the row is turned into a one-row frame so calc_labels_numbers returns 1 for every label it carries)
        labels_cnts = calc_labels_numbers(row.to_frame().T, L)
        labels_included_pos = np.where(np.array(labels_cnts) == 1)[0]
        for label_pos in labels_included_pos:
            label = L[label_pos]
            C_labels[label][m_pos] -= 1

        #Update desired number of examples for this subset
        C_all[m_pos] -= 1

    #Remove all the positions handled for this label from the table in one go
    #(a single drop instead of one in-place drop per row, which was quadratic)
    D_left = D_left.drop(D_left_min_label.index)
    print("Finished label "+min_label)

Finished label dnabase_pos
Finished label rnabase_pos
Finished label rnabackbone_pos
Finished label rna_pos
Finished label dnabackbone_pos
Finished label dna_pos
Finished label ion_pos
Finished label metabolite_pos
Finished label peptide_pos
Finished label metabolite_neg
Finished label ion_neg
Finished label peptide_neg
Finished label dna_neg
Finished label dnabase_neg
Finished label dnabackbone_neg
Finished label rna_neg


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Count the number of positives and negatives in each group

In [17]:
#Label names: every "<ligand>_pos" first, then every "<ligand>_neg"
stats_keys = [l+suffix for suffix in ("_pos", "_neg") for l in ligands]

#Per-label counts gathered for each fold during the split, keyed by fold number;
#in the resulting table every fold is a column and every label a row
group_stats_dict = {group_num: splits_dict[group_num]["ligands_pos_neg"] for group_num in range(1,k+1)}
group_stats_df = pd.DataFrame.from_dict(group_stats_dict)
group_stats_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
dna_neg,3806,3835,3806,3806,3805,3806,3807,3806,3806,3812
dna_pos,50,50,50,50,50,49,50,52,51,49
dnabackbone_neg,3820,3848,3816,3816,3814,3812,3814,3821,3818,3824
dnabackbone_pos,41,41,41,40,41,41,41,40,41,41
dnabase_neg,3857,3879,3848,3848,3863,3855,3857,3853,3856,3861
dnabase_pos,20,19,19,20,20,19,19,19,19,19
ion_neg,3448,3449,3449,3449,3448,3449,3449,3449,3449,3449
ion_pos,110,109,110,109,110,109,109,109,109,109
metabolite_neg,3397,3397,3397,3397,3397,3397,3397,3397,3398,3397
metabolite_pos,152,153,153,152,153,153,152,153,152,152


#### Saving the 10 folds selection

In [18]:
#Persist the fold assignment (fold number -> "num", "positions", "ligands_pos_neg") next to the notebook.
#splits_dict is a defaultdict(dict), so the same type comes back on load.
#NOTE: HIGHEST_PROTOCOL is the newest pickle format of the running interpreter; older Python versions may not be able to read the file
with open(curr_dir[0]+'/iterative_10_splits_dict.pik', 'wb') as handle:
    pickle.dump(splits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
#Save the per-fold label counts as a tab-separated table next to the notebook
group_stats_df.to_csv(curr_dir[0]+"/iterative_group_stats_df.csv", sep='\t')