In [1]:
#Basic imports
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict
import random
import sys
from sklearn import utils

#Import utils functions
sys.path.append('/home/anat/Research/ExAC/10.Prediction/utils')
from neg_pos_funcs import create_negatives_datasets, create_positives_datasets

from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

### Reading the input dataset

In [2]:
curr_dir = !pwd
input_path = curr_dir[0]+"/../domains_similarity/filtered_features_table/"
filename = "positions_features_mediode_filter_01.25.18.csv"

bind_scores_num = 10

#Features table
features_all = pd.read_csv(input_path+filename, sep='\t', index_col=0)
features_cols = features_all.columns[1:-bind_scores_num] #removing binding scores and domain name
ligands = ["dna", "dnabase", "dnabackbone", "rna", "rnabase", "rnabackbone", "peptide", "ion", "metabolite"]
print "all samples positions #: "+str(features_all.shape[0])

#lignd binding domains dictionary
with open(curr_dir[0]+"/../ligands_negatives_domains_dict.pik", 'rb') as handle:
        domains_bind_dict = pickle.load(handle)

random.seed(0)
np.random.seed(0)
ABSOLUTE_NEGATIVES = False
FILTER_DOMAIN = False
FILTER_MAX_SCORE_ZERO = False

all samples positions #: 38944


## Implementing "Iterative stratification"
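from the paper: Sechidis, Tsoumakas & Vlahavas, "On the Stratification of Multi-Label Data", ECML PKDD 2011. In this notebook the unit assigned to a subset is a whole domain (all of its positions) rather than a single example.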

##### Algorithm input

In [3]:
#Labels columns names: one "<ligand>_binding_score" column per ligand
labels_columns = [ligand+"_binding_score" for ligand in ligands]

#D: the table of label (binding score) columns for all positions
D = features_all.loc[:,labels_columns]
D = utils.shuffle(D, random_state=0) #Constant shuffle to ensure randomness

#L: the label set - a "_pos" and a "_neg" label for every ligand
L = [l+"_pos" for l in ligands]
L.extend([l+"_neg" for l in ligands])
#k: number of subsets to split into
k = 3
#R: desired proportion of examples in each subset. The proportions have to sum to 1,
#so each of the k equally-sized subsets gets 1/k (a constant 0.1 is only right for k = 10)
R = [1.0/k for i in range(k)]

##### 1) Index the domains of each label, with the number of positions each domain contributes to that label
##### 2) Index the length (number of positions) of each domain

In [4]:
domains_list = features_all["domain_name"].unique().tolist()
#label -> {domain: number of positions of that domain carrying the label}
domains_labels_dict = {label: {} for label in L}
#domain -> number of positions (rows) of the domain
domain_len_dict = {}

for domain in domains_list:
    curr_domain_table = features_all[features_all["domain_name"] == domain]
    domain_len_dict[domain] = curr_domain_table.shape[0]

    for ligand in ligands:
        ligand_str = ligand+"_binding_score"
        #Positive: score >= 0.1, negative: score exactly 0 (scores in between get neither label)
        pos_cnt = np.count_nonzero(curr_domain_table[ligand_str] >= 0.1)
        neg_cnt = np.count_nonzero(curr_domain_table[ligand_str] == 0)
        #Only domains that actually carry a label are indexed under it
        if (pos_cnt > 0):
            domains_labels_dict[ligand+"_pos"][domain] = pos_cnt
        if (neg_cnt > 0):
            domains_labels_dict[ligand+"_neg"][domain] = neg_cnt

##### Calculating the desired number of examples, cj, at each subset Sj

In [5]:
#Desired number of examples for every subset: total number of positions times the subset proportion
C_all = [D.shape[0]*R[j] for j in range(k)]

##### Calculating the desired number of examples of each label at each subset

In [6]:
#Binding-score threshold handed to create_positives_datasets
bind_th = 0.1
#Per-ligand negatives tables (indexed by ligand name further below)
ligands_negatives_df = create_negatives_datasets(
    FILTER_DOMAIN, ABSOLUTE_NEGATIVES, FILTER_MAX_SCORE_ZERO, features_all, features_cols)
#Per-ligand positives tables (indexed by ligand name further below)
ligands_features_df = create_positives_datasets(bind_th, features_all, features_cols)

dna non-binding #:38095
dnabase non-binding #:38577
dnabackbone non-binding #:38203
rna non-binding #:38047
rnabase non-binding #:38407
rnabackbone non-binding #:38223
peptide non-binding #:35437
ion non-binding #:34488
metabolite non-binding #:33971
all_ligands non-binding #:27191
dna #: 501
dnabase #: 193
dnabackbone #: 408
rna #: 433
rnabase #: 224
rnabackbone #: 308
peptide #: 1496
ion #: 1093
metabolite #: 1525


In [7]:
#label -> list with the desired number of examples of that label in each of the k subsets
C_labels = {}
for label in L:
    #Number of examples of this label in the initial set
    ligand, pos_neg = label.split("_")
    label_source = ligands_features_df if pos_neg == "pos" else ligands_negatives_df
    D_i = label_source[ligand].shape[0]
    #Scale the label total by every subset's proportion
    C_labels[label] = [D_i*R[j] for j in range(k)]

##### Iterative process

In [8]:
def domain_ligands_counts(domain_table, group_stats, ligand_names=None, bind_th=0.1):
    """Add the domain's per-ligand positive and negative position counts to group_stats.

    For every ligand, the number of rows of domain_table whose
    "<ligand>_binding_score" is >= bind_th is added to group_stats["<ligand>_pos"],
    and the number of rows whose score is exactly 0 is added to
    group_stats["<ligand>_neg"]. Rows with a score strictly between 0 and bind_th
    are counted in neither.

    Args:
        domain_table: DataFrame with a "<ligand>_binding_score" column per ligand.
        group_stats: dict of counters keyed by "<ligand>_pos" / "<ligand>_neg";
            updated in place.
        ligand_names: ligands to count; defaults to the notebook-level `ligands` list.
        bind_th: minimal binding score of a positive position.

    Returns:
        None. group_stats is modified in place.
    """
    if ligand_names is None:
        ligand_names = ligands

    for ligand in ligand_names:
        scores = domain_table[ligand+"_binding_score"]
        #Whole-column counts instead of a Python-level loop over the rows
        group_stats[ligand+"_pos"] += int(np.count_nonzero(scores >= bind_th))
        group_stats[ligand+"_neg"] += int(np.count_nonzero(scores == 0))

In [9]:
def calc_labels_numbers(D, L, threshold=None):
    """Count the rows of D that carry each label of L.

    A "<ligand>_pos" label counts the rows whose "<ligand>_binding_score" is
    >= threshold; any other label suffix counts the rows whose score is exactly 0.

    Args:
        D: DataFrame with a "<ligand>_binding_score" column per ligand.
        L: list of label names of the form "<ligand>_pos" / "<ligand>_neg".
        threshold: minimal binding score of a positive row; defaults to the
            notebook-level `bind_th`.

    Returns:
        A list of ints, the count of each label in the order of L.
    """
    if threshold is None:
        threshold = bind_th

    labels_cnt = []

    for label in L:
        #Split on the last underscore only, so ligand names may contain underscores
        (ligand, pos_neg) = label.rsplit("_", 1)
        scores = D[ligand+"_binding_score"]
        if (pos_neg == "pos"):
            labels_cnt.append(int(np.count_nonzero(scores >= threshold)))
        else:
            labels_cnt.append(int(np.count_nonzero(scores == 0)))

    return labels_cnt

In [10]:
%%time
#Init splits_dict
splits_dict = defaultdict(dict)
for group_num in range(1,k+1):
    splits_dict[group_num]["num"] = 0
    splits_dict[group_num]["domains"] = []
    splits_dict[group_num]["positions"] = []
    splits_dict[group_num]["ligands_pos_neg"] = dict.fromkeys(L, 0)

D_left = D[:]
while (len(D_left) > 0):
    labels_cnt = np.array(calc_labels_numbers(D_left, L))
    min_label_pos = np.where(labels_cnt == np.min(labels_cnt[np.nonzero(labels_cnt)]))[0]
    min_label = L[min_label_pos[0]]
    
    #Find a domain with the most min_label positions
    max_domains = [key for key,val in domains_labels_dict[min_label].iteritems() if val == max(domains_labels_dict[min_label].values())]
    if (len(max_domains) == 1):
        curr_domain = max_domains[0]
    else:
        #breaking ties by taking the domain with minimal length (less impact on other labels)
        max_domains_len = [domain_len_dict[x] for x in max_domains]
        curr_domain = max_domains[np.where(max_domains_len == np.max(max_domains_len))[0][0]]
    
    curr_domain_table = features_all[features_all["domain_name"] == curr_domain]
    
    #Find the subset(s) with the largest number of desired examples for this label, 
    #breaking ties by considering the largest number of desired examples, 
    #breaking further ties randomly
    M_idx = np.where(C_labels[min_label] == np.max(C_labels[min_label]))[0]
    if (len(M_idx) == 1):
        m_pos = M_idx[0] #There is only one subset that needs this label the most
    else:
        #Amongst the M_positions, look for the one with highest total number of examples
        M_C_total = []
        for j in M_idx:
            M_C_total.append(C_all[j])
        M_C_idx = np.where(M_C_total == np.max(M_C_total))[0]
        if (len(M_C_idx) == 1):
            m_pos = M_idx[M_C_idx[0]] #There is only one subset that needs more examples the most
        else:
            m_pos = M_idx[random.choice(M_C_idx)] #break ties randomly
    
    #Add the domain to the chosen subset
    group_num = m_pos +1
    splits_dict[group_num]["domains"].append(curr_domain)
    splits_dict[group_num]["positions"].extend(curr_domain_table.index.tolist())
    splits_dict[group_num]["num"] += 1
    domain_ligands_counts(curr_domain_table,  splits_dict[group_num]["ligands_pos_neg"])
    
    #Remove the domains positions from the table
    for index, row in curr_domain_table.iterrows():
        D_left.drop(index, inplace=True)
    
    #Update desired number of examples for each label of this position
    labels_cnts = calc_labels_numbers(curr_domain_table,L)
    labels_included_pos = np.where(np.array(labels_cnts) == 1)[0]
    for label_pos in labels_included_pos:
            label = L[label_pos]
            cnt = labels_cnts[label_pos]
            C_labels[label][m_pos] -= cnt
    
    #Update desired number of examples for this subset (deducting domains length)
    C_all[m_pos] -= domain_len_dict[curr_domain]
    
    #Remove domain from the domains_labels_dict
    for label in domains_labels_dict.keys():
        if (curr_domain in domains_labels_dict[label].keys()):
            del domains_labels_dict[label][curr_domain]
    
    print "Finished domain "+curr_domain+" for label "+min_label

Finished domain RnaseA for label dnabase_pos
Finished domain KH_1 for label dnabase_pos
Finished domain CSD for label dnabase_pos
Finished domain HMG_box for label dnabase_pos
Finished domain Trypsin for label dnabase_pos
Finished domain Metallophos for label dnabase_pos
Finished domain RNase_T for label dnabase_pos
Finished domain RRM_1 for label dnabase_pos
Finished domain T-box for label dnabase_pos
Finished domain BEN for label dnabase_pos
Finished domain GATA for label dnabase_pos
Finished domain MH1 for label dnabase_pos
Finished domain HSF_DNA-bind for label dnabase_pos
Finished domain bZIP_1 for label dnabase_pos
Finished domain Homeobox for label dnabase_pos
Finished domain HLH for label dnabase_pos
Finished domain Helicase_C for label dnabase_pos
Finished domain Forkhead for label dnabase_pos
Finished domain Ets for label dnabase_pos
Finished domain GTP_EFTU_D2 for label dnabase_pos
Finished domain Pou for label dnabase_pos
Finished domain zf-C4 for label dnabase_pos
Finished

Finished domain HATPase_c for label ion_pos
Finished domain Phospholip_A2_1 for label ion_pos
Finished domain BTB_2 for label ion_pos
Finished domain PAN_1 for label ion_pos
Finished domain Cyt-b5 for label ion_pos
Finished domain CBS for label ion_pos
Finished domain Sushi for label ion_pos
Finished domain ANF_receptor for label ion_pos
Finished domain ECH_1 for label ion_pos
Finished domain E1-E2_ATPase for label ion_pos
Finished domain PARP for label ion_pos
Finished domain Proteasome for label ion_pos
Finished domain Cyclin_N for label ion_pos
Finished domain DAGK_cat for label ion_pos
Finished domain STAS for label ion_pos
Finished domain SRCR for label ion_pos
Finished domain Methyltransf_11 for label ion_pos
Finished domain GST_C for label ion_pos
Finished domain Bromodomain for label ion_pos
Finished domain FCH for label ion_pos
Finished domain Kelch_1 for label ion_pos
Finished domain Hemopexin for label ion_pos
Finished domain TPR_2 for label ion_pos
Finished domain EGF for l

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#### Print domains counts results table

In [11]:
#Number of domains assigned to each group, shown as a one-column table
ligands_cnt_dict = {group_num: group["num"] for group_num, group in splits_dict.items()}
ligands_cnt_df = pd.DataFrame.from_dict(ligands_cnt_dict, orient='index')
ligands_cnt_df

Unnamed: 0,0
1,50
2,58
3,164


### Count the number of positives and negatives in each group

In [None]:
#Inspect which fields are stored for group 1
splits_dict[1].keys()

In [12]:
#Label names: every "<ligand>_pos" followed by every "<ligand>_neg"
stats_keys = [name+suffix for suffix in ("_pos", "_neg") for name in ligands]
group_stats_dict = {}

for group_num in range(1,k+1):
    #Fresh zeroed counters for this group
    group_stats = {key: 0 for key in stats_keys}
    for domain in splits_dict[group_num]["domains"]:
        curr_domain_table = features_all[features_all["domain_name"] == domain]
        for ligand in ligands:
            scores = curr_domain_table[ligand+"_binding_score"]
            #Positive: score >= 0.1, negative: score exactly 0
            group_stats[ligand+"_pos"] += np.count_nonzero(scores >= 0.1)
            group_stats[ligand+"_neg"] += np.count_nonzero(scores == 0)

    group_stats_dict[group_num] = group_stats

#One column per group, one row per label
group_stats_df = pd.DataFrame.from_dict(group_stats_dict)
group_stats_df

Unnamed: 0,1,2,3
dna_neg,6473,5410,26212
dna_pos,163,166,172
dnabackbone_neg,6499,5458,26246
dnabackbone_pos,140,131,137
dnabase_neg,6666,5561,26350
dnabase_pos,60,69,64
ion_neg,6370,5421,22697
ion_pos,60,53,980
metabolite_neg,5845,5516,22610
metabolite_pos,369,43,1113


#### Saving the k-splits selection

In [13]:
#The file name carries the number of groups k, so a run with a different k
#does not overwrite (or get mislabeled as) the 3-splits file
with open(curr_dir[0]+'/iterative_domain_'+str(k)+'_splits_dict.pik', 'wb') as handle:
    pickle.dump(splits_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
#Tab-separated per-group label counts; the file name carries the number of groups k
group_stats_df.to_csv(curr_dir[0]+"/iterative_domain_"+str(k)+"_group_stats_df.csv", sep='\t')

In [21]:
#List the group numbers held in splits_dict
#NOTE(review): the recorded output lists 5 groups although k = 3 above, and the execution
#count is out of sequence - looks like a leftover of an earlier run; re-run on a fresh kernel
splits_dict.keys()

[1, 2, 3, 4, 5]