In [1]:
from __future__ import division, absolute_import

import sys
import os
import numpy as np
import random
import pickle
import h5py
import time

import pandas as pd
from imblearn import over_sampling 
from plotnine import *
from tables import *

from collections import Counter
from sklearn import preprocessing
from sklearn.cluster import MiniBatchKMeans

# Project root. Defaults to the original checkout location but can be
# overridden with the IMBALANCE_PCM_ROOT environment variable so the notebook
# can run on another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, which the
# "".join((absPath, "data/", ...)) path building in later cells relies on.
absPath = os.path.join(
    os.environ.get("IMBALANCE_PCM_ROOT", '/home/angela3/imbalance_pcm_benchmark/'),
    "",
)
# Make the project's `src` package importable.
sys.path.insert(0, absPath)

from src.imbalance_functions import *

# Seed both NumPy's and the stdlib's global RNGs.
np.random.seed(8)
random.seed(8)

In [2]:
# Number of folds iterated over for every protein type.
nfolds = 10
# Protein families; each name is also used as a sub-directory of data/.
protein_types = [
    "kinases",
    "GPCRs",
    "nuclear_receptors",
    "proteases",
]

### no_resampling

In [3]:
# Count the labels found in the training / validation / test split of every
# fold of the "no_resampling" data, producing one record per
# (protein type, fold) pair.
group = '/activity'
table = "prot_comp"
lista_dictos = []
for protein_type in protein_types:
    # Opening HDF5 with data (one file per protein type)
    file_h5 = "".join((absPath, "data/", protein_type,"/no_resampling/compounds_activity.h5"))
    # The context manager closes the HDF5 handle once all folds of this
    # protein type have been read; previously the handle was never closed.
    with h5py.File(file_h5, 'r') as f:
        for fold in range(nfolds):
            print("Fold:", str(fold))
            file_list = "".join((absPath, "data/", protein_type, "/no_resampling/splitting_lists/splitting_",
                                   str(fold), "_list.pickle"))
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load splitting lists produced by this project.
            with open(file_list, "rb") as input_file:
                splitting_list = pickle.load(input_file)

            # Positions 0, 1 and 2 hold the training, validation and test row
            # indices; h5py index-list selection needs them in increasing order.
            splitting_list[0].sort()
            splitting_list[1].sort()
            splitting_list[2].sort()

            train_y = list(f[group][table][splitting_list[0]]["label"])
            val_y = list(f[group][table][splitting_list[1]]["label"])
            test_y = list(f[group][table][splitting_list[2]]["label"])

            dict_fold = {"fold": fold, "protein_type": protein_type, "training": Counter(train_y),
                         "validation":Counter(val_y), "test":Counter(test_y), "strategy": "no_resampling"}
            lista_dictos.append(dict_fold)

# Build the frame once from the accumulated records.
no_resampling_df = pd.DataFrame(lista_dictos)

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9


In [4]:
# Display the per-fold label counts collected for the no_resampling strategy.
no_resampling_df

Unnamed: 0,fold,protein_type,training,validation,test,strategy
0,0,kinases,"{1: 75708, 0: 22985}","{1: 10330, 0: 4014}","{1: 10914, 0: 2986}",no_resampling
1,1,kinases,"{0: 23255, 1: 76022}","{1: 10822, 0: 3605}","{0: 3125, 1: 10108}",no_resampling
2,2,kinases,"{1: 75607, 0: 23501}","{0: 3863, 1: 11249}","{0: 2621, 1: 10096}",no_resampling
3,3,kinases,"{1: 75816, 0: 23463}","{1: 11095, 0: 3478}","{0: 3044, 1: 10041}",no_resampling
4,4,kinases,"{1: 75302, 0: 24323}","{0: 2907, 1: 11664}","{0: 2755, 1: 9986}",no_resampling
5,5,kinases,"{1: 76088, 0: 23639}","{1: 10440, 0: 3434}","{1: 10424, 0: 2912}",no_resampling
6,6,kinases,"{1: 73972, 0: 22668}","{0: 4480, 1: 12307}","{0: 2837, 1: 10673}",no_resampling
7,7,kinases,"{1: 75369, 0: 23881}","{0: 2729, 1: 11988}","{0: 3375, 1: 9595}",no_resampling
8,8,kinases,"{0: 23372, 1: 77239}","{1: 8908, 0: 3554}","{1: 10805, 0: 3059}",no_resampling
9,9,kinases,"{1: 76546, 0: 24022}","{1: 10900, 0: 2223}","{0: 3740, 1: 9506}",no_resampling


### resampling_after_clustering

In [5]:
# Count the labels in the per-fold training / validation / test HDF5 files of
# the "resampling_after_clustering" data, producing one record per
# (protein type, fold) pair.
group = '/activity'
table = "prot_comp"
lista_dictos = []
for protein_type in protein_types:
    for fold in range(nfolds):
        print("Fold:", str(fold))
        #training
        file_train = "".join((absPath, "data/", protein_type, "/resampling_after_clustering/",
                                 str(fold), "/compounds_activity_training.h5"))
        #validation
        file_val = "".join((absPath, "data/", protein_type, "/resampling_after_clustering/",
                                 str(fold), "/compounds_activity_validation.h5"))
        #test
        file_test = "".join((absPath, "data/", protein_type, "/resampling_after_clustering/",
                                 str(fold), "/compounds_activity_test.h5"))

        # Three files are opened for every fold of every protein type; the
        # context managers close them as soon as the labels are read instead
        # of leaking every handle until the kernel exits.
        with h5py.File(file_train, 'r') as f_train, \
                h5py.File(file_val, 'r') as f_val, \
                h5py.File(file_test, 'r') as f_test:
            # Every row of each file belongs to the corresponding split.
            sample_indices_train = range(len(f_train[group][table]))
            sample_indices_val = range(len(f_val[group][table]))
            sample_indices_test = range(len(f_test[group][table]))

            training_y = list(f_train[group][table][sample_indices_train]["label"])
            val_y = list(f_val[group][table][sample_indices_val]["label"])
            test_y = list(f_test[group][table][sample_indices_test]["label"])

        dict_fold = {"fold": fold, "protein_type": protein_type, "training": Counter(training_y),
                     "validation":Counter(val_y), "test":Counter(test_y), "strategy": "resampling_after_clustering"}
        lista_dictos.append(dict_fold)

# Build the frame once from the accumulated records.
resampling_after_clust_df = pd.DataFrame(lista_dictos)

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9


In [6]:
# Display the per-fold label counts for the resampling_after_clustering strategy.
resampling_after_clust_df 

Unnamed: 0,fold,protein_type,training,validation,test,strategy
0,0,kinases,"{1: 77083, 0: 77083}","{0: 9388, 1: 9388}","{0: 9952, 1: 9952}",resampling_after_clustering
1,1,kinases,"{1: 77111, 0: 77111}","{0: 9878, 1: 9878}","{0: 8803, 1: 8803}",resampling_after_clustering
2,2,kinases,"{1: 76982, 0: 76982}","{0: 10327, 1: 10327}","{1: 8673, 0: 8673}",resampling_after_clustering
3,3,kinases,"{1: 76565, 0: 76565}","{0: 9995, 1: 9995}","{0: 8980, 1: 8980}",resampling_after_clustering
4,4,kinases,"{1: 76438, 0: 76438}","{0: 10157, 1: 10157}","{0: 7466, 1: 7466}",resampling_after_clustering
5,5,kinases,"{1: 76953, 0: 76953}","{0: 9081, 1: 9081}","{0: 9175, 1: 9175}",resampling_after_clustering
6,6,kinases,"{1: 74596, 0: 74596}","{1: 11056, 0: 11056}","{0: 9747, 1: 9747}",resampling_after_clustering
7,7,kinases,"{1: 76495, 0: 76495}","{0: 10818, 1: 10818}","{0: 8433, 1: 8433}",resampling_after_clustering
8,8,kinases,"{1: 78383, 0: 78383}","{0: 7984, 1: 7984}","{0: 9803, 1: 9803}",resampling_after_clustering
9,9,kinases,"{1: 78362, 0: 78362}","{0: 9093, 1: 9093}","{1: 8517, 0: 8517}",resampling_after_clustering


### resampling_before_clustering

In [7]:
# Count the labels found in the training / validation / test split of every
# fold of the "resampling_before_clustering" data, producing one record per
# (protein type, fold) pair.
group = '/activity'
table = "prot_comp"
lista_folds = []
for protein_type in protein_types:
    # Opening HDF5 with data (one file per protein type)
    file_h5 = "".join((absPath, "data/", protein_type,"/resampling_before_clustering/compounds_activity.h5"))
    # The context manager closes the HDF5 handle once all folds of this
    # protein type have been read; previously the handle was never closed.
    with h5py.File(file_h5, 'r') as f:
        for fold in range(nfolds):
            print("Fold:", str(fold))
            file_list = "".join((absPath, "data/", protein_type, "/resampling_before_clustering/splitting_lists/splitting_",
                                   str(fold), "_list.pickle"))
            # NOTE(security): pickle.load can execute arbitrary code; only
            # load splitting lists produced by this project.
            with open(file_list, "rb") as input_file:
                splitting_list = pickle.load(input_file)

            # Positions 0, 1 and 2 hold the training, validation and test row
            # indices; h5py index-list selection needs them in increasing order.
            splitting_list[0].sort()
            splitting_list[1].sort()
            splitting_list[2].sort()

            training_y = list(f[group][table][splitting_list[0]]["label"])
            val_y = list(f[group][table][splitting_list[1]]["label"])
            test_y = list(f[group][table][splitting_list[2]]["label"])

            dict_fold = {"fold": fold, "protein_type": protein_type, "training": Counter(training_y),
                         "validation":Counter(val_y), "test":Counter(test_y), "strategy": "resampling_before_clustering"}
            lista_folds.append(dict_fold)

# Build the frame once from the accumulated records.
resampling_before_clust_df = pd.DataFrame(lista_folds)

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9


In [8]:
# Display the per-fold label counts for the resampling_before_clustering strategy.
resampling_before_clust_df

Unnamed: 0,fold,protein_type,training,validation,test,strategy
0,0,kinases,"{0: 76767, 1: 75507}","{0: 11132, 1: 13086}","{0: 10476, 1: 9782}",resampling_before_clustering
1,1,kinases,"{0: 79550, 1: 75979}","{0: 7624, 1: 12280}","{0: 11201, 1: 10116}",resampling_before_clustering
2,2,kinases,"{0: 78316, 1: 78369}","{0: 10755, 1: 7038}","{0: 9304, 1: 12968}",resampling_before_clustering
3,3,kinases,"{0: 79423, 1: 77456}","{0: 9652, 1: 10177}","{0: 9300, 1: 10742}",resampling_before_clustering
4,4,kinases,"{0: 76709, 1: 80381}","{0: 11296, 1: 8264}","{0: 10370, 1: 9730}",resampling_before_clustering
5,5,kinases,"{0: 76112, 1: 78064}","{0: 11524, 1: 8428}","{0: 10739, 1: 11883}",resampling_before_clustering
6,6,kinases,"{0: 76126, 1: 79680}","{0: 9017, 1: 9573}","{0: 13232, 1: 9122}",resampling_before_clustering
7,7,kinases,"{0: 77338, 1: 78679}","{0: 9537, 1: 10988}","{0: 11500, 1: 8708}",resampling_before_clustering
8,8,kinases,"{0: 77947, 1: 78131}","{0: 12202, 1: 8221}","{0: 8226, 1: 12023}",resampling_before_clustering
9,9,kinases,"{0: 75323, 1: 78773}","{1: 10936, 0: 11819}","{0: 11233, 1: 8666}",resampling_before_clustering


### semi_resampling

In [9]:
# Count the labels per fold of the "semi_resampling" data. Training and
# validation rows are selected from the fold's main HDF5 file through the
# pickled splitting list; the test rows are read from a separate test file.
group = '/activity'
table = "prot_comp"
lista_folds = []
for protein_type in protein_types:

    for fold in range(nfolds):
        print("Fold:", str(fold))

        file_h5 = "".join((absPath, "data/", protein_type, "/semi_resampling/", str(fold),
                                 "/compounds_activity.h5"))
        ##### TEST
        file_test = "".join((absPath, "data/", protein_type, "/semi_resampling/", str(fold),
                                 "/compounds_activity_test.h5"))

        file_list = "".join((absPath, "data/", protein_type, "/semi_resampling/", str(fold),
                         "/splitting_lists/splitting_", str(fold),"_list.pickle"))
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # splitting lists produced by this project.
        with open(file_list, "rb") as input_file:
            splitting_list = pickle.load(input_file)

        # Positions 0 and 1 hold the training and validation row indices;
        # h5py index-list selection needs them in increasing order.
        splitting_list[0].sort()
        splitting_list[1].sort()

        # Two files are opened for every fold; the context managers close
        # them as soon as the labels are read instead of leaking the handles.
        with h5py.File(file_h5, 'r') as f, h5py.File(file_test, 'r') as f_test:
            # Every row of the test file belongs to the test split.
            test_indices = range(len(f_test[group][table]))

            train_y = list(f[group][table][splitting_list[0]]["label"])
            val_y = list(f[group][table][splitting_list[1]]["label"])
            test_y = list(f_test[group][table][test_indices]["label"])

        dict_fold = {"fold": fold, "protein_type": protein_type, "training": Counter(train_y),
                     "validation":Counter(val_y), "test":Counter(test_y), "strategy": "semi_resampling"}
        lista_folds.append(dict_fold)

# Build the frame once from the accumulated records.
semi_resampling_df = pd.DataFrame(lista_folds)

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 5
Fold: 6
Fold: 7
Fold: 8
Fold: 9


In [10]:
# Stack the four per-strategy tables row-wise into a single frame.
# Each input keeps its own row index, so index values repeat across strategies.
frames = [
    no_resampling_df,
    resampling_after_clust_df,
    resampling_before_clust_df,
    semi_resampling_df,
]

resulting_df = pd.concat(frames, axis=0)

In [11]:
# Print a summary (index range, columns, non-null counts, dtypes) of the combined table.
resulting_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160 entries, 0 to 39
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   fold          160 non-null    int64 
 1   protein_type  160 non-null    object
 2   training      160 non-null    object
 3   validation    160 non-null    object
 4   test          160 non-null    object
 5   strategy      160 non-null    object
dtypes: int64(1), object(5)
memory usage: 8.8+ KB


In [12]:
# Display the combined label-count table covering all four strategies.
resulting_df

Unnamed: 0,fold,protein_type,training,validation,test,strategy
0,0,kinases,"{1: 75708, 0: 22985}","{1: 10330, 0: 4014}","{1: 10914, 0: 2986}",no_resampling
1,1,kinases,"{0: 23255, 1: 76022}","{1: 10822, 0: 3605}","{0: 3125, 1: 10108}",no_resampling
2,2,kinases,"{1: 75607, 0: 23501}","{0: 3863, 1: 11249}","{0: 2621, 1: 10096}",no_resampling
3,3,kinases,"{1: 75816, 0: 23463}","{1: 11095, 0: 3478}","{0: 3044, 1: 10041}",no_resampling
4,4,kinases,"{1: 75302, 0: 24323}","{0: 2907, 1: 11664}","{0: 2755, 1: 9986}",no_resampling
...,...,...,...,...,...,...
35,5,proteases,"{0: 51302, 1: 50393}","{0: 5717, 1: 6626}","{0: 3800, 1: 6497}",semi_resampling
36,6,proteases,"{1: 48201, 0: 47830}","{1: 8868, 0: 9239}","{0: 4569, 1: 6259}",semi_resampling
37,7,proteases,"{0: 51374, 1: 52420}","{0: 6611, 1: 5565}","{0: 3366, 1: 6319}",semi_resampling
38,8,proteases,"{1: 51111, 0: 50632}","{1: 5448, 0: 5927}","{0: 3814, 1: 7616}",semi_resampling


In [13]:
# Persist the combined label-count table under the project's data/ directory.
# The row index is written as the first (unnamed) CSV column.
output_csv = absPath + "data/data_count_df.csv"
resulting_df.to_csv(output_csv)