In [25]:
from __future__ import division, absolute_import

import sys
import os
import numpy as np
import random
import pickle
import h5py
import time

import pandas as pd
from imblearn import over_sampling 
from plotnine import *
from tables import *

from collections import Counter
from sklearn import preprocessing
from sklearn.cluster import MiniBatchKMeans

#root
absPath = '/home/angela3/imbalance_pcm_benchmark/'
sys.path.insert(0, absPath)

from src.imbalance_functions import *

np.random.seed(8)
random.seed(8)

In [26]:
protein_type = "GPCRs" #"kinases"

In [27]:
#Loading list of unique proteins 
with open("".join((absPath, "data/", protein_type, "/", protein_type, "_prots.pickle")), "rb") as handle:
    unique_prots = pickle.load(handle)

In [28]:
unique_prots

['UYMO',
 '0BJE',
 '78TA',
 'RJYX',
 '95E2',
 'P59Q',
 'XY2A',
 '3Y9A',
 'J6QF',
 'I8AL',
 'K1O5',
 'Q6RX',
 'L3MG',
 '5W5R',
 '3KD1',
 'MUMW',
 'SH36',
 '2CH1',
 'TOAR',
 'F5KQ',
 'N0SR',
 '230E',
 'OYY9',
 '7WTN',
 '8VB1',
 'CVFR',
 'LETU',
 'LFAB',
 'C0XK',
 'DFO0',
 '1ZXI',
 'SXV6',
 'DPHN',
 'ZALS',
 'Y9QF',
 'YO6L',
 '3Y7S',
 '5GL1',
 'NXHT',
 'JVOZ',
 '4C54',
 '06WU',
 '662N',
 'ES23',
 'KPOO',
 'DFF0',
 'CFPC',
 'SPT7',
 'HSYO',
 'FESF',
 'FFXN',
 '1AZF',
 '5N4X',
 'G8ER',
 'SG15',
 '2SAM',
 '8PJC',
 'TBTI',
 '4BF3',
 'DCP7',
 'XTZC',
 'VX88',
 'V31H',
 '8ZJZ',
 'OG3W',
 '33X9',
 '69Q1',
 'SVAL',
 'OBT5',
 '3QL2',
 '7V1X',
 '1MWP',
 'IAKH',
 'DU8D',
 'FFCA',
 '4SAW',
 'ZG4W',
 '9FAD',
 'K90C',
 'CJWT',
 'SJGF',
 'LTX5',
 'V27B',
 'QZ2G',
 'RPPN',
 'C0L6',
 '5K6Y',
 'IE5U',
 'YVNY',
 '3WO1',
 'S982',
 'YT4M',
 'DLLJ',
 'MAYC',
 '79FS',
 'BDHY',
 'SKZW',
 '8L6X',
 'OTEJ',
 '5T8S',
 'KOPG',
 '3DQ7',
 'H5E4',
 '5NBB',
 '00QN',
 'JB1G',
 'WP5M',
 'L0SK',
 'JC99',
 '7ODP',
 'MQ2W',
 

In [29]:
# Loading kinase activities
activity_file = "".join((absPath, "data/", protein_type, "_activity.csv"))
activity_df = pd.read_csv(activity_file, sep="\t")
print(activity_df.info())
print(activity_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200523 entries, 0 to 200522
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                200523 non-null  int64  
 1   DeepAffinity Protein ID   200523 non-null  object 
 2   Uniprot ID                200523 non-null  object 
 3   DeepAffinity Compound ID  200523 non-null  object 
 4   CID                       200523 non-null  int64  
 5   activity                  200523 non-null  float64
 6   label                     200523 non-null  float64
 7   Canonical SMILE           200523 non-null  object 
 8   Sequence                  200523 non-null  object 
 9   family                    200523 non-null  object 
dtypes: float64(2), int64(2), object(6)
memory usage: 15.3+ MB
None
   Unnamed: 0 DeepAffinity Protein ID Uniprot ID DeepAffinity Compound ID  \
0          32                    UYMO     Q92847                     4vb

In [30]:
#loading fingeprints
file_fps = "".join((absPath, "raw_data/dcid_fingerprint.tsv"))
fps_df = pd.read_csv(file_fps, sep="\t")
fps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598206 entries, 0 to 598205
Data columns (total 2 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   DeepAffinity Compound ID  598206 non-null  object
 1   Fingerprint Feature       598206 non-null  object
dtypes: object(2)
memory usage: 9.1+ MB


In [31]:
#Keeping only data from valid proteins
sub_df = activity_df[activity_df["DeepAffinity Protein ID"].isin(unique_prots)]
sub_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200523 entries, 0 to 200522
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                200523 non-null  int64  
 1   DeepAffinity Protein ID   200523 non-null  object 
 2   Uniprot ID                200523 non-null  object 
 3   DeepAffinity Compound ID  200523 non-null  object 
 4   CID                       200523 non-null  int64  
 5   activity                  200523 non-null  float64
 6   label                     200523 non-null  float64
 7   Canonical SMILE           200523 non-null  object 
 8   Sequence                  200523 non-null  object 
 9   family                    200523 non-null  object 
dtypes: float64(2), int64(2), object(6)
memory usage: 16.8+ MB


In [32]:
activity_mini = pd.merge(sub_df, fps_df, "left", on=["DeepAffinity Compound ID"])
activity_mini.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200523 entries, 0 to 200522
Data columns (total 11 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                200523 non-null  int64  
 1   DeepAffinity Protein ID   200523 non-null  object 
 2   Uniprot ID                200523 non-null  object 
 3   DeepAffinity Compound ID  200523 non-null  object 
 4   CID                       200523 non-null  int64  
 5   activity                  200523 non-null  float64
 6   label                     200523 non-null  float64
 7   Canonical SMILE           200523 non-null  object 
 8   Sequence                  200523 non-null  object 
 9   family                    200523 non-null  object 
 10  Fingerprint Feature       200523 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 18.4+ MB


In [33]:
activity_mini["separated_fps"] = activity_mini["Fingerprint Feature"].apply(separating_fps)

In [34]:
X = pd.DataFrame.from_dict(dict(zip(activity_mini["separated_fps"].index, activity_mini["separated_fps"].values))).transpose()
X["Y"] = activity_mini["label"].values
X["prot"] = activity_mini["DeepAffinity Protein ID"].values
X["comp_ID"] = activity_mini["DeepAffinity Compound ID"].values
X["Sequence"] = activity_mini["Sequence"].values
X["family"] = activity_mini["family"].values
X["Canonical SMILE"] = activity_mini["Canonical SMILE"].values

X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200523 entries, 0 to 200522
Columns: 887 entries, 0 to Canonical SMILE
dtypes: float64(1), int64(881), object(5)
memory usage: 1.3+ GB


In [35]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,877,878,879,880,Y,prot,comp_ID,Sequence,family,Canonical SMILE
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1.0,UYMO,4vb9,MWNATPSEEPGFNLTLADLDWDASPGNDSLGDELLQLFPAPLLAGV...,GPCR,CC(C)NC(=O)NC1=CC=C(C=C1)C2=C(N=C(N=C2N)N)COCC...
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1.0,UYMO,samf,MWNATPSEEPGFNLTLADLDWDASPGNDSLGDELLQLFPAPLLAGV...,GPCR,CC(C)(C)NC(=O)NC1=CC=C(C=C1)C2=C(N=C(N=C2N)N)C...
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1.0,UYMO,fckm,MWNATPSEEPGFNLTLADLDWDASPGNDSLGDELLQLFPAPLLAGV...,GPCR,C1=CC=C(C=C1)COCC2=C(C(=NC(=N2)N)N)C3=CC=C(C=C...
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1.0,UYMO,e68w,MWNATPSEEPGFNLTLADLDWDASPGNDSLGDELLQLFPAPLLAGV...,GPCR,C1=CC=C(C=C1)CNC(=O)NC2=CC=C(C=C2)C3=C(N=C(N=C...
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,1.0,UYMO,ob5k,MWNATPSEEPGFNLTLADLDWDASPGNDSLGDELLQLFPAPLLAGV...,GPCR,CC(C1=CC=CC=C1)NC(=O)NC2=CC=C(C=C2)C3=C(N=C(N=...


In [36]:
# for each protein, save a sub-dataframe with data
if not os.path.exists("".join((absPath, "data/", protein_type, "/no_resampling/"))):
    os.makedirs("".join((absPath, "data/", protein_type, "/no_resampling/")))

In [37]:
X.to_csv("".join((absPath, "data/", protein_type, "/no_resampling/activity_fps.csv")))

## Clustering 

In [38]:
nclusters = 100
batch_size = 1000


#keeping only minimum info for clustering
only_comps = X.drop(["prot", "Y", "Sequence", "Canonical SMILE", "family"], axis=1)
only_comps.drop_duplicates("comp_ID", inplace=True)
only_comps.reset_index(inplace=True)

sample_indices = np.arange(0, only_comps.shape[0])
sample_indices = np.random.permutation(sample_indices)

only_comps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120906 entries, 0 to 120905
Columns: 883 entries, index to comp_ID
dtypes: int64(882), object(1)
memory usage: 814.5+ MB


In [39]:
generate_batches = batch_generator(batch_size, only_comps, sample_indices)

In [40]:
#K-Means
model = MiniBatchKMeans(n_clusters=nclusters, init='k-means++', compute_labels=True)
sse = {}
labels = []
comp_ids = []
clusters_centers = {}
for i, batch in enumerate(generate_batches):
    print("Iteration ", i)
    t0 = time.time()
    df = pd.DataFrame(batch)
    print(df["comp_ID"])
    comp_ids.append(df["comp_ID"])
    to_array = df.drop(['comp_ID'], axis=1).values
    to_array = preprocessing.scale(to_array)
    
    model.partial_fit(to_array)
    print("The inertia for the batch %s is %s" % (i, model.inertia_))
    t_mini_batch = time.time() - t0
    print(t_mini_batch)
    sse[i] = model.inertia_
    labels.append(model.labels_)
    clusters_centers[i] = model.cluster_centers_

Iteration  0
0      xmss
1      ap7z
2      b5cs
3      kgmm
4      vf28
       ... 
995    mqvo
996    712a
997    pbl0
998    meqn
999    j9qn
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 0 is 295521.43132489
0.5125100612640381
Iteration  1
0      axmo
1      d8ed
2      itll
3      llhk
4      fxj8
       ... 
995    1dz8
996    a4yx
997    65ft
998    owjn
999    c33c
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 1 is 360675.512152438
0.3087923526763916
Iteration  2
0      ifql
1      3t9z
2      voxt
3      01hx
4      4sjs
       ... 
995    v3nw
996    08tt
997    n1lx
998    l86z
999    isaq
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 2 is 350328.18632749066
0.3049049377441406
Iteration  3
0      z9o3
1      o208
2      uhb2
3      6qfi
4      jc9s
       ... 
995    r25f
996    qr1v
997    vugk
998    n2cz
999    wd70
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 3 is 363578.57598657

Iteration  32
0      9x1n
1      5phd
2      pvq8
3      n8ke
4      j1ks
       ... 
995    z1we
996    xgz7
997    yww4
998    vdf3
999    htfm
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 32 is 359786.43078798114
0.31453776359558105
Iteration  33
0      r120
1      1vq1
2      3bce
3      hjy1
4      x4oa
       ... 
995    ys1p
996    ryly
997    7xlt
998    rd0c
999    k1no
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 33 is 362066.0687319616
0.30861330032348633
Iteration  34
0      qtl7
1      3elu
2      1w2a
3      ezbn
4      94ov
       ... 
995    sesi
996    fp6u
997    gz5e
998    ar2p
999    6s0m
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 34 is 374140.8394717783
0.3129885196685791
Iteration  35
0      3c9d
1      06uo
2      12lt
3      4em3
4      zr1t
       ... 
995    imsk
996    ldok
997    7ufp
998    ucoi
999    ql4b
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 35 is 36

Iteration  64
0      wadv
1      i8hm
2      ost6
3      2wtx
4      g3o2
       ... 
995    3lgh
996    5e78
997    l288
998    4dya
999    2hcj
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 64 is 357998.4110890419
0.31764745712280273
Iteration  65
0      6mi2
1      mfk8
2      kimz
3      jetm
4      j5rn
       ... 
995    8wjj
996    gs0b
997    4axx
998    ge3g
999    z7cg
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 65 is 343976.4741722671
0.33709287643432617
Iteration  66
0      eodk
1      amge
2      6gtm
3      sq62
4      xlvj
       ... 
995    cubv
996    1ju2
997    1pz4
998    q862
999    vyk0
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 66 is 367277.0821910453
0.3181946277618408
Iteration  67
0      lt36
1      kypd
2      fyr7
3      vizx
4      mzwj
       ... 
995    snqz
996    w8dl
997    trzd
998    6sj4
999    j2k6
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 67 is 380

Iteration  96
0      hj18
1      3o5v
2      kdo5
3      g091
4      g2ku
       ... 
995    1wg3
996    gm5v
997    h9qn
998    ybuj
999    g2t2
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 96 is 356267.1341399097
0.3507254123687744
Iteration  97
0      4mg1
1      2zi9
2      t97v
3      dhmw
4      ihm3
       ... 
995    ttrf
996    8ulc
997    lge7
998    cbi1
999    1pwk
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 97 is 347423.0186637239
0.3262443542480469
Iteration  98
0      h1ep
1      ppmb
2      svs8
3      msjy
4      v1go
       ... 
995    n2ak
996    gbqi
997    qkd5
998    pgqf
999    l1qc
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 98 is 374864.8040639438
0.3289158344268799
Iteration  99
0      sor9
1      xucw
2      ippl
3      be2l
4      kign
       ... 
995    wewk
996    gwa2
997    ltwl
998    2jhn
999    qkz2
Name: comp_ID, Length: 1000, dtype: object
The inertia for the batch 99 is 35337

In [41]:
labels_array = np.hstack(labels)
comp_ids_list = np.hstack(comp_ids)

#Joining compound IDs and cluster labels
compound_clusters = pd.DataFrame({'comp_ID':comp_ids_list,
                                  'cluster_label':labels_array})

compound_clusters.to_csv("".join((absPath, "data/", protein_type, "/no_resampling/compound_clusters.csv")), header=True)

In [42]:
activity_with_clusters = pd.merge(X, compound_clusters, on=["comp_ID"], how="left")
activity_with_clusters.rename(columns={"prot": "DeepAffinity Protein ID"}, inplace=True)
activity_with_clusters.to_csv("".join((absPath, "data/", protein_type, "/no_resampling/activity_clusters.csv")), 
                              sep="\t", header=True)
activity_with_clusters.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200523 entries, 0 to 200522
Columns: 888 entries, 0 to cluster_label
dtypes: float64(1), int32(1), int64(881), object(5)
memory usage: 1.3+ GB


## Splitting training/test

In [43]:
nfolds = 10

In [44]:
print(Counter(activity_with_clusters.Y))

Counter({1.0: 153993, 0.0: 46530})


In [45]:
#we create nfolds differents splitting partitions
for i in range(nfolds):
    compounds_classif = accumulated_size_clusters(activity_with_clusters)
    compounds_classif = training_test_split(compounds_classif, 80, 10, 10, i)
    #Joining smiles-label dataframe with label information dataframe
    name_column = "splitting_" + str(i)
    activity_with_clusters = pd.merge(compounds_classif.loc[:, ["cluster_label", name_column]], 
                                 activity_with_clusters, on="cluster_label")

In [46]:
#Checking that all the partitions are more or less the same size
for i in range(nfolds):
    name_column = "splitting_" + str(i)
    print(activity_with_clusters[name_column].value_counts())
    
activity_with_clusters.info()

0    156115
1     23273
2     21135
Name: splitting_0, dtype: int64
0    158865
2     21448
1     20210
Name: splitting_1, dtype: int64
0    159976
2     20563
1     19984
Name: splitting_2, dtype: int64
0    159583
2     21192
1     19748
Name: splitting_3, dtype: int64
0    155919
1     23927
2     20677
Name: splitting_4, dtype: int64
0    157550
1     22305
2     20668
Name: splitting_5, dtype: int64
0    158976
1     21342
2     20205
Name: splitting_6, dtype: int64
0    158164
1     22256
2     20103
Name: splitting_7, dtype: int64
0    159604
2     22454
1     18465
Name: splitting_8, dtype: int64
0    158507
2     21184
1     20832
Name: splitting_9, dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 200523 entries, 0 to 200522
Columns: 898 entries, cluster_label to Canonical SMILE
dtypes: float64(1), int64(892), object(5)
memory usage: 1.3+ GB


In [47]:
#Now we should check number of actives/inactives per splitting fold
count_list = []
for i in range(nfolds):
    name_column = "splitting_" + str(i)
    count_split = activity_with_clusters.loc[:,[name_column, "Y"]].groupby([name_column, 
                                          "Y"]).size().unstack(fill_value=0)
    count_split_df = pd.DataFrame(count_split)
    count_split_df_melt = pd.melt(count_split_df.reset_index(), id_vars = name_column, value_vars=[0.0,1.0])
    count_split_df_melt["splitting_fold"] = name_column
    count_split_df_melt = count_split_df_melt.rename(columns = {name_column:"split_set"})
    print(count_split)
    count_list.append(count_split_df_melt)

Y              0.0     1.0
splitting_0               
0            35728  120387
1             5937   17336
2             4865   16270
Y              0.0     1.0
splitting_1               
0            37046  121819
1             4524   15686
2             4960   16488
Y              0.0     1.0
splitting_2               
0            37079  122897
1             4369   15615
2             5082   15481
Y              0.0     1.0
splitting_3               
0            37093  122490
1             4407   15341
2             5030   16162
Y              0.0     1.0
splitting_4               
0            36512  119407
1             5789   18138
2             4229   16448
Y              0.0     1.0
splitting_5               
0            35915  121635
1             5594   16711
2             5021   15647
Y              0.0     1.0
splitting_6               
0            37525  121451
1             4946   16396
2             4059   16146
Y              0.0     1.0
splitting_7               
0

In [48]:
#saving data into a HDF5
#Defining HDF5 table-type for storing data
class Protein_Compound_Complex(IsDescription):
    #CID = UInt16Col()
    da_comp_id = StringCol(4) 
    da_prot_id = StringCol(4)
    uniprot_id = StringCol(6)
    #activity = Float16Col()
    label = UInt16Col()
    #canonical_smiles = StringCol(100)
    sequence = StringCol(2000)
    prot_family = StringCol(5)
    comp_cluster = UInt16Col()
    splitting_0 = UInt8Col()
    splitting_1 = UInt8Col()
    splitting_2 = UInt8Col()
    splitting_3 = UInt8Col()
    splitting_4 = UInt8Col()
    splitting_5 = UInt8Col()
    splitting_6 = UInt8Col()
    splitting_7 = UInt8Col()
    splitting_8 = UInt8Col()
    splitting_9 = UInt8Col()
    fingerprint = StringCol(900)

In [49]:
activity_with_clusters.head()

Unnamed: 0,cluster_label,splitting_9,splitting_8,splitting_7,splitting_6,splitting_5,splitting_4,splitting_3,splitting_2,splitting_1,...,877,878,879,880,Y,DeepAffinity Protein ID,comp_ID,Sequence,family,Canonical SMILE
0,4,0,0,2,0,0,2,0,0,0,...,0,0,0,0,1.0,0BJE,z4jn,MTPNSTGEVPSPIPKGALGLSLALASLIITANLLLALGIAWDRRLR...,GPCR,CN1CCN(CC1)C2=NC(=NC(=C2)N3CCCC(C3)C(=O)NCCC4=...
1,4,0,0,2,0,0,2,0,0,0,...,0,0,0,0,1.0,0BJE,6jwr,MTPNSTGEVPSPIPKGALGLSLALASLIITANLLLALGIAWDRRLR...,GPCR,C1CC(CN(C1)C2=CC(=NC(=N2)C(F)(F)F)N3CCNCC3)C(=...
2,4,0,0,2,0,0,2,0,0,0,...,0,0,0,0,1.0,0BJE,2sl7,MTPNSTGEVPSPIPKGALGLSLALASLIITANLLLALGIAWDRRLR...,GPCR,CCN1CCN(CC1)C2=NC(=NC(=C2)N3CCCC(C3)C(=O)NCCC4...
3,4,0,0,2,0,0,2,0,0,0,...,0,0,0,0,1.0,78TA,z4jn,MMTPNSTELSAIPMGVLGLSLALASLIVIANLLLALGIALDRHLRS...,GPCR,CN1CCN(CC1)C2=NC(=NC(=C2)N3CCCC(C3)C(=O)NCCC4=...
4,4,0,0,2,0,0,2,0,0,0,...,0,0,0,0,1.0,78TA,6jwr,MMTPNSTELSAIPMGVLGLSLALASLIVIANLLLALGIALDRHLRS...,GPCR,C1CC(CN(C1)C2=CC(=NC(=N2)C(F)(F)F)N3CCNCC3)C(=...


In [50]:
activity_with_clusters.rename(columns={"prot": "DeepAffinity Protein ID"}, inplace=True)

In [51]:
activity_with_clusters["fingerprint"] = activity_with_clusters[[i for i in range(881)]].apply(lambda row: "".join(row.values.astype(str)), axis=1)

In [52]:
#open a HDF5 file with write options
file_h5 = open_file("".join((absPath, "data/", protein_type, "/no_resampling/compounds_activity.h5")), "w")
root = file_h5.root
group = file_h5.create_group(root, "activity")
table = file_h5.create_table('/activity', "prot_comp", Protein_Compound_Complex) 
pair = table.row

for index,row in activity_with_clusters.iterrows():
    #pair["CID"] = row["CID"]
    pair["da_comp_id"] = row["comp_ID"]
    pair["da_prot_id"] = row["DeepAffinity Protein ID"]
    #pair["activity"] = row["activity"]
    pair["label"] = row["Y"]
    #pair["canonical_smiles"] = row["Canonical SMILE"]
    pair["sequence"] = row["Sequence"]
    pair["prot_family"] = row["family"]
    pair["comp_cluster"] = row["cluster_label"]
    pair["fingerprint"] = row["fingerprint"]
    for i in range(nfolds):
        name_col = "splitting_" + str(i)
        pair[name_col] = row[name_col]
    pair.append()
table.flush()
file_h5.close()

In [53]:
#creating folder to storage splitting lists if it does not exist
if not os.path.exists("".join((absPath, "data/", protein_type, "/no_resampling/splitting_lists/"))):
    os.makedirs("".join((absPath, "data/", protein_type, "/no_resampling/splitting_lists/")))

In [54]:
#Opening HDF5 with data
file_h5 = "".join((absPath, "data/", protein_type, "/no_resampling/compounds_activity.h5"))
f = h5py.File(file_h5, 'r')
group = '/activity'
table = "prot_comp"
n_samples = len(f[group][table])
sample_indices = np.arange(0, n_samples)
sample_indices = np.random.permutation(sample_indices)

In [55]:
for i in range(nfolds):
    column_name = "splitting_" + str(i)
    training_list, validation_list, test_list = splitting_division(f, group, 
                                                                   table, 
                                                                   sample_indices, 
                                                                   column_name)
    
    pickle_filename = "".join((absPath, "data/", protein_type, "/no_resampling/splitting_lists/", column_name, "_list.pickle"))
    with open(pickle_filename, "wb") as handle:
        pickle.dump((training_list, validation_list, test_list), handle)

293.9797399044037
290.4398682117462
290.97201466560364
290.3173522949219
291.7607741355896
292.3066077232361
290.6400218009949
293.42017245292664
291.618501663208
314.804456949234
