In [1]:
import pandas as pd 
import numpy as np
import pickle
import os

In [2]:
# Show the contents of the current working directory so the available
# CSV / pickle inputs can be inspected before loading anything.
os.listdir('.')

['chembl_less_64_instances.csv',
 'chembl_more_256_instances.csv',
 'chembl_more_64_instances.csv',
 'chembl_less_128_instances.csv',
 'chembl_more_128_instances.csv',
 'chembl_less_256_instances.csv',
 'chembl_less_512_instances.csv',
 'chembl_assay_names.pickle',
 'chembl_assay_type_to_names.pickle',
 'chembl_assay_type_legend.pickle',
 'chembl_assays_skipped_404.pickle',
 'chembl_128_tasks.pickle',
 'chembl_assay_name_to_type.pickle',
 'chembl_128_assay_name_to_type.pickle',
 'chembl_128_assay_type_to_names.pickle',
 'chembl_more_128_instances_902_tasks.csv',
 'chembl_more_512_instances.csv',
 'chembl_less_1024_instances.csv',
 'chembl_more_1024_instances.csv',
 'chembl_less_5000_instances.csv',
 'chembl_more_5000_instances.csv',
 'chembl_less_10000_instances.csv',
 'chembl_more_10000_instances.csv',
 'all_test_gsk_assays.pickle',
 'assays_fewer_than_128_instances.pickle',
 'num_datapoints_per_assay.pickle',
 'test_gsk_assays_more_than_128.pickle',
 'test_gsk_assays_less_than_128.pi

In [3]:
# Load the three pickled inputs for the 1024 dataset variant.
# NOTE: pickle.load can execute arbitrary code -- only use files produced locally.

# Ordered collection of assay identifiers; a task's index is its position in here.
with open('chembl_1024_tasks.pickle', 'rb') as tasks_file:
    tasks = pickle.load(tasks_file)

# Lookup keyed by assay name (presumably name -> assay type; not used further below).
with open('chembl_1024_assay_name_to_type.pickle', 'rb') as name_to_type_file:
    assay_name_to_type = pickle.load(name_to_type_file)

# Lookup from assay type code (e.g. 'B', 'F') to the assay names of that type.
with open('chembl_1024_assay_type_to_names.pickle', 'rb') as type_to_names_file:
    assay_type_to_names = pickle.load(type_to_names_file)

In [6]:
# Report the overall task count, then the number of assays under each assay type.
print("There are {} total tasks".format(len(tasks)))
for assay_type, assay_names in assay_type_to_names.items():
    print("Assay type {} has {} assays".format(assay_type, len(assay_names)))

There are 645 total tasks
Assay type F has 491 assays
Assay type B has 148 assays
Assay type U has 2 assays
Assay type T has 2 assays
Assay type A has 2 assays


In [7]:
# HYPER PARAMS -- num of B and F tasks we want in test and val splits 
NUM_VAL_BF_TASKS = 10
NUM_TEST_BF_TASKS = 10

T_val_idx = NUM_VAL_BF_TASKS 
T_test_idx = NUM_VAL_BF_TASKS + NUM_TEST_BF_TASKS

In [8]:
# Map each assay identifier in `tasks` to its position in that sequence.
chembl_id_to_idx = dict(zip(tasks, range(len(tasks))))

In [9]:
# Randomly shuffle B and F tasks
randomized_B_tasks = np.copy(assay_type_to_names['B'])
np.random.shuffle(randomized_B_tasks)
randomized_B_task_indices = [chembl_id_to_idx[assay] for assay in randomized_B_tasks]

randomized_F_tasks = np.copy(assay_type_to_names['F'])
np.random.shuffle(randomized_F_tasks)
randomized_F_task_indices = [chembl_id_to_idx[assay] for assay in randomized_F_tasks]

In [10]:
# Partition BF tasks among meta train, val, test

# Generate BF val split 
# Partition the shuffled B and F task indices among meta val, test and train.
#
# Slicing past the end of a list silently returns fewer elements, so make sure
# each type has enough tasks to fill the validation and test splits; otherwise
# those splits would come out smaller than requested without any error.
assert len(randomized_B_task_indices) >= T_test_idx, \
    "Not enough B tasks ({}) to fill val + test splits ({})".format(
        len(randomized_B_task_indices), T_test_idx)
assert len(randomized_F_task_indices) >= T_test_idx, \
    "Not enough F tasks ({}) to fill val + test splits ({})".format(
        len(randomized_F_task_indices), T_test_idx)

# Generate BF val split: the first T_val_idx shuffled tasks of each type.
T_val_B_task_indices = randomized_B_task_indices[:T_val_idx]
T_val_F_task_indices = randomized_F_task_indices[:T_val_idx]

# Generate BF test split: the next (T_test_idx - T_val_idx) tasks of each type.
T_test_B_task_indices = randomized_B_task_indices[T_val_idx:T_test_idx]
T_test_F_task_indices = randomized_F_task_indices[T_val_idx:T_test_idx]

# Slot remaining BF tasks into train split.
T_train_B_task_indices = randomized_B_task_indices[T_test_idx:]
T_train_F_task_indices = randomized_F_task_indices[T_test_idx:]

In [11]:
# Slot remaining A, T and U tasks into meta test split
# All A, T and U assays go to the meta test split; look up each assay's task
# index, one list per assay type.
T_test_A_task_indices, T_test_T_task_indices, T_test_U_task_indices = (
    [chembl_id_to_idx[assay_name] for assay_name in assay_type_to_names[type_code]]
    for type_code in ('A', 'T', 'U')
)

In [12]:
# The sizes of the nine per-type sub-splits must add up to the number of tasks.
total_partitioned_tasks = sum(
    len(index_list)
    for index_list in (
        T_val_B_task_indices,
        T_val_F_task_indices,
        T_test_B_task_indices,
        T_test_F_task_indices,
        T_train_B_task_indices,
        T_train_F_task_indices,
        T_test_A_task_indices,
        T_test_T_task_indices,
        T_test_U_task_indices,
    )
)
assert total_partitioned_tasks == len(tasks)

In [13]:
# Make final bit vectors representing tasks for each of the train, validation and test splits 
# One 0/1 flag per task for each split; position i refers to tasks[i].
# Three independent lists are created, all starting out as all zeros.
T_tr, T_val, T_test = ([0] * len(tasks) for _ in range(3))

# Flag every B and F task assigned to the training split.
for idx in T_train_B_task_indices + T_train_F_task_indices:
    T_tr[idx] = 1

In [14]:
# Flag every B and F task assigned to the validation split.
for idx in T_val_B_task_indices + T_val_F_task_indices:
    T_val[idx] = 1

In [15]:
# Flag every task assigned to the test split: the held-out B and F tasks plus
# all A, T and U tasks.
test_index_lists = (
    T_test_B_task_indices,
    T_test_F_task_indices,
    T_test_A_task_indices,
    T_test_T_task_indices,
    T_test_U_task_indices,
)
for idx_list in test_index_lists:
    for idx in idx_list:
        T_test[idx] = 1

In [16]:
# sanity checks
# Sanity checks on the three bit vectors.

# Positions flagged in each split.
T_train_indices = set(np.nonzero(T_tr)[0])
T_val_indices = set(np.nonzero(T_val)[0])
T_test_indices = set(np.nonzero(T_test)[0])

# The number of flagged tasks across the splits equals the number of tasks ...
assert sum(T_tr) + sum(T_val) + sum(T_test) == len(tasks)

# ... and no task is flagged in more than one split.
assert not (T_train_indices & T_val_indices)
assert not (T_train_indices & T_test_indices)
assert not (T_val_indices & T_test_indices)

In [17]:
# save these task splits
# Persist each split's 0/1 task vector to its own pickle file
# (train, then validation, then test).
for split_path, split_vector in (
    ('chembl_1024_meta_train_task_split.pickle', T_tr),
    ('chembl_1024_meta_val_task_split.pickle', T_val),
    ('chembl_1024_meta_test_task_split.pickle', T_test),
):
    with open(split_path, 'wb') as handle:
        pickle.dump(split_vector, handle)