In [1]:
# Run this notebook after data_processing_updated_1.
# It builds the data of each fold separately from the files generated by updated_1.ipynb.

In [2]:
import pandas as pd
import numpy as np
import pickle
import copy
import requests
import random
from scipy import sparse
from copy import deepcopy
import time

# wall-clock start of this run (presumably used further down to report elapsed time)
start_processing_time = time.time()

# show every column when a DataFrame is displayed
pd.options.display.max_columns = None
# the folder to store the general data for generating each fold data
prefix = 'D:/B/PROJECT B2_2/dataset/Muthene_dataset/' 
midfix = 'common files/'

# relative idx mapping dicts (name -> relative id).
# Each file is opened in a `with` block so the handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load files produced by the
# upstream notebook, never files from an untrusted source.
with open(prefix + midfix + 'drug2relid_dict.pickle', 'rb') as in_file:
    drug2relid_dict = pickle.load(in_file)
with open(prefix + midfix + 'target2relid_dict.pickle', 'rb') as in_file:
    target2relid_dict = pickle.load(in_file)
with open(prefix + midfix + 'cellline2relid_dict.pickle', 'rb') as in_file:
    cellline2relid_dict = pickle.load(in_file)

# interaction information after data reduction based on cell line selection (select fixed number of cell lines)
drugcomb_reduced = pd.read_csv(prefix + midfix + 'drugcomb_depmap.csv')
twosides_reduced = pd.read_csv(prefix + midfix + 'twosides_depmap.csv')
drug_target_reduced = pd.read_csv(prefix + midfix + 'drug_target_depmap.csv')
target_target_reduced = pd.read_csv(prefix + midfix + 'target_target_depmap.csv')

# read the generated adjacent matrices (stored as scipy sparse .npz files)
drug_se_drug_coomatrix = sparse.load_npz(prefix + midfix + 'drug_se_drug_coomatrix.npz') # asymmetric
drug_te_drug_coomatrix = sparse.load_npz(prefix + midfix + 'drug_te_drug_coomatrix.npz') # asymmetric
drug_target_coomatrix = sparse.load_npz(prefix + midfix + 'drug_target_coomatrix.npz') # asymmetric
target_target_coomatrix = sparse.load_npz(prefix + midfix + 'target_target_coomatrix.npz') # symmetric

# dense copies of the two drug-drug matrices, used for direct [row, col] lookups later on
drug_se_drug_matrix = drug_se_drug_coomatrix.todense()
drug_te_drug_matrix = drug_te_drug_coomatrix.todense()

# Attach relative ids to every interaction table as new leading columns.
# Each name is looked up in its name -> relative-id dict; a name missing from the dict raises KeyError.

# drug_target: ids become columns 0 and 1
drug_target_drugid = [drug2relid_dict[name] for name in drug_target_reduced['drug_lower']]
drug_target_targetid = [target2relid_dict[name] for name in drug_target_reduced['gene symbol']]
drug_target_reduced.insert(loc=0, column='drugid', value=drug_target_drugid)
drug_target_reduced.insert(loc=1, column='targetid', value=drug_target_targetid)

# target_target: ids become columns 0 and 1
target_target_targetid1 = [target2relid_dict[name] for name in target_target_reduced['gene1 symbol']]
target_target_targetid2 = [target2relid_dict[name] for name in target_target_reduced['gene2 symbol']]
target_target_reduced.insert(loc=0, column='targetid1', value=target_target_targetid1)
target_target_reduced.insert(loc=1, column='targetid2', value=target_target_targetid2)

# drug_se_drug (twosides): ids become columns 0 and 1
drug_se_drug_drugid1 = [drug2relid_dict[name] for name in twosides_reduced['drug1_lower']]
drug_se_drug_drugid2 = [drug2relid_dict[name] for name in twosides_reduced['drug2_lower']]
twosides_reduced.insert(loc=0, column='drugid1', value=drug_se_drug_drugid1)
twosides_reduced.insert(loc=1, column='drugid2', value=drug_se_drug_drugid2)

# drug_te_drug (drugcomb): ids become columns 1 and 2, i.e. right after the existing first column
drug_te_drug_drugid1 = [drug2relid_dict[name] for name in drugcomb_reduced['drug1_lower']]
drug_te_drug_drugid2 = [drug2relid_dict[name] for name in drugcomb_reduced['drug2_lower']]
drugcomb_reduced.insert(loc=1, column='drugid1', value=drug_te_drug_drugid1)
drugcomb_reduced.insert(loc=2, column='drugid2', value=drug_te_drug_drugid2)

In [41]:
# report the (rows, columns) shape of each interaction table, one line per table
for table_name, table in (
    ('drugcomb_reduced', drugcomb_reduced),
    ('twosides_reduced', twosides_reduced),
    ('drug_target_reduced', drug_target_reduced),
    ('target_target_reduced', target_target_reduced),
):
    print(table_name + ':', table.shape)

drugcomb_reduced: (11166, 31)
twosides_reduced: (2446, 9)
drug_target_reduced: (2332, 5)
target_target_reduced: (91785, 4)


In [3]:
# *** each independent repeat is generated with its own random seed ***
# *** to produce the model input of one repeat, run every cell of this notebook from top to bottom ***

# seed -> fold/repeat number: 1 -> 1, 429 -> 2, 1002 -> 3, 1012 -> 4, 1024 -> 5
seed2fold = dict(zip((1, 429, 1002, 1012, 1024), range(1, 6)))

random_seed = 1024
# seed both NumPy's global generator and the stdlib generator
np.random.seed(random_seed)
random.seed(random_seed)

# the shuffled samples are cut into `folds` chunks: chunks 0-5 train, 6-7 validation, 8-9 test
folds = 10
train_fold = list(range(0, 6))
val_fold = list(range(6, 8))
test_fold = list(range(8, 10))

# the generated ids are based on the total number of input samples

def data_split_for_training(drug_disease_target_list, folds, train_fold, val_fold, test_fold, seed=None):
    """Shuffle the positions of the input samples and split them into train/val/test index arrays.

    The positions ``0 .. len(drug_disease_target_list) - 1`` are permuted with a dedicated
    ``np.random.RandomState`` and cut into ``folds`` nearly equal chunks; the chunks named
    by each fold list are merged and sorted.

    Parameters
    ----------
    drug_disease_target_list : sized sequence
        The samples to split; only its length is used.
    folds : int
        Number of chunks the shuffled positions are cut into.
    train_fold, val_fold, test_fold : sequence of int
        Chunk numbers assigned to the training, validation and test set. Each must be
        non-empty (``np.concatenate`` raises ``ValueError`` on an empty list).
    seed : int, optional
        Seed of the private generator. ``None`` (default) falls back to the notebook-level
        ``random_seed``, which is what existing calls rely on.

    Returns
    -------
    tuple of three 1-D integer ``np.ndarray``
        Ascending positions of the training, validation and test samples.
    """
    if seed is None:
        # keep the behaviour of existing callers: use the seed chosen in the config cell
        seed = random_seed

    prng = np.random.RandomState(seed)
    shuffled_positions = prng.permutation(len(drug_disease_target_list))
    chunks = np.array_split(shuffled_positions, folds)

    def _merge_sorted(fold_numbers):
        # np.sort keeps the integer dtype even when the merged split is empty, so the result
        # can always be used as an index array (np.array(sorted(...)) would yield float64 then)
        return np.sort(np.concatenate([chunks[k] for k in fold_numbers]))

    return _merge_sorted(train_fold), _merge_sorted(val_fold), _merge_sorted(test_fold)

# echo the seed chosen above so the cell output shows which seed this run used
print('random_seed:', random_seed)

random_seed: 1024


In [4]:
# preview the first rows of drugcomb_reduced, which now carries the inserted drugid1/drugid2 columns
drugcomb_reduced.head()

Unnamed: 0,block_id,drugid1,drugid2,drug1,drug2,cell_line_name,study_name,tissue_name,conc_row_unit,conc_col_unit,ic50_row,ic50_col,ri_row,ri_col,css_row,css_col,css_ri,S_sum,S_mean,S_max,synergy_zip,synergy_loewe,synergy_hsa,synergy_bliss,drug_row_clinical_phase,drug_col_clinical_phase,drug_row_target_name,drug_col_target_name,drug1_lower,drug2_lower,unified_name
0,132914,51,13,Imatinib,bleomycin,SF-268,ALMANAC,brain,uM,uM,1.997124,0.21727,-1.206,14.845,16.977,16.503,16.74,3.101,9.9205,1.895,-5.967873,-10.254135,1.665557,3.507983,4,4,Tyrosine-protein kinase ABL; Platelet-derived ...,\N,imatinib,bleomycin,imatinib_bleomycin
1,135848,76,5,paclitaxel,allopurinol,SF-268,ALMANAC,brain,uM,uM,0.026675,4.675736,29.318,0.146,17.788,4.071,10.9295,-18.5345,-3.8025,-18.3885,-5.326121,-22.472343,-9.719076,-9.262139,4,4,Tubulin; Integrin alpha-V/beta-3; Prelamin-A/C...,Xanthine dehydrogenase; Ras-related protein Ra...,paclitaxel,allopurinol,paclitaxel_allopurinol
2,135980,43,31,Fulvestrant,docetaxel,SF-268,ALMANAC,brain,uM,uM,4.0,0.030177,-14.037,4.741,3.049,17.096,10.0725,19.3685,14.7205,5.3315,2.377254,-6.237605,-4.144442,9.601456,4,4,Estrogen receptor alpha; Estrogen receptor bet...,\N,fulvestrant,docetaxel,fulvestrant_docetaxel
3,137713,59,26,letrozole,cyclophosphamide,SF-268,ALMANAC,brain,uM,uM,0.044412,33.509424,-1.077,-0.379,2.678,1.556,2.117,3.573,2.845,2.496,0.316095,1.835797,2.4167,4.009233,4,4,Cytochrome P450 19A1,\N,letrozole,cyclophosphamide,letrozole_cyclophosphamide
4,138370,51,48,Imatinib,hydroxyurea,SF-268,ALMANAC,brain,uM,uM,0.542648,1.398216,1.269,0.129,-1.341,-0.498,-0.9195,-2.3175,-1.6185,-2.1885,0.036013,-0.616815,-3.276514,-1.881524,4,4,Tyrosine-protein kinase ABL; Platelet-derived ...,Ribonucleoside-diphosphate reductase RR1,imatinib,hydroxyurea,imatinib_hydroxyurea


In [5]:
# 'reduced' means the samples were already cut down by keeping a fixed number of cell lines
# the drugid1/drugid2 columns hold relative ids (looked up in drug2relid_dict)
twosides_reduced.head()

Unnamed: 0,drugid1,drugid2,drug1,drug2,Polypharmacy Side Effect,Side Effect Name,drug1_lower,drug2_lower,unified_name
0,0,35,5-fluorouracil,exemestane,C0015967,body temperature increased,5-fluorouracil,exemestane,exemestane_5-fluorouracil
1,0,35,5-fluorouracil,exemestane,C0030193,Pain,5-fluorouracil,exemestane,exemestane_5-fluorouracil
2,0,35,5-fluorouracil,exemestane,C0027947,Neutropenia,5-fluorouracil,exemestane,exemestane_5-fluorouracil
3,0,35,5-fluorouracil,exemestane,C0398353,Hypoventilation,5-fluorouracil,exemestane,exemestane_5-fluorouracil
4,0,35,5-fluorouracil,exemestane,C0002871,anaemia,5-fluorouracil,exemestane,exemestane_5-fluorouracil


In [6]:
# Split the samples into training / validation / test sets by drug-drug pair:
# every sample that shares a drug-drug pair lands in the same set.

# array copies of the tables; drugcomb and twosides are matched on their last column ('unified_name')
np_drugcomb_reduced = np.array(drugcomb_reduced)
np_twosides_reduced = np.array(twosides_reduced)
np_drug_target_reduced = np.array(drug_target_reduced)
np_target_target_reduced = np.array(target_target_reduced)

# drug-drug pairs present in both drugcomb and twosides, as an alphabetically ordered numpy array
DDI_set = np.array(sorted(set(drugcomb_reduced['unified_name']) & set(twosides_reduced['unified_name'])))
print('len(DDI_set):', len(DDI_set))

# positions into DDI_set for each set, then the pair names themselves
train_idx, val_idx, test_idx = data_split_for_training(DDI_set, folds, train_fold, val_fold, test_fold)
train_DDI, val_DDI, test_DDI = DDI_set[train_idx], DDI_set[val_idx], DDI_set[test_idx]


def _rows_for_pairs(np_table, pair_names, columns):
    """Stack, pair by pair, the rows of np_table whose last column equals each name in pair_names."""
    per_pair_rows = [np_table[np_table[:, -1] == pair_name] for pair_name in pair_names]
    return pd.DataFrame(np.concatenate(per_pair_rows), columns=columns)


def _sort_by_drug_ids(table):
    """Order a table by (drugid1, drugid2), both ascending, and renumber its index."""
    return table.sort_values(['drugid1', 'drugid2']).reset_index(drop=True)


# gather the drugcomb and twosides rows of every set from its list of pair names
train_drugcomb = _rows_for_pairs(np_drugcomb_reduced, train_DDI, drugcomb_reduced.columns)
train_twosides = _rows_for_pairs(np_twosides_reduced, train_DDI, twosides_reduced.columns)
print('train_drugcomb.shape:', train_drugcomb.shape, 'train_twosides.shape:', train_twosides.shape)

val_drugcomb = _rows_for_pairs(np_drugcomb_reduced, val_DDI, drugcomb_reduced.columns)
val_twosides = _rows_for_pairs(np_twosides_reduced, val_DDI, twosides_reduced.columns)
print('val_drugcomb.shape:', val_drugcomb.shape, 'val_twosides.shape:', val_twosides.shape)

test_drugcomb = _rows_for_pairs(np_drugcomb_reduced, test_DDI, drugcomb_reduced.columns)
test_twosides = _rows_for_pairs(np_twosides_reduced, test_DDI, twosides_reduced.columns)
print('test_drugcomb.shape:', test_drugcomb.shape, 'test_twosides.shape:', test_twosides.shape)

# order every split by drug ids
train_drugcomb, val_drugcomb, test_drugcomb = (
    _sort_by_drug_ids(table) for table in (train_drugcomb, val_drugcomb, test_drugcomb))
train_twosides, val_twosides, test_twosides = (
    _sort_by_drug_ids(table) for table in (train_twosides, val_twosides, test_twosides))

# sanity check of the split: if two sets share no drug pair, each set difference printed
# below has the same size as its first set
print('len(train_DDI), len(val_DDI), len(test_DDI):', len(train_DDI), len(val_DDI), len(test_DDI))
print('len(set(train_DDI)-set(val_DDI)):', len(set(train_DDI)-set(val_DDI)))
print('len(set(train_DDI)-set(test_DDI)):', len(set(train_DDI)-set(test_DDI)))
print('len(set(val_DDI)-set(test_DDI)):', len(set(val_DDI)-set(test_DDI)))

test_drugcomb.head()

len(DDI_set): 346
train_drugcomb.shape: (6546, 31) train_twosides.shape: (1467, 9)
val_drugcomb.shape: (2464, 31) val_twosides.shape: (514, 9)
test_drugcomb.shape: (2156, 31) test_twosides.shape: (465, 9)
len(train_DDI), len(val_DDI), len(test_DDI): 210 68 68
len(set(train_DDI)-set(val_DDI)): 210
len(set(train_DDI)-set(test_DDI)): 210
len(set(val_DDI)-set(test_DDI)): 68


Unnamed: 0,block_id,drugid1,drugid2,drug1,drug2,cell_line_name,study_name,tissue_name,conc_row_unit,conc_col_unit,ic50_row,ic50_col,ri_row,ri_col,css_row,css_col,css_ri,S_sum,S_mean,S_max,synergy_zip,synergy_loewe,synergy_hsa,synergy_bliss,drug_row_clinical_phase,drug_col_clinical_phase,drug_row_target_name,drug_col_target_name,drug1_lower,drug2_lower,unified_name
0,392784,0,48,5-Fluorouracil,hydroxyurea,SF-268,ALMANAC,brain,uM,uM,14.5899,75.4088,17.005,-2.012,24.317,35.735,30.026,15.033,22.5295,13.021,1.05671,-1.73449,4.01518,3.83808,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Ribonucleoside-diphosphate reductase RR1,5-fluorouracil,hydroxyurea,5-fluorouracil_hydroxyurea
1,392785,0,48,5-Fluorouracil,hydroxyurea,SF-295,ALMANAC,brain,uM,uM,20.2656,100.0,28.684,2.707,21.612,46.378,33.995,2.604,18.2995,5.311,-11.0146,-9.77126,-5.53546,-8.08582,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Ribonucleoside-diphosphate reductase RR1,5-fluorouracil,hydroxyurea,5-fluorouracil_hydroxyurea
2,392731,0,48,5-Fluorouracil,hydroxyurea,SF-539,ALMANAC,brain,uM,uM,90.5494,96.9812,44.357,0.364,30.107,62.647,46.377,1.656,24.0165,2.02,-6.35403,-34.7497,-13.1418,-14.1142,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Ribonucleoside-diphosphate reductase RR1,5-fluorouracil,hydroxyurea,5-fluorouracil_hydroxyurea
3,392730,0,48,5-Fluorouracil,hydroxyurea,SNB-75,ALMANAC,brain,uM,uM,100.0,100.0,14.323,1.372,15.025,30.94,22.9825,7.2875,15.135,8.6595,3.84619,-6.60952,4.04867,2.83189,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Ribonucleoside-diphosphate reductase RR1,5-fluorouracil,hydroxyurea,5-fluorouracil_hydroxyurea
4,392733,0,48,5-Fluorouracil,hydroxyurea,U251,ALMANAC,brain,uM,uM,28.7806,100.0,32.903,5.035,32.962,50.1,41.531,3.593,22.562,8.628,-4.97902,-5.87885,-4.64817,-7.40013,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Ribonucleoside-diphosphate reductase RR1,5-fluorouracil,hydroxyurea,5-fluorouracil_hydroxyurea


In [7]:
# Preparation for the adjacency matrix of the heterogeneous network.

# node order: drugs first (type 0), then targets (type 1), then cell lines (type 2)
# the ids used in this task are the relative ids
num_drug = len(drug2relid_dict)
num_target = len(target2relid_dict)
num_cellline = len(cellline2relid_dict)
dim = num_drug + num_target + num_cellline

# one type label per node, laid out in the node order above
type_mask = np.concatenate([
    np.zeros(num_drug, dtype=int),
    np.ones(num_target, dtype=int),
    np.full(num_cellline, 2, dtype=int),
])

# Symmetry test for the se and te matrices: put a 1 at [col, row] for every non-zero
# [row, col] entry, then add the original matrix on top.
se_dense = np.array(drug_se_drug_matrix)
drug_se_drug_matrix_symmetric = (se_dense != 0).T.astype(float)
drug_se_drug_matrix_symmetric += se_dense

# number of entries that differ from their mirror entry; 0 means the result is symmetric
print((drug_se_drug_matrix_symmetric != drug_se_drug_matrix_symmetric.T).sum())

te_dense = np.array(drug_te_drug_matrix)
drug_te_drug_matrix_symmetric = (te_dense != 0).T.astype(float)
drug_te_drug_matrix_symmetric += te_dense
# the difference of the sums can be non-zero because se and te need not contain the same edges
print((drug_se_drug_matrix_symmetric-drug_te_drug_matrix_symmetric).sum())

print('edge numbers in te and se matrices:', drug_te_drug_matrix_symmetric.sum(),
      drug_te_drug_matrix.sum(), drug_se_drug_matrix_symmetric.sum(), drug_se_drug_matrix.sum())
print('num_drug:', num_drug, 'num_target:', num_target)

0
620.0
edge numbers in te and se matrices: 72.0 36.0 692.0 346.0
num_drug: 106 num_target: 12217


In [8]:
# Print every training drug-drug pair that has no side-effect row in train_twosides.
# If every training pair has at least one side effect label, this block prints nothing.
# The pairs are taken from train_DDI (the pair names that defined the training split);
# taking them from train_twosides itself could never reveal a missing pair.
pairs_with_side_effects = set(train_twosides['unified_name'])
for unified_name in train_DDI:
    if unified_name not in pairs_with_side_effects:
        print(unified_name)

train_twosides.head()

Unnamed: 0,drugid1,drugid2,drug1,drug2,Polypharmacy Side Effect,Side Effect Name,drug1_lower,drug2_lower,unified_name
0,0,5,5-fluorouracil,allopurinol,C0015967,body temperature increased,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
1,0,5,5-fluorouracil,allopurinol,C0011175,dehydration,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
2,0,5,5-fluorouracil,allopurinol,C0008033,pleural pain,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
3,0,5,5-fluorouracil,allopurinol,C0011991,diarrhea,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
4,0,5,5-fluorouracil,allopurinol,C0043096,loss of weight,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol


In [9]:
# preview the drug-target table with its inserted drugid / targetid columns
drug_target_reduced.head()

Unnamed: 0,drugid,targetid,drug,drug_lower,gene symbol
0,36,6855,ezetimibe,ezetimibe,NPC1L1
1,36,6854,ezetimibe,ezetimibe,NPC1
2,36,9069,ezetimibe,ezetimibe,SCAP
3,52,10522,imiquimod,imiquimod,TLR7
4,52,4623,imiquimod,imiquimod,HRH2


In [10]:
# Following the symmetry test above: the stored se and te matrices are asymmetric, so every
# training edge is written in both directions here (training set only).

# the model itself only consumes the adjacency lists and metapath indices, not adjM
adjM = np.zeros((dim, dim), dtype=int)
print('adjM.shape:', adjM.shape)

# drug-target edges (0-1); target nodes sit after the num_drug drug nodes
drug_target_reduced = drug_target_reduced.sort_values(['drugid', 'targetid']).reset_index(drop=True)
for uid, targetid in zip(drug_target_reduced['drugid'], drug_target_reduced['targetid']):
    aid = num_drug + targetid
    adjM[uid, aid] = adjM[aid, uid] = 1

# target-target edges (1-1)
target_target_reduced = target_target_reduced.sort_values(['targetid1', 'targetid2']).reset_index(drop=True)
for targetid1, targetid2 in zip(target_target_reduced['targetid1'], target_target_reduced['targetid2']):
    uid = num_drug + targetid1
    aid = num_drug + targetid2
    adjM[uid, aid] = adjM[aid, uid] = 1

# from here on two matrices are kept: adjM receives the se edges, adjM_te the te edges
adjM_te = adjM.copy()

# drug-se-drug edges (0-se-0), taken from the training drug pairs only
train_drug_se_drug = train_twosides[['drug1','drug2','drug1_lower','drug2_lower','drugid1','drugid2','unified_name']]
for uid, aid in zip(train_drug_se_drug['drugid1'], train_drug_se_drug['drugid2']):
    adjM[uid, aid] = adjM[aid, uid] = 1

# drug-te-drug edges (0-te-0), taken from the training drug pairs only
train_drug_te_drug_ = train_drugcomb[['drug1','drug2','drug1_lower','drug2_lower','drugid1','drugid2', 'unified_name']]
# keep a row only if drug_te_drug_matrix (built from a pre-defined synergy score threshold)
# marks the pair as an edge in either orientation
train_drug_te_drug = []
for _, row in train_drug_te_drug_.iterrows():
    temp1, temp2 = row['drugid1'], row['drugid2']
    is_te_edge = drug_te_drug_matrix[temp1, temp2] == 1 or drug_te_drug_matrix[temp2, temp1] == 1
    if is_te_edge:
        train_drug_te_drug.append(row)

# 509 samples corresponding to 24 unified_name (based on fold1)
train_drug_te_drug = pd.DataFrame(
    train_drug_te_drug,
    columns=['drug1','drug2','drug1_lower','drug2_lower','drugid1','drugid2', 'unified_name'],
).reset_index(drop=True)

for uid, aid in zip(train_drug_te_drug['drugid1'], train_drug_te_drug['drugid2']):
    adjM_te[uid, aid] = adjM_te[aid, uid] = 1

# number of undirected te edges between drugs (each edge is stored twice, hence the /2)
print(adjM_te[:num_drug,:num_drug].sum()/2)
# adjM stores AE (side effect) relationships, adjM_te stores TE relationships
# every drug pair in the dataset has at least one AE label, so every pair receives an edge in adjM,
# which makes adjM uninformative; TE edges exist only between some drug pairs,
# so adjM_te is the matrix used to create a kind of meta-path

adjM.shape: (12383, 12383)
21.0


In [11]:
# preview the training split of the drugcomb table (already sorted by drugid1, drugid2)
train_drugcomb.head()

Unnamed: 0,block_id,drugid1,drugid2,drug1,drug2,cell_line_name,study_name,tissue_name,conc_row_unit,conc_col_unit,ic50_row,ic50_col,ri_row,ri_col,css_row,css_col,css_ri,S_sum,S_mean,S_max,synergy_zip,synergy_loewe,synergy_hsa,synergy_bliss,drug_row_clinical_phase,drug_col_clinical_phase,drug_row_target_name,drug_col_target_name,drug1_lower,drug2_lower,unified_name
0,390241,0,5,5-Fluorouracil,allopurinol,SF-268,ALMANAC,brain,uM,uM,14.5899,166.823,17.005,-4.267,24.754,32.247,28.5005,15.7625,22.1315,11.4955,6.17715,-4.49406,7.10084,9.99667,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Xanthine dehydrogenase; Ras-related protein Ra...,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
1,390242,0,5,5-Fluorouracil,allopurinol,SF-295,ALMANAC,brain,uM,uM,20.2656,200.0,28.684,2.038,19.15,46.89,33.02,2.298,17.659,4.336,-6.28842,-11.9889,-4.53622,-6.21557,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Xanthine dehydrogenase; Ras-related protein Ra...,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
2,390283,0,5,5-Fluorouracil,allopurinol,SF-539,ALMANAC,brain,uM,uM,90.5494,99.4936,44.357,0.785,40.246,74.455,57.3505,12.2085,34.7795,12.9935,-1.44965,-25.1057,-4.29388,-5.2754,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Xanthine dehydrogenase; Ras-related protein Ra...,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
3,390282,0,5,5-Fluorouracil,allopurinol,SNB-75,ALMANAC,brain,uM,uM,100.0,26.6103,14.323,1.932,19.283,44.077,31.68,15.425,23.5525,17.357,3.50575,0.808711,7.11048,5.4775,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Xanthine dehydrogenase; Ras-related protein Ra...,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
4,390277,0,5,5-Fluorouracil,allopurinol,U251,ALMANAC,brain,uM,uM,28.7806,200.0,32.903,1.693,28.894,55.685,42.2895,7.6935,24.9915,9.3865,-3.68153,-9.15827,-4.86891,-6.68447,4,4,Prelamin-A/C; Survival motor neuron protein; T...,Xanthine dehydrogenase; Ras-related protein Ra...,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol


In [None]:
# *********************************************************************************************
# build the dicts that store, per node, its neighbours of each type (one dict per edge type)
# *********************************************************************************************

# metapath/edge types for drug embedding:
# 1.drug-target-drug (drug-target)
# 2.drug-target-target-drug (drug-target,target-target)
# 3.drug-target-target-target-drug (drug-target,target-target)
# 4.drug-se-drug (drug-drug)
# 5.drug-te-drug (drug-drug)

# ******************************************************************************************
# every index stored in these dicts is a relative index, not an absolute index
# ******************************************************************************************

# the cell line part of adjM is still empty, which does not influence the calculation below
print(num_drug + num_target + num_cellline, adjM.shape[0])

# row bands of the adjacency matrices: drug nodes come first, target nodes follow
drug_rows = adjM[:num_drug]
target_rows = adjM[num_drug:num_drug + num_target]
drug_rows_te = adjM_te[:num_drug]

# 1. drug -> non-zero columns right of the drug band (column 0 there is the first target)
drug_target_list = {drug: row.nonzero()[0] for drug, row in enumerate(drug_rows[:, num_drug:])}
# 2. target -> neighbouring drugs
target_drug_list = {target: row.nonzero()[0] for target, row in enumerate(target_rows[:, :num_drug])}
# 3. target -> non-zero columns right of the drug band
target_target_list = {target: row.nonzero()[0] for target, row in enumerate(target_rows[:, num_drug:])}
# 4. drug -> drugs linked by a side-effect edge
drug_se_drug_list = {drug: row.nonzero()[0] for drug, row in enumerate(drug_rows[:, :num_drug])}
# 5. drug -> drugs linked by a therapeutic-effect edge
drug_te_drug_list = {drug: row.nonzero()[0] for drug, row in enumerate(drug_rows_te[:, :num_drug])}

# test: table shapes next to the number of non-zero entries of the matching matrix band
print(target_target_reduced.shape,adjM[num_drug:,num_drug:].sum())
print(train_drug_se_drug.shape,adjM[:num_drug,:num_drug].sum())
print(train_drug_te_drug.shape,adjM_te[:num_drug,:num_drug].sum())

In [14]:
# ***********************************************************************************************************************
# build the metapath instance arrays (expressed in absolute indices)
# "absolute" here (and only here!) means the indices used in adjM, i.e. drug indices precede target indices
# elsewhere, absolute ids are the ids used in the complete drugcomb and twosides datasets
# ***********************************************************************************************************************

# drug-target-drug (0-1-0): every ordered pair of drugs that share a target
# (the order of drug1 and drug2 should not matter; both orders are generated)
drug_target_drug = np.array([
    (drug1, target, drug2)
    for target, drug_list in target_drug_list.items()
    for drug1 in drug_list
    for drug2 in drug_list
])
# shift the target column to absolute indices (drugs first, then targets)
drug_target_drug[:, 1] += num_drug

# row order with priority drug1 -> drug2 -> target; np.lexsort treats its LAST key as the primary one
sorted_index = np.lexsort(
    (drug_target_drug[:, 1], drug_target_drug[:, 2], drug_target_drug[:, 0])).tolist()

drug_target_drug = drug_target_drug[sorted_index]
print('drug_target_drug.shape:',drug_target_drug.shape)

drug_target_drug.shape: (11493, 3)


In [15]:
# target_target_reduced after the relative target ids were added and the rows were
# sorted by (targetid1, targetid2) in an earlier cell
target_target_reduced

Unnamed: 0,targetid1,targetid2,gene1 symbol,gene2 symbol
0,1,1707,A2M,CDK7
1,1,6188,A2M,MMP2
2,1,6189,A2M,MMP20
3,1,6196,A2M,MMP8
4,1,10394,A2M,TF
...,...,...,...,...
91780,12214,9899,ZYX,SRC
91781,12214,11307,ZYX,VCAM1
91782,12215,2661,ZZEF1,DHRS4L2
91783,12215,3562,ZZEF1,FGFRL1


In [16]:
# drug-target-target-drug (0-1-1-0)
# step 1: collect the target-target pairs (1-1)
# drug1-target1-target2-drug2 and drug2-target2-target1-drug1 are both meaningful,
# so each target pair is needed in both directions (i.e. symmetric target-target data)

forward_pairs = target_target_reduced[['targetid1','targetid2']].to_numpy(dtype=np.int32)
# append the reverse direction of every pair, then drop duplicated pairs
both_directions = np.concatenate([forward_pairs, forward_pairs[:, [1, 0]]], axis=0)
target_inter_target = np.unique([tuple(pair) for pair in both_directions], axis=0)

# target ids stay relative here (no num_drug offset yet); order by target1, then target2
sorted_index = np.lexsort((target_inter_target[:, 1], target_inter_target[:, 0])).tolist()
target_inter_target = target_inter_target[sorted_index]
print('target_inter_target.shape:',target_inter_target.shape)

# step 2: attach every drug of target1 on the left and every drug of target2 on the right
drug_target_target_drug = np.array([
    (drug1, target1, target2, drug2)
    for target1, target2 in target_inter_target
    for drug1 in target_drug_list[target1]
    for drug2 in target_drug_list[target2]
])

# shift the two target columns from relative to absolute indices
drug_target_target_drug[:, [1, 2]] += num_drug
# row order with priority drug1 -> drug2 -> target1 -> target2 (last lexsort key is the primary one)
sorted_index = np.lexsort((
    drug_target_target_drug[:, 2],
    drug_target_target_drug[:, 1],
    drug_target_target_drug[:, 3],
    drug_target_target_drug[:, 0],
)).tolist()
drug_target_target_drug = drug_target_target_drug[sorted_index]
print('drug_target_target_drug.shape:',drug_target_target_drug.shape)

target_inter_target.shape: (183570, 2)
drug_target_target_drug.shape: (341972, 4)


In [17]:
# target_inter_target was sorted by (target1, target2) in the previous cell;
# its ids are still relative target ids (the num_drug offset is not applied to it)
target_inter_target

array([[    0,  3513],
       [    0,  5095],
       [    0,  9899],
       ...,
       [12215,  3562],
       [12215,  7370],
       [12216,  3038]])

In [18]:
# drug-target-target-target-drug (0-1-1-1-0)
# step 1: collect the target-target-target triples (1-1-1)

# for every central target, pair up all of its neighbours in both orders
# (a neighbour is also paired with itself), giving (side target, central target, side target)
target_target_target = np.array([
    (target1, target, target2)
    for target, target_list in target_target_list.items()
    for target1 in target_list
    for target2 in target_list
])

# row order with priority target1 -> target2 -> central target (last lexsort key is the primary one)
sorted_index = np.lexsort((
    target_target_target[:, 1],
    target_target_target[:, 2],
    target_target_target[:, 0],
)).tolist()
target_target_target = target_target_target[sorted_index]

print('target_target_target.shape:',target_target_target.shape)

# all combinations are kept: each row is one central target together with two side targets

target_target_target.shape: (36025496, 3)


In [19]:
# target_drug_list has one entry per relative target id, whether or not any drug is linked to it
print(len(target_drug_list)) # including DTIs for all targets in the dataset

12217


In [20]:
# Build the drug-target-target-target-drug (0-1-1-1-0) instances by attaching a random subset of
# drugs to both side targets of every target-target-target triple.
# np.random.choice draws from NumPy's global generator (seeded with random_seed in an earlier cell),
# so the result depends on the exact sequence of draws made in this loop.
drug_target_target_target_drug=[]

# fraction of the drugs of each side target that is sampled
dtttd_ratio = 0.5
for target1,target,target2 in target_target_target:
    # target_drug_list is keyed by relative target indices
    # metapath instance selection rule: skip the triple unless all three targets have at least one drug
    # alternative, looser rules kept for reference:
    # if(len(target_drug_list[target1])==0 or len(target_drug_list[target])==0):
    # if(len(target_drug_list[target1])==0 or len(target_drug_list[target2])==0): 
    if(len(target_drug_list[target1])==0 or len(target_drug_list[target])==0 or len(target_drug_list[target2])==0):
        continue
        
    # sample positions without replacement; int() truncates, so a target with a single drug yields no candidate
    candidate_drug1_list=np.random.choice(len(target_drug_list[target1]), int(dtttd_ratio*len(target_drug_list[target1])), replace=False)
    
    # turn the sampled positions into drug ids
    candidate_drug1_list=target_drug_list[target1][candidate_drug1_list]

    candidate_drug2_list=np.random.choice(len(target_drug_list[target2]), int(dtttd_ratio*len(target_drug_list[target2])), replace=False)
    
    candidate_drug2_list=target_drug_list[target2][candidate_drug2_list]
    
    # one instance per (sampled drug of target1, sampled drug of target2) combination
    drug_target_target_target_drug.extend([(drug1,target1,target,target2,drug2) for drug1 in candidate_drug1_list for drug2 in candidate_drug2_list])

drug_target_target_target_drug=np.array(drug_target_target_target_drug)

# transform relative indices to absolute indices (the three target columns are shifted by num_drug)
drug_target_target_target_drug[:, [1, 2, 3]] += num_drug
# row order with priority drug1 -> drug2 -> target1 -> central target -> target2
sorted_index=sorted(list(range(len(drug_target_target_target_drug))),key=lambda i: drug_target_target_target_drug[i,[0,4,1,2,3]].tolist())
drug_target_target_target_drug=drug_target_target_target_drug[sorted_index]
print('drug_target_target_target_drug.shape:',drug_target_target_target_drug.shape)

drug_target_target_target_drug.shape: (4278094, 5)


In [21]:
# preview the training drugcomb rows that passed the drug_te_drug_matrix filter
train_drug_te_drug.head()

Unnamed: 0,drug1,drug2,drug1_lower,drug2_lower,drugid1,drugid2,unified_name
0,celecoxib,paclitaxel,celecoxib,paclitaxel,21,76,celecoxib_paclitaxel
1,celecoxib,paclitaxel,celecoxib,paclitaxel,21,76,celecoxib_paclitaxel
2,celecoxib,paclitaxel,celecoxib,paclitaxel,21,76,celecoxib_paclitaxel
3,celecoxib,paclitaxel,celecoxib,paclitaxel,21,76,celecoxib_paclitaxel
4,celecoxib,paclitaxel,celecoxib,paclitaxel,21,76,celecoxib_paclitaxel


In [22]:
# *******************************************************************************
# drug-drug metapath indices. Notice: only drug pairs of the training set may be used
# as with 0-1-1-0, drug1-drug2 and drug2-drug1 are both meaningful:
# one generates the drug1 embedding, the other the drug2 embedding
# Notice: the samples are doubled only after the data split
# *******************************************************************************


def _both_direction_pairs(pair_table):
    """Return the unique (drugid1, drugid2) pairs of pair_table in both directions, sorted, plus the sort order."""
    forward = pair_table[['drugid1','drugid2']].to_numpy(dtype=np.int32)
    doubled = np.concatenate([forward, forward[:, [1, 0]]], axis=0)
    unique_pairs = np.unique([tuple(pair) for pair in doubled], axis=0)
    order = sorted(range(len(unique_pairs)), key=lambda k: unique_pairs[k].tolist())
    return unique_pairs[order], order


# metapath index for therapeutic effect
drug_inter1_drug, sorted_index = _both_direction_pairs(train_drug_te_drug)
print('drug_inter1_drug.shape:',drug_inter1_drug.shape)

# metapath index for side effect
drug_inter2_drug, sorted_index = _both_direction_pairs(train_drug_se_drug)
print('drug_inter2_drug.shape:',drug_inter2_drug.shape)

drug_inter1_drug.shape: (42, 2)
drug_inter2_drug.shape: (420, 2)


In [23]:
import pathlib
# Save, for the current fold, the metapath instances of every node in two forms:
#   <metapath>_idx.pickle : dict {node index -> array of its metapath instances}
#   <metapath>.adjlist    : one text line per node, "<node> <neighbor> <neighbor> ..."
# Notice: the total number of training samples: train_drugcomb*2 -> consider A-B/B-A issue

# the cell line information stored in the order of cellline2relid_dict has already been saved

# 1, 429, 1002, 1012, 1024: 1, 2, 3, 4, 5
# (presumably the random seeds and the fold numbers they map to in seed2fold - verify)

involved_metapaths=[ [(0,1,0),(0,1,1,0),(0,1,1,1,0),(0,'te',0),(0,'se',0)] ]

fold_dir = prefix + 'fold{}/'.format(seed2fold[random_seed])

# one sub-folder per metapath group
for group_id in range(len(involved_metapaths)):
    pathlib.Path(fold_dir + '{}'.format(group_id)).mkdir(parents=True, exist_ok=True)

metapath_indices_mapping = {
    (0, 1, 0): drug_target_drug,
    (0, 1, 1, 0): drug_target_target_drug,
    (0, 1, 1, 1, 0): drug_target_target_target_drug,
    (0, 'te', 0): drug_inter1_drug,
    (0, 'se', 0): drug_inter2_drug,
}

# store all data (according to metapath_indices_mapping)
target_idx_lists = [np.arange(num_drug), np.arange(num_target)]
offset_list = [0, num_drug]


def _iter_node_spans(instances, node_ids, offset):
    """Yield (node_id, start, stop) for every node in `node_ids`.

    `instances[start:stop]` are the consecutive rows whose first column equals
    `node_id + offset`. The scan only moves forward, so it relies on `instances`
    being ordered by its first column; nodes without rows get an empty span.
    """
    stop = 0
    for node_id in node_ids:
        start = stop
        while stop < len(instances) and instances[stop, 0] == node_id + offset:
            stop += 1
        yield node_id, start, stop


for group_id, metapaths in enumerate(involved_metapaths):
    node_ids = target_idx_lists[group_id]
    offset = offset_list[group_id]
    for metapath in metapaths:
        # all instances of the current metapath
        edge_metapath_idx_array = metapath_indices_mapping[metapath]
        file_stem = fold_dir + '{}/'.format(group_id) + '-'.join(map(str, metapath))

        # pickle file: every instance is stored reversed, so the node that starts
        # the metapath ends up in the last position (as the target node)
        with open(file_stem + '_idx.pickle', 'wb') as out_file:
            target_metapaths_mapping = {
                node_id: edge_metapath_idx_array[start:stop, ::-1]
                for node_id, start, stop in _iter_node_spans(edge_metapath_idx_array, node_ids, offset)
            }
            pickle.dump(target_metapaths_mapping, out_file)

        # adjlist file: the node followed by the last node of each of its instances,
        # both expressed as relative indices (offset removed)
        with open(file_stem + '.adjlist', 'w') as out_file:
            for node_id, start, stop in _iter_node_spans(edge_metapath_idx_array, node_ids, offset):
                neighbors = [str(n) for n in edge_metapath_idx_array[start:stop, -1] - offset]
                # a node without neighbors is written as a line holding only its index
                out_file.write(' '.join([str(node_id)] + neighbors) + '\n')

In [24]:
# Store the model input that is specific to the current fold (not the general data).
import scipy

# The per-split CSV exports below are currently disabled.
# these files are based on drugcomb_(depmap)_reduced
# train_drugcomb.to_csv(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'train_drugcomb.csv', index=0)
# val_drugcomb.to_csv(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'val_drugcomb.csv', index=0)
# test_drugcomb.to_csv(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'test_drugcomb.csv', index=0)

# train_twosides.to_csv(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'train_twosides.csv', index=0)
# val_twosides.to_csv(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'val_twosides.csv', index=0)
# test_twosides.to_csv(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'test_twosides.csv', index=0)

fold_dir = prefix + 'fold{}/'.format(seed2fold[random_seed])

# heterogeneous adjacency matrix, converted to CSR and written as a sparse npz file
adjM_sparse = scipy.sparse.csr_matrix(adjM)
scipy.sparse.save_npz(fold_dir + 'adjM.npz', adjM_sparse)

# node type mask of all nodes
np.save(fold_dir + 'node_types.npy', type_mask)

# copy the three relative-id mapping dicts into the fold folder
relid_dicts = (
    ('drug2relid_dict.pickle', drug2relid_dict),
    ('target2relid_dict.pickle', target2relid_dict),
    ('cellline2relid_dict.pickle', cellline2relid_dict),
)
for file_name, mapping in relid_dicts:
    with open(fold_dir + file_name, 'wb') as out_file:
        pickle.dump(mapping, out_file)

In [25]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalise the cell line expression matrix and copy the atom-number
# mapping dict into the folder of the current fold.

# NOTE(review): allow_pickle=True (like pickle.load below) can execute arbitrary
# code from a crafted file - only load files generated by this project.
cellline_expression = np.load(prefix + midfix + 'cellline_expression.npy', allow_pickle = True)
scaler = MinMaxScaler()
scaler.fit(cellline_expression)
cellline_expression_normalized = scaler.transform(cellline_expression)

# after 0-1 normalization
np.save(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'cellline_expression_normalized.npy', cellline_expression_normalized)

# `with` guarantees the input handle is closed even when unpickling fails
with open(prefix + midfix + 'atomnum2id_dict.pickle', 'rb') as in_file:
    atomnum2id_dict = pickle.load(in_file)

with open(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'atomnum2id_dict.pickle', 'wb') as out_file:
    pickle.dump(atomnum2id_dict, out_file)

In [26]:
# As described above, the training set is doubled by adding the reversed
# (drug2, drug1) orientation of every drug-drug sample.

# sanity check: number of drug pairs in each set
print('len(train_DDI), len(val_DDI), len(test_DDI):', len(train_DDI), len(val_DDI), len(test_DDI))

# train_drugcomb is based on drugcomb_reduced
print(len(set(train_drugcomb['unified_name'])), len(set(val_drugcomb['unified_name'])), len(set(test_drugcomb['unified_name'])))

# columns kept in every split; the remaining (redundant) columns are dropped
drugcomb_payload_columns = ['cell_line_name','S_mean','synergy_zip','synergy_loewe','synergy_hsa','synergy_bliss']
drugcomb_forward_columns = ['drugid1','drugid2'] + drugcomb_payload_columns
drugcomb_swapped_columns = ['drugid2','drugid1'] + drugcomb_payload_columns
drugcomb_sort_keys = ['drugid1', 'drugid2']

# training set: stack the id-swapped rows beneath the original rows; the swapped
# frame is stacked positionally, so its values land under the forward column names
train_drugcomb = train_drugcomb[drugcomb_forward_columns]
train_drugcomb_ = train_drugcomb[drugcomb_swapped_columns]

doubled_rows = np.vstack([np.array(train_drugcomb), np.array(train_drugcomb_)])
train_drugcomb = pd.DataFrame(doubled_rows, columns=train_drugcomb.columns)
train_drugcomb = train_drugcomb.sort_values(drugcomb_sort_keys, ascending=(True,True)).reset_index(drop=True)

# validation / test sets: same columns and ordering, number of samples unchanged
val_drugcomb = val_drugcomb[drugcomb_forward_columns]
val_drugcomb = val_drugcomb.sort_values(drugcomb_sort_keys, ascending=(True,True)).reset_index(drop=True)

test_drugcomb = test_drugcomb[drugcomb_forward_columns]
test_drugcomb = test_drugcomb.sort_values(drugcomb_sort_keys, ascending=(True,True)).reset_index(drop=True)

len(train_DDI), len(val_DDI), len(test_DDI): 210 68 68
210 68 68


In [27]:
# display the training-split side-effect table (drug pair, side-effect code and name per row)
train_twosides

Unnamed: 0,drugid1,drugid2,drug1,drug2,Polypharmacy Side Effect,Side Effect Name,drug1_lower,drug2_lower,unified_name
0,0,5,5-fluorouracil,allopurinol,C0015967,body temperature increased,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
1,0,5,5-fluorouracil,allopurinol,C0011175,dehydration,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
2,0,5,5-fluorouracil,allopurinol,C0008033,pleural pain,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
3,0,5,5-fluorouracil,allopurinol,C0011991,diarrhea,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
4,0,5,5-fluorouracil,allopurinol,C0043096,loss of weight,5-fluorouracil,allopurinol,5-fluorouracil_allopurinol
...,...,...,...,...,...,...,...,...,...
1462,105,90,zoledronic acid,sorafenib,C0043096,loss of weight,zoledronic acid,sorafenib,sorafenib_zoledronic acid
1463,105,90,zoledronic acid,sorafenib,C0040034,thrombocytopenia,zoledronic acid,sorafenib,sorafenib_zoledronic acid
1464,105,90,zoledronic acid,sorafenib,C0000737,abdominal pain,zoledronic acid,sorafenib,sorafenib_zoledronic acid
1465,105,90,zoledronic acid,sorafenib,C0042963,emesis,zoledronic acid,sorafenib,sorafenib_zoledronic acid


In [28]:
# sanity check: number of drug pairs in each set
print('len(train_DDI), len(val_DDI), len(test_DDI):', len(train_DDI), len(val_DDI), len(test_DDI))
print(len(set(train_twosides['unified_name'])), len(set(val_twosides['unified_name'])), len(set(test_twosides['unified_name'])))


def _as_twosides_frame(records):
    """Wrap `records` in a frame with the columns of twosides_reduced and a fresh 0..n-1 index."""
    return pd.DataFrame(records, columns=twosides_reduced.columns).reset_index(drop=True)


# train_twosides is based on twosides_reduced
train_twosides = _as_twosides_frame(train_twosides)
val_twosides = _as_twosides_frame(val_twosides)
test_twosides = _as_twosides_frame(test_twosides)

twosides_payload_columns = ['Polypharmacy Side Effect','Side Effect Name']
twosides_forward_columns = ['drugid1','drugid2'] + twosides_payload_columns
twosides_swapped_columns = ['drugid2','drugid1'] + twosides_payload_columns
twosides_sort_keys = ['drugid1', 'drugid2']

# training set: double the samples by stacking the id-swapped rows beneath the
# original rows (stacked positionally, so they land under the forward column names)
train_twosides = train_twosides[twosides_forward_columns]
train_twosides_ = train_twosides[twosides_swapped_columns]

doubled_rows = np.vstack([np.array(train_twosides), np.array(train_twosides_)])
train_twosides = pd.DataFrame(doubled_rows, columns=train_twosides.columns)
train_twosides = train_twosides.sort_values(twosides_sort_keys, ascending=(True,True)).reset_index(drop=True)

# validation / test sets: only filter the columns and sort
val_twosides = val_twosides[twosides_forward_columns]
val_twosides = val_twosides.sort_values(twosides_sort_keys, ascending=(True,True)).reset_index(drop=True)

test_twosides = test_twosides[twosides_forward_columns]
test_twosides = test_twosides.sort_values(twosides_sort_keys, ascending=(True,True)).reset_index(drop=True)

len(train_DDI), len(val_DDI), len(test_DDI): 210 68 68
210 68 68


In [29]:
# merge drugcomb and twosides

# Side effect mapping dict: every side-effect symbol of the selected
# twosides_reduced gets an id equal to its rank in sorted order.
se_symbolset = sorted(set(twosides_reduced['Polypharmacy Side Effect']))
se_symbol2id_dict = dict(zip(se_symbolset, range(len(se_symbolset))))
print('se_symbol2id_dict:', se_symbol2id_dict)

se_dict_path = prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'se_symbol2id_dict.pickle'
with open(se_dict_path, 'wb') as out_file:
    pickle.dump(se_symbol2id_dict, out_file)

se_symbol2id_dict: {'C0000737': 0, 'C0002871': 1, 'C0004093': 2, 'C0008033': 3, 'C0009676': 4, 'C0011175': 5, 'C0011991': 6, 'C0013404': 7, 'C0015672': 8, 'C0015967': 9, 'C0020649': 10, 'C0027497': 11, 'C0027947': 12, 'C0030193': 13, 'C0032285': 14, 'C0040034': 15, 'C0042963': 16, 'C0043096': 17, 'C0085649': 18, 'C0398353': 19}


In [30]:
# preview the first rows of the training samples after doubling and column filtering
train_drugcomb.head()

Unnamed: 0,drugid1,drugid2,cell_line_name,S_mean,synergy_zip,synergy_loewe,synergy_hsa,synergy_bliss
0,0,5,SF-268,22.1315,6.17715,-4.49406,7.10084,9.99667
1,0,5,SF-295,17.659,-6.28842,-11.9889,-4.53622,-6.21557
2,0,5,SF-539,34.7795,-1.44965,-25.1057,-4.29388,-5.2754
3,0,5,SNB-75,23.5525,3.50575,0.808711,7.11048,5.4775
4,0,5,U251,24.9915,-3.68153,-9.15827,-4.86891,-6.68447


In [31]:
# preview the first rows of the training side-effect records after doubling and column filtering
train_twosides.head()

Unnamed: 0,drugid1,drugid2,Polypharmacy Side Effect,Side Effect Name
0,0,5,C0015967,body temperature increased
1,0,5,C0011175,dehydration
2,0,5,C0008033,pleural pain
3,0,5,C0011991,diarrhea
4,0,5,C0043096,loss of weight


In [32]:
# Obtain labels for drug-drug-cell line pairs.
# Every drug-drug-cell line pair has a label: one multi-hot side-effect (se) row
# and one therapeutic-effect (te) row.

def _build_se_labels(samples, twosides, symbol2id):
    """Build the multi-hot side-effect label matrix for `samples`.

    A sample receives the side effects of every `twosides` row that names the
    same two drugs, in either orientation.

    Parameters
    ----------
    samples : numpy.ndarray
        2-D array; columns 0 and 1 hold the two drug ids of each sample.
    twosides : numpy.ndarray
        2-D array; columns 0 and 1 hold drug ids, column 2 the side-effect symbol.
    symbol2id : dict
        Maps a side-effect symbol to its column in the label matrix.

    Returns
    -------
    labels : numpy.ndarray
        float64 array of shape (len(samples), len(symbol2id)); entry (i, j) is 1
        when side effect j was matched for sample i, otherwise 0.
    n_matches : int
        Number of matched `twosides` rows summed over all samples.

    Raises
    ------
    KeyError
        If a matched side-effect symbol is missing from `symbol2id`.
    """
    # Index the side-effect rows once by unordered drug pair instead of scanning
    # the whole table for every sample; a frozenset makes (A, B) and (B, A) the same key.
    symbols_by_pair = {}
    for record in twosides:
        symbols_by_pair.setdefault(frozenset((record[0], record[1])), []).append(record[2])

    labels = np.zeros((samples.shape[0], len(symbol2id)))
    n_matches = 0
    for sample_idx, sample in enumerate(samples):
        symbols = symbols_by_pair.get(frozenset((sample[0], sample[1])), [])
        if symbols:
            # put the retrieved labels into the row of the current drug-drug-cell line pair
            labels[sample_idx, [symbol2id[symbol] for symbol in symbols]] = 1
        n_matches += len(symbols)
    return labels, n_matches


# training (the se labels have the same order as samples in train_drugcomb_numpy)
train_drugcomb_numpy = np.array(train_drugcomb)
train_twosides_numpy = np.array(train_twosides)
# train_twosides holds every pair in both orientations and both are matched,
# thus counter_ counts the se labels twice
train_se_label, counter_ = _build_se_labels(train_drugcomb_numpy, train_twosides_numpy, se_symbol2id_dict)

# validation
val_drugcomb_numpy = np.array(val_drugcomb)
val_twosides_numpy = np.array(val_twosides)
val_se_label, _ = _build_se_labels(val_drugcomb_numpy, val_twosides_numpy, se_symbol2id_dict)

# test
test_drugcomb_numpy = np.array(test_drugcomb)
test_twosides_numpy = np.array(test_twosides)
test_se_label, _ = _build_se_labels(test_drugcomb_numpy, test_twosides_numpy, se_symbol2id_dict)

print('train_drugcomb_numpy.shape:', train_drugcomb_numpy.shape, 'train_drugcomb.shape:', train_drugcomb.shape)
print('counter_:',counter_)
print('train_se_label.shape:',train_se_label.shape)
print('train_se_label.sum():',train_se_label.sum())

# create te labels (the column order in created te labels: S_mean, synergy_zip, synergy_loewe, synergy_hsa, synergy_bliss)
# train_drugcomb column order: drugid1, drugid2, cell_line_name, S_mean, synergy_zip, synergy_loewe, synergy_hsa, synergy_bliss
train_te_label=np.array(train_drugcomb)[:,3:]
val_te_label=np.array(val_drugcomb)[:,3:]
test_te_label=np.array(test_drugcomb)[:,3:]

train_drugcomb_numpy.shape: (13092, 8) train_drugcomb.shape: (13092, 8)
counter_: 183448
train_se_label.shape: (13092, 20)
train_se_label.sum(): 91724.0


In [33]:
# An npy file stores one binary numpy array, while an npz file is a zip that can hold several of them.
# *** these files store the updated drugcomb data in which samples have doubled and columns are filtered ***
fold_dir = prefix + 'fold{}/'.format(seed2fold[random_seed])

# samples: the first three columns as strings -> drugid1, drugid2, cell_line_name
sample_arrays = {}
for split_name, split_frame in (('train', train_drugcomb), ('val', val_drugcomb), ('test', test_drugcomb)):
    sample_arrays['{}_drug_drug_samples'.format(split_name)] = np.array(split_frame,dtype=str)[:,:3]
np.savez(fold_dir + 'train_val_test_drug_drug_samples.npz', **sample_arrays)

# labels: therapeutic-effect and side-effect labels of every split as float32
label_arrays = {}
for split_name, te_label, se_label in (('train', train_te_label, train_se_label),
                                       ('val', val_te_label, val_se_label),
                                       ('test', test_te_label, test_se_label)):
    label_arrays['{}_te_labels'.format(split_name)] = te_label.astype('float32')
    label_arrays['{}_se_labels'.format(split_name)] = se_label.astype('float32')
np.savez(fold_dir + 'train_val_test_drug_drug_labels.npz', **label_arrays)

In [34]:
# store ECFP6 data
# dict storing mapping from relative drug ids to corresponding ECFP6
# (`with` guarantees the handle is closed even when unpickling fails)
with open(prefix + midfix + 'drugid2morgan.pickle', 'rb') as in_file:
    drugid2morgan = pickle.load(in_file)

# drugid2morgan is based on drug2relid_dict (drug2relid_dict is fixed after selecting cell lines)
# NOTE(review): the rows follow the insertion order of the dict; this presumably
# equals the relative drug id order (the keys printed below are 0..105 in order) -
# verify whenever drugid2morgan.pickle is regenerated.
ECFP6_DNN = np.array(list(drugid2morgan.values()))

ECFP6_DNN_coomatrix = sparse.coo_matrix(ECFP6_DNN)
sparse.save_npz(prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'ECFP6_DNN_coomatrix.npz', ECFP6_DNN_coomatrix)

print(ECFP6_DNN.shape)
print(drugid2morgan.keys())

(106, 1024)
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105])


In [35]:
# create the folder in which the main code stores the generated models
import os
model_storage_path = prefix + 'fold{}/'.format(seed2fold[random_seed]) + 'checkpoint/'
# exist_ok=True creates the folder only when it is missing, without a separate
# (racy) existence check; this mirrors the mkdir(..., exist_ok=True) used for the metapath folders
os.makedirs(model_storage_path, exist_ok=True)

In [36]:
# report the wall-clock time elapsed since start_processing_time was taken in the first code cell
end_processing_time = time.time()
duration = end_processing_time - start_processing_time
print('the processing time to generate model input for current fold is:', duration)
# duration is in seconds (difference of two time.time() values)
# the preprocessing time is about 7-8 min

the processing time to generate model input for current fold is: 334.9984679222107


In [37]:
# *************************************##################### #
# the end of data preprocessing for current independent repeat
# *************************************##################### #

In [45]:
# Extra check: overlap of the training drug pairs between the different folds.
# It reads the files saved for fold1..fold5, so all five folds must have been generated first.

def _load_fold_samples(fold_path):
    """Return the (train, val, test) drug-drug sample arrays saved for one fold.

    The npz archive is opened in a `with` block so that its file handle is closed
    as soon as the three arrays have been read, instead of staying open for the
    rest of the session.
    """
    with np.load(fold_path + 'train_val_test_drug_drug_samples.npz') as archive:
        return (archive['train_drug_drug_samples'],
                archive['val_drug_drug_samples'],
                archive['test_drug_drug_samples'])


fold_path_1 = prefix + 'fold1/'
fold_path_2 = prefix + 'fold2/'
fold_path_3 = prefix + 'fold3/'
fold_path_4 = prefix + 'fold4/'
fold_path_5 = prefix + 'fold5/'

# the per-fold names are kept because the following cell looks them up by name
train_drug_drug_samples_fold1, val_drug_drug_samples_fold1, test_drug_drug_samples_fold1 = _load_fold_samples(fold_path_1)
train_drug_drug_samples_fold2, val_drug_drug_samples_fold2, test_drug_drug_samples_fold2 = _load_fold_samples(fold_path_2)
train_drug_drug_samples_fold3, val_drug_drug_samples_fold3, test_drug_drug_samples_fold3 = _load_fold_samples(fold_path_3)
train_drug_drug_samples_fold4, val_drug_drug_samples_fold4, test_drug_drug_samples_fold4 = _load_fold_samples(fold_path_4)
train_drug_drug_samples_fold5, val_drug_drug_samples_fold5, test_drug_drug_samples_fold5 = _load_fold_samples(fold_path_5)

# the number of drug pair varieties in training set of every fold is the same: 420
train_pair_sets = [
    set(map(tuple, fold_train_samples[:, :2]))
    for fold_train_samples in (train_drug_drug_samples_fold1,
                               train_drug_drug_samples_fold2,
                               train_drug_drug_samples_fold3,
                               train_drug_drug_samples_fold4,
                               train_drug_drug_samples_fold5)
]
fold1_set, fold2_set, fold3_set, fold4_set, fold5_set = train_pair_sets

# difference_list holds, row by row, the number of training pairs of fold i that
# are missing from fold j; a '\n' entry terminates every row
difference_list = []
for pairs_i in train_pair_sets:
    for pairs_j in train_pair_sets:
        difference_list.append(len(pairs_i - pairs_j))
    difference_list.append('\n')

# in the recorded output two folds differ in 148-180 of their 420 training
# drug pair varieties (roughly 35-43%)
print(difference_list)

[0, 160, 164, 170, 168, '\n', 160, 0, 180, 170, 156, '\n', 164, 180, 0, 148, 158, '\n', 170, 170, 148, 0, 162, '\n', 168, 156, 158, 162, 0, '\n']


In [None]:
fold_name = 'fold1' # investigate the drug distribution difference among different sets


def _drugs_in_split(split_name):
    """Return the set of drug ids found in either id column of '<split_name>_drug_drug_samples_<fold_name>'."""
    split_samples = globals()['{}_drug_drug_samples_{}'.format(split_name, fold_name)]
    return set(split_samples[:, 0]) | set(split_samples[:, 1])


train_drugset = _drugs_in_split('train')
val_drugset = _drugs_in_split('val')
test_drugset = _drugs_in_split('test')

# each line prints how many drugs of one set are missing from the other two sets
print(len(train_drugset-val_drugset), len(train_drugset-test_drugset))
print(len(val_drugset-train_drugset), len(val_drugset-test_drugset))
print(len(test_drugset-train_drugset), len(test_drugset-val_drugset))
# last line: test has, train does not have; test has, val does not have

# author's note: there are about 10-20 drugs that are in the test set and are not in the training and validation sets
