In [1]:
from torch_geometric.data import Data, HeteroData
import pandas as pd


In [2]:
# Load the four DrugCombDB tables used throughout this notebook.
# All paths are relative to the notebook's working directory.

# cell-protein links (columns used later: 'cell', 'protein')
c_p = pd.read_csv(filepath_or_buffer="../../data/DrugCombDB/cell_protein.csv")
# drug-protein links (columns used later: 'drug', 'protein')
d_p = pd.read_csv(filepath_or_buffer="../../data/DrugCombDB/drug_protein.csv")
# protein-protein network (columns used later: 'protein_a', 'protein_b')
p_p = pd.read_excel(io="../../data/DrugCombDB/protein-protein_network.xlsx")
# drug combination records (columns used later: 'drug1_db', 'drug2_db', 'cell')
comb = pd.read_csv(filepath_or_buffer="../../data/DrugCombDB/drug_combinations.csv")


In [5]:
# Distinct entity names appearing in each table.

# Drugs (either position of a pair) and cell lines in the combination records.
comb_d = set(comb['drug1_db']) | set(comb['drug2_db'])
comb_c = set(comb['cell'])

# Cell lines / proteins of the cell-protein table.
c_p_c, c_p_p = set(c_p['cell']), set(c_p['protein'])

# Drugs / proteins of the drug-protein table.
d_p_d, d_p_p = set(d_p['drug']), set(d_p['protein'])

# Every protein that occurs at either end of a protein-protein edge.
p_p_p = set(p_p['protein_a']) | set(p_p['protein_b'])

In [6]:
# Node counts per type: cells and drugs are counted from their *-protein
# tables, proteins from the protein-protein network.
num_cell, num_drug, num_protein = len(c_p_c), len(d_p_d), len(p_p_p)

In [8]:
# Give every cell line, drug and protein a contiguous integer id (0..n-1).
#
# Ids follow the iteration order of the underlying set. `enumerate` walks the
# set once; rebuilding `list(the_set)` for every index (as a
# `list(s)[idx] for idx in range(n)` comprehension does) yields the very same
# mapping but costs O(n^2).
#
# NOTE: set iteration order is an implementation detail and is not guaranteed
# to be the same in another interpreter session, so these maps must be saved
# together with anything built from the ids.
cell2id = {name: idx for idx, name in enumerate(c_p_c)}
drug2id = {name: idx for idx, name in enumerate(d_p_d)}
protein2id = {name: idx for idx, name in enumerate(p_p_p)}

In [9]:
# Translate each relation table into a pair of parallel id lists
# [source ids, target ids]. A name missing from its id map raises KeyError.

# c-p: cell -> protein
cells = c_p['cell'].tolist()
proteins = c_p['protein'].tolist()
cell_ids = list(map(cell2id.__getitem__, cells))
protein_ids = list(map(protein2id.__getitem__, proteins))
c_p_edge = [cell_ids, protein_ids]

# d-p: drug -> protein (reuses the `proteins` / `protein_ids` names)
drugs = d_p['drug'].tolist()
proteins = d_p['protein'].tolist()
drug_ids = list(map(drug2id.__getitem__, drugs))
protein_ids = list(map(protein2id.__getitem__, proteins))
d_p_edge = [drug_ids, protein_ids]

# p-p: protein -> protein
proteins1 = p_p['protein_a'].tolist()
proteins2 = p_p['protein_b'].tolist()
protein_ids1 = list(map(protein2id.__getitem__, proteins1))
protein_ids2 = list(map(protein2id.__getitem__, proteins2))
p_p_edge = [protein_ids1, protein_ids2]



In [10]:
## d-d

from time import sleep
import pubchempy as pcp
from tqdm import tqdm

# Look up every drug name on PubChem and keep the raw result list per drug.
#
# A failed request is retried after a pause, up to MAX_ATTEMPTS tries in
# total. If the last try also fails, the loop stops with an error that names
# the drug; entries fetched so far remain in `drug_dict`.
MAX_ATTEMPTS = 3   # total tries per drug, including the first one
RETRY_DELAY_S = 5  # seconds to wait before trying again

# Iterating in drug2id order keeps drug_dict's key order aligned with the ids.
drugs = list(drug2id.keys())

drug_dict = {}
for drug_name in tqdm(drugs):
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            drug_c = pcp.get_compounds(drug_name, "name")
        except Exception as err:
            if attempt == MAX_ATTEMPTS:
                raise RuntimeError(
                    f"PubChem lookup failed for {drug_name!r} "
                    f"after {MAX_ATTEMPTS} attempts"
                ) from err
            sleep(RETRY_DELAY_S)
        else:
            drug_dict[drug_name] = drug_c
            break



In [11]:
import itertools
from rdkit import Chem, DataStructs
from tqdm import tqdm

# Fingerprint every drug from the SMILES of its first PubChem hit.
# Fail with the drug name when there is no hit or the SMILES cannot be
# parsed: silently skipping a drug would shift the rows/columns of the
# similarity matrix relative to the drug order.
new_dict = {}
for key, compounds in drug_dict.items():
    if not compounds:
        raise ValueError(f"No PubChem compound found for drug {key!r}")
    mol = Chem.MolFromSmiles(compounds[0].isomeric_smiles)
    if mol is None:  # MolFromSmiles returns None when parsing fails
        raise ValueError(f"RDKit could not parse the SMILES of drug {key!r}")
    new_dict[key] = Chem.RDKFingerprint(mol)

combs = itertools.combinations(new_dict.keys(), 2)

# Pairwise fingerprint similarity with a zero diagonal.
# Each unordered pair is scored once and mirrored, and the values are gathered
# in plain lists first, so the frame is built in one go with a float dtype
# instead of being filled cell by cell through `.loc`.
drug_names = list(new_dict)
fingerprints = [new_dict[name] for name in drug_names]
n_fp = len(drug_names)

scores = [[0.0] * n_fp for _ in range(n_fp)]
for i in range(n_fp):
    for j in range(i + 1, n_fp):
        similarity = DataStructs.FingerprintSimilarity(
            fingerprints[i],
            fingerprints[j]
        )
        scores[i][j] = similarity
        scores[j][i] = similarity

simi_matrix = pd.DataFrame(scores, index=drug_names, columns=drug_names)


100%|██████████| 764/764 [00:00<00:00, 19480.17it/s]
100%|██████████| 764/764 [00:00<00:00, 20334.73it/s]
100%|██████████| 764/764 [00:00<00:00, 19456.51it/s]
100%|██████████| 764/764 [00:00<00:00, 19255.19it/s]
100%|██████████| 764/764 [00:00<00:00, 19956.83it/s]
100%|██████████| 764/764 [00:00<00:00, 20174.70it/s]
100%|██████████| 764/764 [00:00<00:00, 19222.73it/s]
100%|██████████| 764/764 [00:00<00:00, 20344.28it/s]
100%|██████████| 764/764 [00:00<00:00, 20730.70it/s]
100%|██████████| 764/764 [00:00<00:00, 20596.52it/s]
100%|██████████| 764/764 [00:00<00:00, 20061.28it/s]
100%|██████████| 764/764 [00:00<00:00, 20025.42it/s]
100%|██████████| 764/764 [00:00<00:00, 20233.04it/s]
100%|██████████| 764/764 [00:00<00:00, 19636.66it/s]
100%|██████████| 764/764 [00:00<00:00, 20590.04it/s]
100%|██████████| 764/764 [00:00<00:00, 20015.04it/s]
100%|██████████| 764/764 [00:00<00:00, 19178.09it/s]
100%|██████████| 764/764 [00:00<00:00, 20073.97it/s]
100%|██████████| 764/764 [00:00<00:00, 19684.9

In [23]:
# Lazy iterator over every unordered pair of fingerprinted drug names.
combs = itertools.combinations(new_dict, 2)



In [49]:
import torch
import torch_geometric.transforms as T

# Node features are simply the integer ids 0..n-1 of each node type.
drug_x = list(range(num_drug))
protein_x = list(range(num_protein))
cell_x = list(range(num_cell))

# d-d: one edge per unordered pair of fingerprinted drugs, carrying the
# fingerprint similarity as edge attribute.
# The loop variables are deliberately not called `comb`, which is the name of
# the combination DataFrame loaded at the top of the notebook.
# The similarity is looked up by drug name rather than by using the drug ids
# as positions, so it does not depend on simi_matrix being in id order.
d1 = []
d2 = []
d_d_attr = []
combs = itertools.combinations(new_dict.keys(), 2)
for drug_a, drug_b in combs:
    d1.append(drug2id[drug_a])
    d2.append(drug2id[drug_b])
    d_d_attr.append(float(simi_matrix.at[drug_a, drug_b]))
d_d_edge = [d1, d2]

data = HeteroData()

data['drug'].x = torch.tensor(drug_x, dtype=torch.int)
data['protein'].x = torch.tensor(protein_x, dtype=torch.int)
data['cell'].x = torch.tensor(cell_x, dtype=torch.int)

data['drug', 'd-d', 'drug'].edge_index = torch.tensor(d_d_edge, dtype=torch.int64)
data['drug', 'd-p', 'protein'].edge_index = torch.tensor(d_p_edge, dtype=torch.int64)
data['protein', 'p-p', 'protein'].edge_index = torch.tensor(p_p_edge, dtype=torch.int64)
data['cell', 'c-p', 'protein'].edge_index = torch.tensor(c_p_edge, dtype=torch.int64)

data['drug', 'd-d', 'drug'].edge_attr = torch.tensor(d_d_attr, dtype=torch.float32)

# Apply PyG's ToUndirected transform to the assembled heterogeneous graph.
data = T.ToUndirected()(data)




In [28]:
# Inspect the `edge_attr` values stored on the graph; the recorded output
# below is a dict keyed by edge type.
data.collect('edge_attr')

{('drug',
  'd-d',
  'drug'): tensor([0.2430, 0.1960, 0.3008,  ..., 0.4374, 0.0110, 0.1401])}

In [50]:
import os
import pickle

# Persist the assembled graph. Create the output directory first so the cell
# also works when `processed/` does not exist yet.
os.makedirs("processed", exist_ok=True)
with open("processed/graph.pkl", 'wb') as tf:
    pickle.dump(data, tf)

In [33]:
import os
import pickle

# Persist the name -> id maps next to the graph; the ids stored in the graph
# are only meaningful together with these maps. Create the output directory
# first so the cell also works when `processed/` does not exist yet.
os.makedirs("processed", exist_ok=True)
with open("processed/dict.pkl", 'wb') as tf:
    pickle.dump(
        {"drug2id": drug2id, "protein2id": protein2id, "cell2id": cell2id},
        tf,
    )

In [45]:
# Read the saved graph back into `read`.
# pickle can execute arbitrary code while loading; only load files this
# notebook wrote itself.
with open("processed/graph.pkl", 'rb') as tf:
    read = pickle.loads(tf.read())

In [40]:
import pickle
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split

# Build the (drug1, drug2, cell, label) dataset and split it into
# train / valid / test subsets.
SPLIT_SEED = 42        # fixes the random split so it can be reproduced
VALID_FRACTION = 0.1   # share of samples held out for validation
TEST_FRACTION = 0.2    # share of samples held out for testing
SYNERGY_THRESHOLD = 0  # synergy strictly above this is labelled 1, else 0

# NOTE(review): this cell reads ./drug_combinations.csv while the first cell
# of the notebook reads ../../data/DrugCombDB/drug_combinations.csv; confirm
# that both paths refer to the same file.
comb_data = pd.read_csv("./drug_combinations.csv")
cells = list(comb_data['cell'])
drug1s = list(comb_data['drug1_db'])
drug2s = list(comb_data['drug2_db'])

synergys = list(comb_data['synergy'])
labels = torch.LongTensor(
    [1 if synergy > SYNERGY_THRESHOLD else 0 for synergy in synergys]
)

# Reuse the saved name -> id maps so that the sample ids match the saved graph.
# pickle can execute arbitrary code while loading; only load files this
# notebook wrote itself.
with open("./processed/dict.pkl", 'rb') as tf:
    dicts = pickle.load(tf)
    cell2id = dicts['cell2id']
    drug2id = dicts['drug2id']

# A cell line or drug that is absent from the maps raises KeyError here.
cellsid = torch.LongTensor([cell2id[cell] for cell in cells])
drugs1id = torch.LongTensor([drug2id[drug] for drug in drug1s])
drugs2id = torch.LongTensor([drug2id[drug] for drug in drug2s])

dataset = TensorDataset(drugs1id, drugs2id, cellsid, labels)

total = len(dataset)
valid_len = int(VALID_FRACTION * total)
test_len = int(TEST_FRACTION * total)
# The training subset takes whatever remains after validation and test.
valid_dataset, test_dataset, train_dataset = random_split(
    dataset,
    [valid_len, test_len, total - valid_len - test_len],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

with open("./processed/dataset.pkl", 'wb') as tf:
    pickle.dump(
        {"train": train_dataset, "valid": valid_dataset, "test": test_dataset},
        tf,
    )