In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, rdFingerprintGenerator

In [16]:
# Load the PubChem ID table (presumably keyed by NSC number, judging by the
# file name -- confirm) and discard every row that has a missing value.
pubchem_id = (
    pd.read_csv('../DrugCell/data/pubchem_id_by_nsc.csv', index_col=0)
    .dropna()
)

# Export the IDs as a bare single-column list of integers (no header, no
# index) for later lookup of the matching SMILES strings.
(
    pubchem_id['PUBCHEM_ID']
    .astype(int)
    .to_csv('../DrugCell/data/pubchem_id.csv', header=None, index=None)
)

pubchem_id.shape

(16522, 1)

In [17]:
# SMILES strings for the exported PubChem IDs, obtained from the PubChem
# Identifier Exchange service:
#   https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi
# The first column of the download (column 0) is discarded; column 1 is kept.
tmp = pd.read_table(
    '../DrugCell/data/SMILES_from_PubchemID.txt',
    header=None,
).drop(columns=0)

# Rows are relabelled positionally with pubchem_id's index.
# NOTE(review): this assumes the download lists the IDs in exactly the order
# they were exported in -- only the row count is enforced here; confirm.
tmp.index = pubchem_id.index

tmp.shape

(16522, 1)

In [4]:
# Load the activity matrix (rows: compounds, columns: cell lines) and keep
# only the compounds that have a SMILES entry in `tmp`, in `tmp`'s row order.
nci60Act = pd.read_csv('../DrugCell/data/nci60Act.csv', index_col=0)

# Select the rows directly with .loc instead of transposing the whole matrix
# twice (`.T[labels].T`), which copies all the data and can upcast dtypes.
# A label missing from the matrix still raises KeyError, as before.
nci60Act = nci60Act.loc[tmp.index]

# Relabel the rows with their SMILES strings (column 1 of `tmp`). This is a
# positional assignment; it is valid because the rows were just selected in
# `tmp`'s order, and pandas raises if the lengths differ.
nci60Act.index = list(tmp[1])

In [5]:
# Flatten the activity matrix into [column label, row label, value] records,
# i.e. [cell line, drug SMILES, activity], skipping missing (NaN) entries.
# Records are emitted row by row, columns left to right.
#
# The matrix is walked positionally over its underlying array rather than via
# chained label lookups (`nci60Act[col][row]`) for every cell: that avoids two
# pandas lookups per entry and also keeps working when two rows carry the same
# SMILES label (a chained lookup would then return a Series and `not
# np.isnan(...)` would raise).
t = [
    [cell_line, drug, activity]
    for drug, activity_row in zip(nci60Act.index, nci60Act.values)
    for cell_line, activity in zip(nci60Act.columns, activity_row)
    if not np.isnan(activity)
]

In [18]:
# Turn the record list into a three-column frame:
#   column 0 = cell line, column 1 = drug SMILES, column 2 = activity value.
t = pd.DataFrame(t)

# Cell-line names must not contain spaces: replace each one with a hyphen.
t[0] = t[0].str.replace(' ', '-')

# Order the records by cell-line name and renumber the rows from zero.
t = t.sort_values(0, ascending=True)
t = t.reset_index(drop=True)

# Persist as a tab-separated file without header or index column.
t.to_csv('../data/train.csv', header=None, index=None, sep='\t')

t.shape

(901781, 3)

In [19]:
# Give every distinct cell-line name in the training table a 0-based ID equal
# to its rank in sorted order, then store the (ID, name) pairs tab-separated
# without header or index column.
cell_line_names = sorted(set(t[0]))
cell2id = pd.DataFrame(cell_line_names).reset_index()
cell2id.to_csv('../data/cell2id.csv', header=None, index=None, sep='\t')

In [20]:
# Display the cell-line lookup table (columns: numeric ID, cell-line name).
cell2id

Unnamed: 0,index,0
0,0,BR:BT-549
1,1,BR:HS-578T
2,2,BR:MCF7
3,3,BR:MDA-MB-231
4,4,BR:T-47D
5,5,CNS:SF-268
6,6,CNS:SF-295
7,7,CNS:SF-539
8,8,CNS:SNB-19
9,9,CNS:SNB-75


In [8]:
# Give every distinct drug SMILES in the training table a 0-based ID equal to
# its rank in sorted order, then store the (ID, SMILES) pairs tab-separated
# without header or index column.
drug_smiles = sorted(set(t[1]))
drug2id = pd.DataFrame(drug_smiles).reset_index()
drug2id.to_csv('../data/drug2id.csv', header=None, index=None, sep='\t')

In [21]:
# Display the drug lookup table (columns: numeric ID, SMILES string).
drug2id

Unnamed: 0,index,0
0,0,B(C1=CC=CC=C1)(C2=CC=CC=C2)OC(C3=CC=CC=C3)C(C)...
1,1,B(C1=CC=CC=C1)(C2=CC=CC=C2)OC3=C(NC=CC3=O)C
2,2,B(C1=CC=CC=C1C#CC2=CC=CC=C2B(O)O)(O)O
3,3,B(C1=CC=CC=C1C=O)(O)O
4,4,B(CCCCCCCCCCCCC)(O)O
...,...,...
16517,16517,[CH]1[CH][CH][CH][CH]1.[CH]1[CH][CH][CH][CH]1....
16518,16518,[CH]1[CH][CH][CH][CH]1.[CH]1[CH][CH][C]([CH]1)...
16519,16519,[H+].C1=CC(=CC=C1N)Cl.C1=CC(=CC=C1N)Cl.C1=CC(=...
16520,16520,[H+].CCN(CC)CC[N+]1=CC2=C(C3=C(C(=C2C=C1)C)NC4...


In [9]:
# Encode every drug in drug2id as a 2048-bit Morgan fingerprint (radius 2,
# chirality-aware). Row k of `mfp` belongs to the SMILES in row k of drug2id.
#
# AllChem.GetMorganFingerprintAsBitVect is deprecated in RDKit, so the
# fingerprint-generator API from the already-imported rdFingerprintGenerator
# module is used instead, with the same radius / size / chirality settings.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(
    radius=2,
    fpSize=2048,
    includeChirality=True,
)

fingerprints = []
for smiles in drug2id[0]:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for input it cannot parse; fail with the
        # offending SMILES instead of an opaque error inside the fingerprinter.
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
    # One 0/1 NumPy vector of length fpSize per molecule.
    fingerprints.append(morgan_generator.GetFingerprintAsNumPy(mol))

# 2-D array: one row per drug, one column per fingerprint bit. The entries are
# 0/1, so the CSV written from `mfp` keeps the same 0/1 content.
mfp = np.array(fingerprints)



In [10]:
# Store the fingerprint matrix as comma-separated values, one drug per line,
# without header or index column (rows follow the order of drug2id).
fingerprint_table = pd.DataFrame(mfp)
fingerprint_table.to_csv(
    '../data/drug2fingerprint.csv',
    header=None,
    index=None,
    sep=',',
)

In [11]:
# Read the gene mutation table, reduce every entry to its sign (np.sign maps
# positive values to 1, zeros to 0 and negative values to -1), and transpose
# so that rows are cell lines and columns are genes.
gene_mutation_table = pd.read_csv('../DrugCell/data/nci60GeneMut.csv', index_col=0)
cell2mut = np.sign(gene_mutation_table).T

cell2mut.head()

Unnamed: 0,C1orf222,SAMD11,KLHL17,PLEKHN1,ISG15,AGRN,TTLL10,TNFRSF4,UBE2J2,ACAP3,...,ATP6AP1,GDI1,FAM50A,PLXNA3,GAB3,F8,MTCP1,CLIC2,TMLHE,DDX3Y
BR:MCF7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BR:MDA-MB-231,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BR:HS 578T,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BR:BT-549,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BR:T-47D,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Write the genotype matrix passed to train_drugcell.py via -genotype:
# one line per cell line, one column per gene, no header and no index column.
#
# Because the file carries no row labels, line order is the only link to
# cell2id.csv. `cell2mut` keeps the cell-line order of nci60GeneMut.csv and
# names with spaces (e.g. 'BR:HS 578T'), whereas cell2id is sorted and uses
# hyphens (e.g. 'BR:HS-578T'). Writing `cell2mut` unchanged would therefore
# pair cell IDs with other cell lines' mutation profiles, so the rows are
# renamed and reordered to follow cell2id first.
cell2mut_by_id = cell2mut.copy()
cell2mut_by_id.index = cell2mut_by_id.index.str.replace(' ', '-')

# .loc raises KeyError if a cell line listed in cell2id has no mutation
# profile, which is preferable to silently writing a misaligned matrix.
cell2mut_by_id = cell2mut_by_id.loc[list(cell2id[0])]

# Guard against duplicated cell-line names inflating the row count.
assert len(cell2mut_by_id) == len(cell2id), 'cell2mutation rows must map 1:1 to cell2id'

cell2mut_by_id.to_csv(
    '../data/cell2mutation.csv',
    header=None, index=None
)

In [13]:
# Number the gene columns of cell2mut from zero, in column order, and store
# the (index, gene) pairs tab-separated without header or index column.
gene_index_table = pd.DataFrame(cell2mut.columns).reset_index()
gene_index_table.to_csv('../data/gene2ind.csv', header=None, index=None, sep='\t')

In [15]:
# Launch DrugCell training as a shell command, feeding it the files written
# by the cells above (ID tables, genotype matrix, fingerprints, train.csv).
# NOTE(review): `-cuda 0` selects a CUDA device; the recorded run below aborted
#   with "AssertionError: Torch not compiled with CUDA enabled" at
#   `train_label.cuda(CUDA_ID)`, so this cell needs a CUDA-enabled PyTorch build.
# NOTE(review): `-train` and `-test` point at the same file, so whatever the
#   script reports for the test set is measured on the training data -- confirm
#   this is intended.
# NOTE(review): the `-model ./MODEL` directory is not created by this cell (the
#   mkdir below is commented out); presumably it must already exist -- confirm.
# !mkdir MODEL
!python -u ../DrugCell/code/train_drugcell.py -onto ../DrugCell/data/drugcell_ont.txt \
                            -gene2id ../data/gene2ind.csv \
                            -cell2id ../data/cell2id.csv \
                            -drug2id ../data/drug2id.csv \
                            -genotype ../data/cell2mutation.csv \
                            -fingerprint ../data/drug2fingerprint.csv \
                            -train ../data/train.csv \
                            -test ../data/train.csv \
                            -model ./MODEL \
                            -genotype_hiddens 6 \
                            -drug_hiddens "100,50,6" \
                            -final_hiddens 6 \
                            -epoch 100 \
                            -batchsize 5000 \
                            -cuda 0

Total number of cell lines = 60
Total number of drugs = 16522
[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
There are 1752 genes
There are 1 roots: GO:0008150
There are 2086 terms
There are 1 connected componenets
term	GO:0007005	term_size	37	num_hiddens	6
term	GO:0007006	term_size	17	num_hiddens	6
term	GO:0008637	term_size	11	num_hiddens	6
term	GO:0006281	term_size	58	num_hiddens	6
term	GO:0006284	term_size	10	num_hiddens	6
term	GO:0006283	term_size	10	num_hiddens	6
term	GO:0019985	term_size	6	num_hiddens	6
term	GO:0000724	term_size	9	num_hiddens	6
term	GO:0006303	term_size	9	num_hiddens	6
term	GO:0051052	term_size	75	num_hiddens	6
term	GO:0044030	term_size	11	num_hiddens	6
term	GO:0051054	term_size	51	num_hiddens	6
term	GO:0010569	term_size	6	num_hiddens	6
term	GO:0045830	term_size	7	num_hiddens	6
term	GO:0045739	term_size	11	num_hiddens	6
term	GO:2000279	term_size	8	num_hidd

Traceback (most recent call last):
  File "/Users/yoshitakainoue/code/graph_neural_network_drug_response/notebook/../DrugCell/code/train_drugcell.py", line 211, in <module>
    train_model(
  File "/Users/yoshitakainoue/code/graph_neural_network_drug_response/notebook/../DrugCell/code/train_drugcell.py", line 54, in train_model
    train_label_gpu = torch.autograd.Variable(train_label.cuda(CUDA_ID))
  File "/Users/yoshitakainoue/code/graph_neural_network_drug_response/.venv/lib/python3.9/site-packages/torch/cuda/__init__.py", line 210, in _lazy_init
    raise AssertionError("Torch not compiled with CUDA enabled")
AssertionError: Torch not compiled with CUDA enabled
