In [1]:
import pandas as pd
import numpy as np
import random as rd

from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
# Load the PubChem ID table; the first CSV column becomes the index
# (presumably the NSC identifier, going by the file name -- confirm).
# Rows containing any missing value are discarded.
pubchem_raw = pd.read_csv(
    '../DrugCell/data_rcellminer/pubchem_id_by_nsc.csv',
    index_col=0,
)
pubchem_id = pubchem_raw.dropna()
pubchem_id

Unnamed: 0,PUBCHEM_ID
17,219123.0
295,4775.0
353,24180741.0
384,54599265.0
534,16685698.0
...,...
760087,60780.0
760091,9941647.0
760423,9955951.0
761431,42611257.0


In [3]:
# SMILES strings were exported from the PubChem ID exchange service:
# https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi
smiles_table = pd.read_table(
    '../DrugCell/data_rcellminer/smiles_from_PubchemID.txt',
    header=None,
)
# Rows are matched to `pubchem_id` purely by position: the file must list the
# same entries in the same order (a length mismatch raises here).
# NOTE(review): nothing checks that the order actually matches -- confirm.
smiles_table.index = pubchem_id.index
# Column 0 is discarded; the single remaining column holds the SMILES string.
smiles_column = smiles_table.drop(columns=0)
smiles_column.columns = ['smiles']
# Final product: plain dict mapping index label -> SMILES string.
smiles = smiles_column['smiles'].to_dict()

In [4]:
# Activity matrix: the first CSV column is used as the row index; the
# remaining columns are iterated as cell lines further down.
nci60_act_path = '../data/nci60Act_ccle.csv'
nci60Act = pd.read_csv(nci60_act_path, index_col=0)

In [5]:
# Reshape the wide activity matrix (one column per cell line) into a long
# table with one row per measured (nsc, cell_line) pair.
#
# The per-cell-line pieces are collected in a list and concatenated once.
# Growing `base` with pd.concat inside the loop re-copies everything gathered
# so far on every iteration, and concatenating onto an empty DataFrame is
# deprecated in recent pandas releases.
long_pieces = []
for cell_line in nci60Act.columns:
    # reset_index() turns the row index into a regular column; dropna()
    # removes the compounds that have no measurement for this cell line.
    responses = nci60Act[cell_line].reset_index().dropna()
    responses.columns = ['nsc', 'drug_response']
    responses['cell_line'] = cell_line
    long_pieces.append(responses)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# matrix has no columns (same result the original loop produced).
base = pd.concat(long_pieces) if long_pieces else pd.DataFrame()

In [6]:
# Class table; later cells read its 'NSC' and 'MECHANISM' columns.
class_nsc_path = '../DrugCell/data_rcellminer/class_by_nsc.csv'
class_nsc = pd.read_csv(class_nsc_path)

In [7]:
# Number of rows per MECHANISM label, most frequent first.
mechanism_counts = class_nsc['MECHANISM'].value_counts()
mechanism_counts

Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
PSM               13
Ho                13
Methylation       13
BRD               11
Acetalax           4
Name: MECHANISM, dtype: int64

In [8]:
# Set of every NSC that appears in the class table, for fast membership tests.
nsc_list = set(class_nsc['NSC'])
# Replace the DataFrame with a plain NSC -> MECHANISM dict. The two columns
# are paired row by row; if an NSC repeats, the last row wins.
# Note: this rebinds `class_nsc`, so the cell cannot be re-run without first
# reloading the CSV.
nsc_values = class_nsc['NSC'].to_numpy()
mechanism_values = class_nsc['MECHANISM'].to_numpy()
class_nsc = dict(zip(nsc_values, mechanism_values))

In [9]:
# Keep every row whose mechanism label is not 'Other'.
# NOTE(review): an NSC that is absent from `nsc_list` gets the label None,
# and None != 'Other' is True, so rows with no class entry are kept as well --
# confirm that this is intended.
not_other = np.asarray(
    [
        (class_nsc[nsc] if nsc in nsc_list else None) != 'Other'
        for nsc in base['nsc']
    ],
    dtype=bool,
)
kept_rows = base.loc[not_other].reset_index(drop=True)

# Attach the SMILES string for each NSC (None when there is no entry), then
# drop every row with a missing value in any column.
kept_rows['smiles'] = [smiles.get(nsc) for nsc in kept_rows['nsc']]
complete_rows = kept_rows.dropna().reset_index(drop=True)

# Final layout expected downstream: cell line, SMILES, response.
base = complete_rows[['cell_line', 'smiles', 'drug_response']]

In [10]:
# Preview the filtered long-format table (cell_line, smiles, drug_response).
base

Unnamed: 0,cell_line,smiles,drug_response
0,MCF7_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.703626
1,MCF7_BREAST,C1(=NC(=NN1)N)N,0.203035
2,MCF7_BREAST,CC1C(C(C(C(O1)O[C@H]2CC[C@@]3([C@H]4CC[C@@]5([...,-0.006210
3,MCF7_BREAST,CC(=O)OC1CC2(C(CCC3(C2(CCC3C4=COC(=O)C=C4)O)C)...,0.061604
4,MCF7_BREAST,C1CN1C2=NC(=NC(=N2)N3CC3)N4CC4,0.857356
...,...,...,...
16987,TK10_KIDNEY,O1[As]2O[As]3O[As]1O[As](O2)O3,-1.149281
16988,TK10_KIDNEY,C[C@@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2CC[C@]3(...,1.048326
16989,TK10_KIDNEY,CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@](C3[...,-1.222745
16990,TK10_KIDNEY,C1=NC2=C(N=C(N=C2N1C3[C@H]([C@@H]([C@H](O3)CO)...,-1.081749


In [11]:
# Number of distinct cell lines left in `base` (no missing values remain
# after the dropna() applied when `base` was filtered).
base['cell_line'].nunique(dropna=False)

57

In [12]:
# Number of distinct SMILES strings left in `base` (no missing values remain
# after the dropna() applied when `base` was filtered).
base['smiles'].nunique(dropna=False)

309

In [13]:
# Rows per cell line, most frequent first, shown as a one-column table.
base['cell_line'].value_counts().to_frame()

Unnamed: 0,cell_line
NCIH460_LUNG,309
SF268_CENTRAL_NERVOUS_SYSTEM,309
OVCAR8_OVARY,308
A549_LUNG,308
SF295_CENTRAL_NERVOUS_SYSTEM,308
HCT116_LARGE_INTESTINE,308
SNB19_CENTRAL_NERVOUS_SYSTEM,307
HCT15_LARGE_INTESTINE,307
KM12_LARGE_INTESTINE,306
NCIH23_LUNG,306


In [14]:
# Rows per SMILES string, most frequent first, shown as a one-column table.
base['smiles'].value_counts().to_frame()

Unnamed: 0,smiles
CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)N[C@@H](CCC(=O)O)C(=O)O,57
CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@]([C@H]3[C@@H]([C@@](C2(C)C)(C[C@@H]1OC(=O)[C@@H]([C@H](C5=CC=CC=C5)NC(=O)C6=CC=CC=C6)O)O)OC(=O)C7=CC=CO7)(CO4)OC(=O)C)O)C)OC(=O)C,57
CCOC1=CC2=C(C=C1)C3=C(C2=O)C4=C(C=C(C=C4)[N+](=O)[O-])C(=O)N3CCCN.Cl,57
CN(C)CCCN1C2=C(C3=C(C1=O)C=C(C=C3)[N+](=O)[O-])C(=O)C4=C2C=CC(=C4)OC.Cl,57
CCCCCCCCCCCCCCCCCCOP(=O)([O-])OC1CC[N+](CC1)(C)C,57
...,...
CC(=O)OC1CC2(C(CCC3(C2(CCC3C4=COC(=O)C=C4)O)C)C5(C1=CC(C6C5O6)O)C)O,45
CC(=O)O[C@@H]1C[C@]2([C@@H](CCC2=O)C3=C1[C@]4([C@H](OC(=O)C5=COC(=C54)C3=O)COC)C)C,44
CN(CC1=CC=CC=C1)CC(=C)C2=CC=C(C=C2)C(=O)NC3=CC=CC=C3N,44
C1CCC2(CC1)C(=O)N(C(=O)N2)CCN(CCCl)CCCl,42


In [15]:
# Shuffle the row labels of `base` with a dedicated, fixed-seed generator so
# the split below is reproducible and independent of the global random state.
shuffler = rd.Random(42)
indexes = base.index.tolist()
shuffler.shuffle(indexes)

In [16]:
# Split the shuffled labels: the last ~20% form the test set; from the first
# ~80%, a block the same size as the test set becomes validation and whatever
# is left is used for training.
cut = round(len(indexes)*0.8)
tmp, test = indexes[:cut], indexes[cut:]
n_holdout = len(test)
val, train = tmp[:n_holdout], tmp[n_holdout:]

In [17]:
# Swap each list of row positions for the matching rows of `base`.
# (`base` has a fresh 0..n-1 index, so its labels double as positions.)
# Note: the names are rebound from lists to DataFrames, so this cell cannot
# be re-run without re-running the split cell first.
train, test, val = (base.iloc[rows] for rows in (train, test, val))

In [18]:
# Report the (rows, columns) shape of each split.
print(f'train: {train.shape}')
print(f'val: {val.shape}')
print(f'test: {test.shape}')

train: (10196, 3)
val: (3398, 3)
test: (3398, 3)


In [19]:
# Per-cell-line row counts for each split, one column per split.
#
# The counts are aligned with an outer join and missing entries are filled
# with 0. Inner merges would silently drop any cell line that is absent from
# one of the splits, which hides the gap and makes the 'total' row
# under-count the split sizes.
split_counts = pd.concat(
    [
        train['cell_line'].value_counts(),
        val['cell_line'].value_counts(),
        test['cell_line'].value_counts(),
    ],
    axis=1,
    keys=['train', 'val', 'test'],
    sort=False,  # keep the train ordering (most frequent first)
)
df = split_counts.fillna(0).astype('int64')
# Append a summary row holding the size of each split.
df.loc['total'] = df.sum(axis=0)

In [20]:
# Show the per-cell-line counts for train / val / test (last row: totals).
df

Unnamed: 0,train,val,test
RXF393_KIDNEY,197,50,49
COLO205_LARGE_INTESTINE,196,59,49
SF295_CENTRAL_NERVOUS_SYSTEM,194,55,59
OVCAR5_OVARY,193,52,59
HCT15_LARGE_INTESTINE,192,70,45
A549_LUNG,192,56,60
DU145_PROSTATE,191,52,47
SKMEL28_SKIN,191,48,60
OVCAR8_OVARY,190,61,57
NIHOVCAR3_OVARY,189,59,55


In [21]:
# Write each split as a tab-separated file with neither header nor index
# column, in the order test, val, train.
split_outputs = [
    (test, '../DrugCell/data_rcellminer/test_rcell_wo_other.txt'),
    (val, '../DrugCell/data_rcellminer/val_rcell_wo_other.txt'),
    (train, '../DrugCell/data_rcellminer/train_rcell_wo_other.txt'),
]
for split_frame, out_path in split_outputs:
    split_frame.to_csv(out_path, sep='\t', header=None, index=None)