In [1]:
import pandas as pd
import numpy as np
import random as rd

from tqdm import tqdm
from joblib import Parallel, delayed

In [2]:
# Create PubChem_id list
pubchem_id = pd.read_csv('../DrugCell/data_rcellminer/pubchem_id_by_nsc.csv', index_col=0).dropna()
pubchem_id

Unnamed: 0,PUBCHEM_ID
17,219123.0
295,4775.0
353,24180741.0
384,54599265.0
534,16685698.0
...,...
760087,60780.0
760091,9941647.0
760423,9955951.0
761431,42611257.0


In [3]:
#  Get smiles from PubChem ID  https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi
smiles = pd.read_table('../DrugCell/data_rcellminer/smiles_from_PubchemID.txt', header=None)
smiles.index = pubchem_id.index
smiles = smiles.drop(0, axis=1)
smiles.columns = ['smiles']
smiles = smiles.to_dict()['smiles']

In [4]:
nci60Act = pd.read_csv('../data/nci60Act_ccle.csv', index_col=0)
cell2ind = list(pd.read_table('../DrugCell/data/cell2ind.txt', header=None)[1])
nci60Act = nci60Act[list(set(cell2ind) & set(nci60Act.columns))]

In [5]:
base = pd.DataFrame()
for i in nci60Act.columns:
    tmp = nci60Act[i].reset_index().dropna()
    tmp.columns = ['nsc', 'drug_response']
    tmp['cell_line'] = [i]*len(tmp)
    base = pd.concat([
        base,
        tmp
    ])

In [6]:
class_nsc = pd.read_csv(
    '../DrugCell/data_rcellminer/class_by_nsc.csv',
)

In [7]:
class_nsc['MECHANISM'].value_counts()

Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Methylation       13
Ho                13
PSM               13
BRD               11
Acetalax           4
Name: MECHANISM, dtype: int64

In [8]:
nsc_list = set(class_nsc['NSC'])
class_nsc = {class_nsc['NSC'][i] : class_nsc['MECHANISM'][i] for i in class_nsc.index}

In [9]:
base['label'] = [class_nsc[i] if i in nsc_list else None for i in base['nsc']]
base = base[base['label'] != 'Other'] \
    .reset_index(drop=True) \
    .drop('label', axis=1)
base['smiles'] = [smiles[i] if i in smiles.keys() else None for i in base.nsc]
base = base.dropna()
base = base.reset_index(drop=True)
base = base[['cell_line', 'smiles', 'drug_response']]

In [10]:
base

Unnamed: 0,cell_line,smiles,drug_response
0,ACHN_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.499337
1,ACHN_KIDNEY,C1(=NC(=NN1)N)N,2.032494
2,ACHN_KIDNEY,CC1C(C(C(C(O1)O[C@H]2CC[C@@]3([C@H]4CC[C@@]5([...,0.646581
3,ACHN_KIDNEY,C1CN1C2=NC(=NC(=N2)N3CC3)N4CC4,1.548396
4,ACHN_KIDNEY,CCC(=O)O[C@H]1CC[C@@H]2[C@@]1(CC[C@H]3[C@H]2CC...,-0.200601
...,...,...,...
16377,OVCAR4_OVARY,C[C@@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2CC[C@]3(...,0.030270
16378,OVCAR4_OVARY,CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@](C3[...,-2.319736
16379,OVCAR4_OVARY,C1=NC2=C(N=C(N=C2N1C3[C@H]([C@@H]([C@H](O3)CO)...,-1.794858
16380,OVCAR4_OVARY,CCC1=CC2C[C@@](C3=C(CN(C2)C1)C4=CC=CC=C4N3)(C5...,0.221991


In [11]:
len(set(base['cell_line']))

55

In [12]:
len(set(base['smiles']))

309

In [13]:
pd.DataFrame(base['cell_line'].value_counts())

Unnamed: 0,cell_line
SF268_CENTRAL_NERVOUS_SYSTEM,309
NCIH460_LUNG,309
OVCAR8_OVARY,308
A549_LUNG,308
SF295_CENTRAL_NERVOUS_SYSTEM,308
HCT116_LARGE_INTESTINE,308
HCT15_LARGE_INTESTINE,307
KM12_LARGE_INTESTINE,306
NCIH23_LUNG,306
SW620_LARGE_INTESTINE,305


In [14]:
pd.DataFrame(base['smiles'].value_counts())

Unnamed: 0,smiles
CC(C)[NH-].CC(C)[NH-].O.O.Cl[Pt+2]Cl,55
CC1=C(C(=O)C2=C(C1=O)N3CC4[C@H]([C@]3([C@H]2COC(=O)N)OC)N4)N,55
CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)NCCO)OC)N4)N,55
CC[C@@]1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)OC(=O)CN(CC)CC.Cl,55
C[C@]12[C@H]([C@H](C[C@H](O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)N(C)C(=O)C9=CC=CC=C9)OC,55
...,...
CC(=O)OC1CC2(C(CCC3(C2(CCC3C4=COC(=O)C=C4)O)C)C5(C1=CC(C6C5O6)O)C)O,43
CC(=O)O[C@@H]1C[C@]2([C@@H](CCC2=O)C3=C1[C@]4([C@H](OC(=O)C5=COC(=C54)C3=O)COC)C)C,42
CN(CC1=CC=CC=C1)CC(=C)C2=CC=C(C=C2)C(=O)NC3=CC=CC=C3N,42
C1CCC2(CC1)C(=O)N(C(=O)N2)CCN(CCCl)CCCl,40


In [15]:
indexes = list(base.index)
rd.Random(42).shuffle(indexes)

In [16]:
test = indexes[round(len(indexes)*0.8):]
tmp = indexes[:round(len(indexes)*0.8)]
val = tmp[:len(test)]
train = tmp[len(test):]

In [17]:
train = base.iloc[train]
test = base.iloc[test]
val = base.iloc[val]

In [18]:
print('train: ' + str(train.shape))
print('val: ' + str(val.shape))
print('test: ' + str(test.shape))

train: (9830, 3)
val: (3276, 3)
test: (3276, 3)


In [19]:
df = pd.merge(
    pd.merge(
        pd.DataFrame(train['cell_line'].value_counts()), 
        pd.DataFrame(val['cell_line'].value_counts()), 
        left_index=True, right_index=True
    ), pd.DataFrame(test['cell_line'].value_counts()), 
    left_index=True, right_index=True
)
df.columns = ['train', 'val', 'test']
df.loc['total'] = np.sum(df, axis=0) 

In [20]:
df

Unnamed: 0,train,val,test
HT29_LARGE_INTESTINE,198,57,46
ACHN_KIDNEY,195,65,42
U251MG_CENTRAL_NERVOUS_SYSTEM,194,52,58
SKMEL5_SKIN,193,58,54
SN12C_KIDNEY,190,63,52
A549_LUNG,189,59,60
HOP62_LUNG,188,52,62
SF295_CENTRAL_NERVOUS_SYSTEM,187,60,61
SKMEL28_SKIN,187,64,48
UACC62_SKIN,186,43,70


In [21]:
test.to_csv(
    '../DrugCell/data_rcellminer/test_rcell_wo_other.txt', sep='\t', 
    header=None, index=None
)

val.to_csv(
    '../DrugCell/data_rcellminer/val_rcell_wo_other.txt', sep='\t', 
    header=None, index=None
)

train.to_csv(
    '../DrugCell/data_rcellminer/train_rcell_wo_other.txt', sep='\t', 
    header=None, index=None
)