In [1]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

# Read nci60byCell Line Table
 Indexes are NSC and columns are Cell Lines' name

In [2]:
nci60Act = pd.read_csv("../data/nci60Act_ccle.csv", index_col=0)
cell2ind = list(pd.read_table("../DrugCell/data/cell2ind.txt", header=None)[1])
nci60Act = nci60Act[list(set(cell2ind) & set(nci60Act.columns))]
nci60Act

Unnamed: 0,786O_KIDNEY,HT29_LARGE_INTESTINE,NCIH322_LUNG,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,T47D_BREAST,SF295_CENTRAL_NERVOUS_SYSTEM,RXF393_KIDNEY,NCIH522_LUNG,HS578T_BREAST,NCIH226_LUNG,...,HCC2998_LARGE_INTESTINE,HCT15_LARGE_INTESTINE,OVCAR8_OVARY,SF539_CENTRAL_NERVOUS_SYSTEM,RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SW620_LARGE_INTESTINE,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,IGROV1_OVARY,NCIH460_LUNG,CAKI1_KIDNEY
1,-0.519867,-0.918047,-0.802747,0.325208,1.934731,-0.719253,0.002290,1.914094,-0.815183,-1.164555,...,-0.490868,-0.082824,-0.413599,-0.380720,1.504554,1.294131,1.540716,0.149074,-0.700613,-0.269717
17,0.521200,-0.941890,-0.786423,0.621871,1.509397,0.424922,-0.239018,1.832493,-0.222024,0.100266,...,-0.941890,-0.630920,-0.572734,1.140166,1.698491,-1.029408,1.686372,1.353076,-0.941890,0.717851
89,0.368433,-0.487137,-1.429903,-0.222065,,-1.429903,-0.435620,5.026869,,-0.605885,...,-0.163225,0.236371,0.185498,-0.165433,1.191198,0.658260,0.549935,-0.181827,-0.183471,0.743888
185,-0.061612,-0.283713,0.332463,,,,0.918722,,,-3.502638,...,0.476776,-0.059621,0.095648,0.230402,-0.024337,0.683279,1.675353,-0.852569,,1.675353
295,4.822657,-0.264586,-0.264586,2.919066,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,...,-0.264586,-0.264586,3.362653,-0.264586,,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900911,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,1.594059,,-0.167151,-0.167151,...,-0.167151,-0.167151,-0.167151,-0.167151,6.930661,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151
900922,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,...,2.343934,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,,-0.169786
900964,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,...,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754
900974,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,,-0.132453,-0.132453,-0.132453,...,7.417381,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453


# Create train data table format

In [3]:
base = pd.DataFrame()
for i in nci60Act.columns:
    tmp = nci60Act[i].reset_index().dropna()
    tmp.columns = ['NSC', 'drug_response']
    tmp['cell_line'] = [i]*len(tmp)
    base = pd.concat([
        base,
        tmp
    ])
base = base.reset_index(drop=True)

In [4]:
len(set(base['NSC']))

24047

# Read class label

In [5]:
class_nsc = pd.read_csv('../DrugCell/data_rcellminer/class_by_nsc.csv')
class_nsc.sort_values('NSC')

Unnamed: 0,NSC,MECHANISM
112,1,Other
1200,17,Other
21579,89,Other
1355,185,Other
2118,295,Other
...,...,...
22699,900911,Other
22877,900922,Other
22701,900964,Other
21608,900974,Other


# Unique number of cell lines for each class
There are 274 cell lines of DNA class

In [6]:
class_nsc.value_counts('MECHANISM')

MECHANISM
Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Ho                13
Methylation       13
PSM               13
BRD               11
Acetalax           4
dtype: int64

# Merge label to main table on NSC

In [7]:
base_label = base.merge(class_nsc, on='NSC')
base_label

Unnamed: 0,NSC,drug_response,cell_line,MECHANISM
0,1,-0.519867,786O_KIDNEY,Other
1,1,-0.918047,HT29_LARGE_INTESTINE,Other
2,1,-0.802747,NCIH322_LUNG,Other
3,1,0.325208,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other
4,1,1.934731,T47D_BREAST,Other
...,...,...,...,...
1209388,701664,0.430721,RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other
1209389,701664,-3.869090,SW620_LARGE_INTESTINE,Other
1209390,701664,2.979306,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other
1209391,701664,-0.123051,NCIH460_LUNG,Other


# Merge SMILES to main table on NSC

In [8]:
smiles = pd.read_csv('../data/nsc_cid_smiles.csv')[['NSC', 'SMILES']].drop_duplicates()
base_smiles = base_label.merge(smiles, on='NSC').drop('NSC', axis=1)
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,0.521200,786O_KIDNEY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
1,-0.941890,HT29_LARGE_INTESTINE,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
2,-0.786423,NCIH322_LUNG,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
3,0.621871,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
4,1.509397,T47D_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
...,...,...,...,...
877939,0.430721,RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other,CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)C2=CSC(=NC3=CC...
877940,-3.869090,SW620_LARGE_INTESTINE,Other,CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)C2=CSC(=NC3=CC...
877941,2.979306,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other,CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)C2=CSC(=NC3=CC...
877942,-0.123051,NCIH460_LUNG,Other,CC(C)(C)C1=CC(=CC(=C1O)C(C)(C)C)C2=CSC(=NC3=CC...


In [9]:
base_smiles.value_counts('MECHANISM')

MECHANISM
Other        858874
DNA           11820
Kinase         1988
HDAC           1709
TUBB           1679
Ho              615
Apoptosis       610
HSP90           274
PSM             212
Acetalax        163
dtype: int64

In [10]:
set(base_smiles['MECHANISM'])

{'Acetalax',
 'Apoptosis',
 'DNA',
 'HDAC',
 'HSP90',
 'Ho',
 'Kinase',
 'Other',
 'PSM',
 'TUBB'}

# Select DNA 

In [11]:
df_dna = base_smiles[base_smiles['MECHANISM'] == 'DNA'].reset_index(drop=True)[['cell_line', 'SMILES', 'drug_response']]

# This has 11820 datapoints

In [12]:
df_dna.drop_duplicates()

Unnamed: 0,cell_line,SMILES,drug_response
0,786O_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.682664
1,HT29_LARGE_INTESTINE,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.616085
2,NCIH322_LUNG,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-0.356232
3,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.645528
4,T47D_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.156158
...,...,...,...
11815,SW620_LARGE_INTESTINE,COC1=NC(=NC2=C1N=CN2C3C(C(C(O3)CO)O)O)N,-0.174699
11816,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,COC1=NC(=NC2=C1N=CN2C3C(C(C(O3)CO)O)O)N,2.612786
11817,IGROV1_OVARY,COC1=NC(=NC2=C1N=CN2C3C(C(C(O3)CO)O)O)N,-0.174699
11818,NCIH460_LUNG,COC1=NC(=NC2=C1N=CN2C3C(C(C(O3)CO)O)O)N,-0.174699


# 55 cell lines and 214 drugs

In [13]:
len(set(df_dna['cell_line']))

55

In [14]:
len(set(df_dna['SMILES']))

214

In [15]:
pd.DataFrame(df_dna['cell_line'].value_counts())

Unnamed: 0,cell_line
U251MG_CENTRAL_NERVOUS_SYSTEM,222
KM12_LARGE_INTESTINE,222
A549_LUNG,222
UACC257_SKIN,222
UACC62_SKIN,222
SKMEL28_SKIN,222
EKVX_LUNG,222
OVCAR5_OVARY,222
OVCAR4_OVARY,222
OVCAR8_OVARY,222


In [16]:
pd.DataFrame(df_dna['SMILES'].value_counts())

Unnamed: 0,SMILES
C1C(C(OC1N2C=NC3=C2NC(=NC3=S)N)CO)O,110
C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)O,110
CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=C(C=CC(=C5CN(C)C)O)N=C4C3=C2)O.Cl,110
CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)C)O)N)O,110
COC1=C(C=CC(=C1)NS(=O)(=O)C)NC2=C3C=CC=CC3=NC4=CC=CC=C42,110
...,...
C1CN2C(=CC=N2)N1,43
C1CC(OC1CO)N2C=CC(=NC2=O)N,42
C1=CC=C(C(=C1)C(=O)NCCCC(C(=O)O)NC(=O)C2=CC=C(C=C2)NCC3=CN=C4C(=N3)C(=NC(=N4)N)N)C(=O)O,41
C1CCC2(CC1)C(=O)N(C(=O)N2)CCN(CCCl)CCCl,40


In [17]:
indexes = list(df_dna.index)
rd.Random(42).shuffle(indexes)

In [18]:
test = indexes[round(len(indexes)*0.8):]
tmp = indexes[:round(len(indexes)*0.8)]
val = tmp[:len(test)]
train = tmp[len(test):]

In [19]:
train = df_dna.iloc[train]
test = df_dna.iloc[test]
val = df_dna.iloc[val]

In [20]:
print('train: ' + str(train.shape))
print('val: ' + str(val.shape))
print('test: ' + str(test.shape))

train: (7092, 3)
val: (2364, 3)
test: (2364, 3)


In [21]:
df = pd.merge(
    pd.merge(
        pd.DataFrame(train['cell_line'].value_counts()), 
        pd.DataFrame(val['cell_line'].value_counts()), 
        left_index=True, right_index=True
    ), pd.DataFrame(test['cell_line'].value_counts()), 
    left_index=True, right_index=True
)
df.columns = ['train', 'val', 'test']
df.loc['total'] = np.sum(df, axis=0) 

In [22]:
df

Unnamed: 0,train,val,test
SKMEL5_SKIN,147,34,36
RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,144,35,39
LOXIMVI_SKIN,142,37,41
CAKI1_KIDNEY,141,40,40
MALME3M_SKIN,141,44,35
UACC257_SKIN,140,43,39
IGROV1_OVARY,137,38,44
HCT116_LARGE_INTESTINE,137,39,45
EKVX_LUNG,137,45,40
A549_LUNG,136,43,43


In [23]:
test.to_csv(
    '../DrugCell/data_rcellminer/test_DNA.txt', sep='\t', 
    header=None, index=None
)

val.to_csv(
    '../DrugCell/data_rcellminer/val_DNA.txt', sep='\t', 
    header=None, index=None
)

train.to_csv(
    '../DrugCell/data_rcellminer/train_DNA.txt', sep='\t', 
    header=None, index=None
)