In [50]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

# Read the NCI-60 activity table (by cell line)
Rows are indexed by NSC and columns are cell-line names.

In [51]:
# Load the NCI-60 activity matrix; the first CSV column becomes the row index.
nci60Act = pd.read_csv("../data/nci60Act_ccle.csv", index_col=0)

# cell2ind.txt has no header row; its column 1 is used as the list of names to
# keep (it is matched against the activity table's column labels below).
cell2ind = list(pd.read_table("../DrugCell/data/cell2ind.txt", header=None)[1])

# Keep only the columns that also appear in cell2ind.txt, in the table's own
# column order. Going through `list(set(...) & set(...))` yields an order that
# changes between interpreter runs (string hashing is randomised per process);
# that order determines the row order of every table built from this one, so a
# fixed-seed positional shuffle further down would select different rows on
# every kernel restart.
known_cell_lines = set(cell2ind)
nci60Act = nci60Act[[name for name in nci60Act.columns if name in known_cell_lines]]
nci60Act

Unnamed: 0,DU145_PROSTATE,SKMEL5_SKIN,SF539_CENTRAL_NERVOUS_SYSTEM,TK10_KIDNEY,A498_KIDNEY,CCRFCEM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SF268_CENTRAL_NERVOUS_SYSTEM,OVCAR4_OVARY,RPMI8226_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,U251MG_CENTRAL_NERVOUS_SYSTEM,...,NCIH23_LUNG,M14_SKIN,SKOV3_OVARY,HOP92_LUNG,PC3_PROSTATE,T47D_BREAST,SF295_CENTRAL_NERVOUS_SYSTEM,ACHN_KIDNEY,UACC257_SKIN,HCT15_LARGE_INTESTINE
1,-0.434885,-0.108730,-0.380720,-0.379420,-1.648059,1.642456,-0.357577,2.081990,1.504554,-0.266183,...,-0.600872,-0.508668,-0.868240,2.110059,-0.403983,1.934731,-0.719253,1.657273,-0.407149,-0.082824
17,-0.941890,-0.941890,1.140166,-0.761559,-0.329639,2.270240,0.335572,0.891796,1.698491,-0.843055,...,-0.925414,-0.941890,-0.492953,-0.078754,1.302516,1.509397,0.424922,-0.941890,-0.061915,-0.630920
89,,-0.118614,-0.165433,-0.101929,-0.655707,0.061246,-0.184194,0.012308,1.191198,-0.093513,...,-0.312738,0.430283,-0.126557,-0.068839,,,-1.429903,-0.144006,-0.070365,0.236371
185,,1.375379,0.230402,-0.904724,,1.675353,0.539343,0.487376,-0.024337,-0.171509,...,0.170420,-0.533247,-2.151708,-0.038789,,,,,-0.807890,-0.059621
295,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,,-0.264586,...,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900911,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,6.930661,,...,-0.167151,-0.167151,-0.167151,,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151
900922,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,...,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786
900964,,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,...,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754
900974,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,...,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453


# Reshape into the long training-table format

In [52]:
# Reshape the wide activity matrix into long format: one row per
# (NSC, cell line) pair that has a measured response.
# The per-column pieces are collected in a list and concatenated once;
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration (quadratic in the number of columns).
long_parts = []
for cell_line in nci60Act.columns:
    # reset_index() turns the row index into a regular column; dropna() then
    # removes the rows with a missing value for this cell line.
    part = nci60Act[cell_line].reset_index().dropna()
    part.columns = ['NSC', 'drug_response']
    part['cell_line'] = cell_line
    long_parts.append(part)

if long_parts:
    base = pd.concat(long_parts, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty table.
    base = pd.DataFrame(columns=['NSC', 'drug_response', 'cell_line'])

In [53]:
# Number of distinct NSC identifiers in the long table.
base['NSC'].nunique()

24047

# Read class label

In [54]:
# Table with one MECHANISM label per NSC.
class_nsc = pd.read_csv('../DrugCell/data_rcellminer/class_by_nsc.csv')
# Preview ordered by NSC; the sorted copy is only displayed, `class_nsc`
# itself keeps the file's row order.
class_nsc.sort_values(by='NSC')

Unnamed: 0,NSC,MECHANISM
112,1,Other
1200,17,Other
21579,89,Other
1355,185,Other
2118,295,Other
...,...,...
22699,900911,Other
22877,900922,Other
22701,900964,Other
21608,900974,Other


# Number of unique drugs (NSC) for each class
There are 274 drugs in the DNA class

In [55]:
# Rows of class_nsc per MECHANISM label, most frequent first.
class_nsc.value_counts(subset='MECHANISM')

MECHANISM
Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Ho                13
Methylation       13
PSM               13
BRD               11
Acetalax           4
dtype: int64

# Merge label to main table on NSC

In [56]:
# Attach the MECHANISM label to every response row; the inner join drops
# rows whose NSC has no entry in class_nsc.
base_label = pd.merge(base, class_nsc, on='NSC', how='inner')
base_label

Unnamed: 0,NSC,drug_response,cell_line,MECHANISM
0,1,-0.434885,DU145_PROSTATE,Other
1,1,-0.108730,SKMEL5_SKIN,Other
2,1,-0.380720,SF539_CENTRAL_NERVOUS_SYSTEM,Other
3,1,-0.379420,TK10_KIDNEY,Other
4,1,-1.648059,A498_KIDNEY,Other
...,...,...,...,...
1209388,631672,-0.671981,HOP92_LUNG,Other
1209389,631672,-0.983003,SF295_CENTRAL_NERVOUS_SYSTEM,Other
1209390,631672,-0.272537,ACHN_KIDNEY,Other
1209391,631672,-1.324494,UACC257_SKIN,Other


# Merge SMILES to main table on NSC

In [57]:
# NSC -> SMILES lookup; only these two columns of the file are needed here.
nsc_cid_smiles = pd.read_csv('../data/nsc_cid_smiles.csv')
smiles = nsc_cid_smiles[['NSC', 'SMILES']]

# Attach the SMILES string to each row, then drop NSC from the result.
base_smiles = base_label.merge(smiles, on='NSC').drop(columns='NSC')

In [58]:
# Display the merged table (response, cell line, mechanism, SMILES).
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,-0.434885,DU145_PROSTATE,Other,CC1=CC(=O)C=CC1=O
1,-0.108730,SKMEL5_SKIN,Other,CC1=CC(=O)C=CC1=O
2,-0.380720,SF539_CENTRAL_NERVOUS_SYSTEM,Other,CC1=CC(=O)C=CC1=O
3,-0.379420,TK10_KIDNEY,Other,CC1=CC(=O)C=CC1=O
4,-1.648059,A498_KIDNEY,Other,CC1=CC(=O)C=CC1=O
...,...,...,...,...
1177841,-0.671981,HOP92_LUNG,Other,CCC(=NNC(=O)C1=CC2=CC=CC=C2C=C1O)CC(=O)CCC(=O)...
1177842,-0.983003,SF295_CENTRAL_NERVOUS_SYSTEM,Other,CCC(=NNC(=O)C1=CC2=CC=CC=C2C=C1O)CC(=O)CCC(=O)...
1177843,-0.272537,ACHN_KIDNEY,Other,CCC(=NNC(=O)C1=CC2=CC=CC=C2C=C1O)CC(=O)CCC(=O)...
1177844,-1.324494,UACC257_SKIN,Other,CCC(=NNC(=O)C1=CC2=CC=CC=C2C=C1O)CC(=O)CCC(=O)...


In [59]:
# Response rows per MECHANISM label after both merges.
base_smiles.value_counts(subset='MECHANISM')

MECHANISM
Other          1127388
Kinase           25733
DNA              14343
TUBB              2365
HDAC              2358
Apoptosis         1912
HSP90              869
PSM                699
Methylation        694
Ho                 670
BRD                597
Acetalax           218
dtype: int64

# Select the rows whose mechanism class is DNA

In [60]:
# Keep only the DNA-mechanism rows, restricted to the three modelling columns,
# and renumber the rows from 0 so that index labels equal row positions.
is_dna = base_smiles['MECHANISM'] == 'DNA'
df_dna = (
    base_smiles
    .loc[is_dna, ['cell_line', 'SMILES', 'drug_response']]
    .reset_index(drop=True)
)

# This has 14343 datapoints

In [61]:
# Preview with exact duplicate rows removed (first occurrence kept).
# The result is only displayed, not assigned, so df_dna itself is unchanged.
df_dna.loc[~df_dna.duplicated()]

Unnamed: 0,cell_line,SMILES,drug_response
0,DU145_PROSTATE,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.507493
1,SKMEL5_SKIN,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.319437
2,SF539_CENTRAL_NERVOUS_SYSTEM,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.589970
3,TK10_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.641942
4,A498_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-0.891882
...,...,...,...
14338,PC3_PROSTATE,CC(C(=O)O)OC1=CC=C(C=C1)OC2=CN=C3C=CC(=CC3=N2)Cl,-0.420205
14339,SF295_CENTRAL_NERVOUS_SYSTEM,CC(C(=O)O)OC1=CC=C(C=C1)OC2=CN=C3C=CC(=CC3=N2)Cl,-0.420205
14340,ACHN_KIDNEY,CC(C(=O)O)OC1=CC=C(C=C1)OC2=CN=C3C=CC(=CC3=N2)Cl,0.091027
14341,UACC257_SKIN,CC(C(=O)O)OC1=CC=C(C=C1)OC2=CN=C3C=CC(=CC3=N2)Cl,-0.420205


# 55 cell lines and 244 drugs

In [62]:
# Number of distinct cell lines among the DNA rows.
df_dna['cell_line'].nunique()

55

In [63]:
# Number of distinct SMILES strings among the DNA rows
# (a missing value, if any, is counted as one entry).
df_dna['SMILES'].nunique(dropna=False)

244

In [64]:
# Rows per cell line, shown as a one-column table.
df_dna['cell_line'].value_counts().to_frame()

Unnamed: 0,cell_line
UACC62_SKIN,269
KM12_LARGE_INTESTINE,269
UACC257_SKIN,269
SF295_CENTRAL_NERVOUS_SYSTEM,269
OVCAR4_OVARY,269
OVCAR8_OVARY,269
OVCAR5_OVARY,268
HT29_LARGE_INTESTINE,268
A549_LUNG,268
MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,268


In [65]:
# Rows per SMILES string, shown as a one-column table.
df_dna['SMILES'].value_counts().to_frame()

Unnamed: 0,SMILES
COC1=CC(=CC(=C1O)OC)C2C3C(COC3=O)C(C4=CC5=C(C=C24)OCO5)OC6C(C(C7C(O6)COC(O7)C8=CC=CS8)O)O,165
CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)CO)O)N)O.Cl,163
CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)C)O)N)O.Cl,110
CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,110
C1=CN2C3C(C(C(O3)CO)O)OC2=NC1=N.Cl,110
...,...
CN(C)CCCN1C2=C3C(=C(C=C2)[N+](=O)[O-])NC4=C(C3=N1)C=C(C=C4)OC.CS(=O)(=O)O,42
C1CC(OC1CO)N2C=CC(=NC2=O)N,42
C1=CC=C(C(=C1)C(=O)NCCCC(C(=O)O)NC(=O)C2=CC=C(C=C2)NCC3=CN=C4C(=N3)C(=NC(=N4)N)N)C(=O)O,41
C1CCC2(CC1)C(=O)N(C(=O)N2)CCN(CCCl)CCCl,40


In [75]:
# Full NSC / CID / SMILES mapping table, kept as a DataFrame.
pubchem_id =  pd.read_csv('../data/nsc_cid_smiles.csv')
# NOTE(review): the CID -> SMILES dict conversion below is disabled. Any lookup
# written as `pubchem_id[<CID>]` requires the dict form, while cells that
# filter on `pubchem_id['SMILES']` require the DataFrame form.
# pubchem_id = {pubchem_id['CID'][i]:pubchem_id['SMILES'][i] for i in pubchem_id.index}

In [71]:
# SMILES string recorded for PubChem CID 24360.
# `pubchem_id` is a DataFrame, so `pubchem_id[24360]` would be a column lookup
# and raise KeyError; select by the CID column instead and take the first match.
pubchem_id.loc[pubchem_id['CID'] == 24360, 'SMILES'].iloc[0]

'CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)O'

In [74]:
# All DNA rows whose SMILES equals the one recorded for PubChem CID 24360.
# `pubchem_id` is a DataFrame, so the SMILES is looked up through its CID
# column (`pubchem_id[24360]` would be a column lookup and raise KeyError).
query_smiles = pubchem_id.loc[pubchem_id['CID'] == 24360, 'SMILES'].iloc[0]
df_dna[df_dna['SMILES'] == query_smiles]

Unnamed: 0,cell_line,SMILES,drug_response
1694,DU145_PROSTATE,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,0.928772
1695,SKMEL5_SKIN,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,0.264117
1696,SF539_CENTRAL_NERVOUS_SYSTEM,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,0.762820
1697,TK10_KIDNEY,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,-2.020112
1698,A498_KIDNEY,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,0.087184
...,...,...,...
5184,T47D_BREAST,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,0.667738
5185,SF295_CENTRAL_NERVOUS_SYSTEM,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,1.206083
5186,ACHN_KIDNEY,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,-0.149960
5187,UACC257_SKIN,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...,-0.853218


In [78]:
# Every NSC / CID entry that shares this exact SMILES string.
pubchem_id.loc[
    pubchem_id['SMILES'].eq('CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)O')
]

Unnamed: 0,NSC,CID,SMILES
1641,94600,24360,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...
3386,302991,2538,CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3...


In [17]:
# Row labels of df_dna, shuffled in place by a dedicated generator seeded
# with 42 so the permutation does not depend on the global `random` state.
indexes = df_dna.index.tolist()
shuffler = rd.Random(42)
shuffler.shuffle(indexes)

In [18]:
# Split the shuffled positions: the last 20% become the test set; from the
# first 80%, a validation set of the same size as the test set is taken
# from the front and the remainder is the training set.
split_at = round(len(indexes) * 0.8)
tmp, test = indexes[:split_at], indexes[split_at:]
n_holdout = len(test)
val, train = tmp[:n_holdout], tmp[n_holdout:]

In [19]:
# Replace each list of row positions with the corresponding slice of df_dna.
# NOTE: the names are rebound from lists to DataFrames, so this cell can only
# be run once after the split cell.
train, test, val = df_dna.iloc[train], df_dna.iloc[test], df_dna.iloc[val]

In [20]:
# Report (rows, columns) of each split.
print(f'train: {train.shape}')
print(f'val: {val.shape}')
print(f'test: {test.shape}')

train: (8605, 3)
val: (2869, 3)
test: (2869, 3)


In [21]:
# Rows per cell line in each split, placed side by side.
# The inner joins keep only the cell lines that occur in all three splits.
split_counts = [
    split['cell_line'].value_counts().to_frame()
    for split in (train, val, test)
]
train_val_counts = split_counts[0].merge(
    split_counts[1], left_index=True, right_index=True
)
df = train_val_counts.merge(split_counts[2], left_index=True, right_index=True)
df.columns = ['train', 'val', 'test']
# Append a final row holding the column sums.
df.loc['total'] = df.sum(axis=0)

In [22]:
# Write each split as a tab-separated text file with no header row and no
# index column.
for split_frame, out_path in (
    (test, '../DrugCell/data_rcellminer/test_DNA.txt'),
    (val, '../DrugCell/data_rcellminer/val_DNA.txt'),
    (train, '../DrugCell/data_rcellminer/train_DNA.txt'),
):
    split_frame.to_csv(out_path, sep='\t', header=None, index=None)