In [1]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

# Read the nci60 activity table (by cell line)
Rows are indexed by NSC and the columns are cell line names.

In [2]:
# Activity matrix: rows are indexed by the first CSV column, one column per cell line.
nci60Act = pd.read_csv("../data/nci60Act_ccle.csv", index_col=0)
# Second column (label 1) of the header-less cell2ind file.
cell2ind = list(pd.read_table("../DrugCell/data/cell2ind.txt", header=None)[1])

# Keep only the columns that also appear in cell2ind, in the CSV's own column
# order. Selecting through list(set(...) & set(...)) made the column order depend
# on string hashing, which changes between interpreter runs; every downstream row
# order (and therefore the seeded train/val/test split) inherited that randomness.
known_cell_lines = set(cell2ind)
nci60Act = nci60Act[[col for col in nci60Act.columns if col in known_cell_lines]]
nci60Act

Unnamed: 0,SF295_CENTRAL_NERVOUS_SYSTEM,SN12C_KIDNEY,HOP92_LUNG,NCIH522_LUNG,SF268_CENTRAL_NERVOUS_SYSTEM,SF539_CENTRAL_NERVOUS_SYSTEM,NCIH226_LUNG,SNB75_CENTRAL_NERVOUS_SYSTEM,TK10_KIDNEY,U251MG_CENTRAL_NERVOUS_SYSTEM,...,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,IGROV1_OVARY,MCF7_BREAST,BT549_BREAST,PC3_PROSTATE,OVCAR5_OVARY,LOXIMVI_SKIN,HCT15_LARGE_INTESTINE,A549_LUNG,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE
1,-0.719253,-0.390689,2.110059,1.914094,-0.357577,-0.380720,-1.164555,-0.175915,-0.379420,-0.266183,...,0.325208,0.149074,-0.271314,-0.231499,-0.403983,-0.593334,-0.193375,-0.082824,-0.826223,1.540716
17,0.424922,-0.315533,-0.078754,1.832493,0.335572,1.140166,0.100266,0.330808,-0.761559,-0.843055,...,0.621871,1.353076,-0.354110,1.483613,1.302516,-0.422527,0.839155,-0.630920,-0.941890,1.686372
89,-1.429903,-0.184805,-0.068839,5.026869,-0.184194,-0.165433,-0.605885,-0.216723,-0.101929,-0.093513,...,-0.222065,-0.181827,,,,-0.145313,0.228788,0.236371,-0.433001,0.549935
185,,1.391604,-0.038789,,0.539343,0.230402,-3.502638,-1.125208,-0.904724,-0.171509,...,,-0.852569,,,,-0.853432,0.389991,-0.059621,0.500858,1.675353
295,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,...,2.919066,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900911,-0.167151,-0.167151,,,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,,...,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,,-0.167151,-0.167151,-0.167151
900922,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,...,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,,-0.169786
900964,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,...,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754
900974,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,...,-0.132453,-0.132453,-0.132453,,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453


# Reshape the activity table into the training-data format

In [3]:
# Reshape the wide activity matrix into long format: one row per
# (NSC, cell line) pair that has a measured response.
# The per-cell-line pieces are collected in a list and concatenated once;
# concatenating onto a growing frame inside the loop re-copies all earlier rows
# on every iteration and starts from an empty DataFrame, which newer pandas
# versions warn about.
long_frames = []
for cell_line in nci60Act.columns:
    # reset_index() turns the index into a column; dropna() removes unmeasured pairs.
    responses = nci60Act[cell_line].reset_index().dropna()
    responses.columns = ['NSC', 'drug_response']
    responses['cell_line'] = cell_line
    long_frames.append(responses)

if long_frames:
    # ignore_index=True gives the same fresh 0..n-1 index as reset_index(drop=True).
    base = pd.concat(long_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly shaped table.
    base = pd.DataFrame(columns=['NSC', 'drug_response', 'cell_line'])

In [4]:
# Number of distinct NSC values present in the long table.
len(base['NSC'].unique())

24047

# Read class label

In [5]:
# Table with one MECHANISM label per NSC row.
class_nsc = pd.read_csv('../DrugCell/data_rcellminer/class_by_nsc.csv')
# Preview ordered by NSC; sort_values returns a new frame, so class_nsc itself
# keeps the row order of the file.
class_nsc.sort_values(by='NSC', ascending=True)

Unnamed: 0,NSC,MECHANISM
112,1,Other
1200,17,Other
21579,89,Other
1355,185,Other
2118,295,Other
...,...,...
22699,900911,Other
22877,900922,Other
22701,900964,Other
21608,900974,Other


# Number of NSC entries (compounds) in each mechanism class
There are 274 NSC entries in the DNA class

In [6]:
# Rows of class_nsc per MECHANISM value, most frequent first.
class_nsc.value_counts(subset='MECHANISM')

MECHANISM
Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Ho                13
Methylation       13
PSM               13
BRD               11
Acetalax           4
dtype: int64

# Merge label to main table on NSC

In [7]:
# Attach the MECHANISM label to every response row. The join is an inner join,
# so rows whose NSC is absent from class_nsc are dropped.
base_label = pd.merge(base, class_nsc, how='inner', on='NSC')
base_label

Unnamed: 0,NSC,drug_response,cell_line,MECHANISM
0,1,-0.719253,SF295_CENTRAL_NERVOUS_SYSTEM,Other
1,1,-0.390689,SN12C_KIDNEY,Other
2,1,2.110059,HOP92_LUNG,Other
3,1,1.914094,NCIH522_LUNG,Other
4,1,-0.357577,SF268_CENTRAL_NERVOUS_SYSTEM,Other
...,...,...,...,...
1209388,765429,-0.348806,PC3_PROSTATE,Other
1209389,765429,-0.952551,OVCAR5_OVARY,Other
1209390,765429,-0.048225,LOXIMVI_SKIN,Other
1209391,765429,0.410343,HCT15_LARGE_INTESTINE,Other


# Merge SMILES to main table on NSC

In [8]:
# NSC -> SMILES lookup; only these two columns of the file are kept.
smiles_table = pd.read_csv('../data/nsc_cid_smiles.csv')
smiles = smiles_table[['NSC', 'SMILES']]

# Inner-join the SMILES string onto each labelled row, then discard the NSC key
# so the compound is represented by its SMILES only.
with_smiles = pd.merge(base_label, smiles, on='NSC')
base_smiles = with_smiles.drop(columns='NSC')

In [9]:
# Display the merged table (response, cell line, mechanism label and SMILES).
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,-0.719253,SF295_CENTRAL_NERVOUS_SYSTEM,Other,CC1=CC(=O)C=CC1=O
1,-0.390689,SN12C_KIDNEY,Other,CC1=CC(=O)C=CC1=O
2,2.110059,HOP92_LUNG,Other,CC1=CC(=O)C=CC1=O
3,1.914094,NCIH522_LUNG,Other,CC1=CC(=O)C=CC1=O
4,-0.357577,SF268_CENTRAL_NERVOUS_SYSTEM,Other,CC1=CC(=O)C=CC1=O
...,...,...,...,...
1177841,-0.348806,PC3_PROSTATE,Other,CC1=NC(C2=C(N1)C(=CC3=CC(=C(C(=C3)OC)OC)OC)CCC...
1177842,-0.952551,OVCAR5_OVARY,Other,CC1=NC(C2=C(N1)C(=CC3=CC(=C(C(=C3)OC)OC)OC)CCC...
1177843,-0.048225,LOXIMVI_SKIN,Other,CC1=NC(C2=C(N1)C(=CC3=CC(=C(C(=C3)OC)OC)OC)CCC...
1177844,0.410343,HCT15_LARGE_INTESTINE,Other,CC1=NC(C2=C(N1)C(=CC3=CC(=C(C(=C3)OC)OC)OC)CCC...


In [10]:
# Response rows per MECHANISM value after the SMILES merge, most frequent first.
base_smiles.value_counts(subset='MECHANISM')

MECHANISM
Other          1127388
Kinase           25733
DNA              14343
TUBB              2365
HDAC              2358
Apoptosis         1912
HSP90              869
PSM                699
Methylation        694
Ho                 670
BRD                597
Acetalax           218
dtype: int64

# Select the DNA mechanism class

In [11]:
# Keep only the rows labelled 'DNA' and the three columns written out later,
# in the order cell_line, SMILES, drug_response, with a fresh 0..n-1 index.
is_dna = base_smiles['MECHANISM'] == 'DNA'
dna_columns = ['cell_line', 'SMILES', 'drug_response']
df_dna = base_smiles.loc[is_dna, dna_columns].reset_index(drop=True)

# This has 14343 datapoints

In [12]:
# Show df_dna with exact duplicate rows removed. The result is only displayed,
# not assigned, so df_dna itself is unchanged for the cells below.
# NOTE(review): if de-duplication was meant to be applied, assign the result — confirm intent.
df_dna.drop_duplicates()

Unnamed: 0,cell_line,SMILES,drug_response
0,SF295_CENTRAL_NERVOUS_SYSTEM,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.536910
1,SN12C_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.602244
2,HOP92_LUNG,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.386098
3,NCIH522_LUNG,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.013134
4,SF268_CENTRAL_NERVOUS_SYSTEM,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.510978
...,...,...,...
14338,OVCAR5_OVARY,CCCCCCCCCCCCCCCC(=O)NC1=NC(=O)N(C=C1)C2C(C(C(O...,-0.724381
14339,LOXIMVI_SKIN,CCCCCCCCCCCCCCCC(=O)NC1=NC(=O)N(C=C1)C2C(C(C(O...,0.436674
14340,HCT15_LARGE_INTESTINE,CCCCCCCCCCCCCCCC(=O)NC1=NC(=O)N(C=C1)C2C(C(C(O...,-0.717086
14341,A549_LUNG,CCCCCCCCCCCCCCCC(=O)NC1=NC(=O)N(C=C1)C2C(C(C(O...,0.353908


# 55 cell lines and 244 drugs (unique SMILES)

In [13]:
# Number of distinct cell lines in the DNA subset.
len(df_dna['cell_line'].unique())

55

In [14]:
# Number of distinct SMILES strings in the DNA subset.
len(df_dna['SMILES'].unique())

244

In [15]:
# Rows per cell line in the DNA subset, shown as a one-column table.
df_dna['cell_line'].value_counts().to_frame()

Unnamed: 0,cell_line
SF295_CENTRAL_NERVOUS_SYSTEM,269
OVCAR8_OVARY,269
UACC257_SKIN,269
UACC62_SKIN,269
OVCAR4_OVARY,269
KM12_LARGE_INTESTINE,269
NCIH460_LUNG,268
A549_LUNG,268
OVCAR5_OVARY,268
SKMEL28_SKIN,268


In [16]:
# Rows per SMILES string in the DNA subset, shown as a one-column table.
df_dna['SMILES'].value_counts().to_frame()

Unnamed: 0,SMILES
COC1=CC(=CC(=C1O)OC)C2C3C(COC3=O)C(C4=CC5=C(C=C24)OCO5)OC6C(C(C7C(O6)COC(O7)C8=CC=CS8)O)O,165
CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)CO)O)N)O.Cl,163
CC1=C(C(=O)C2=C(C1=O)N3CC4C(C3(C2COC(=O)N)OC)N4)N,110
CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)NC2=CC=C(C=C2)O)C)C,110
CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)C)O)N)O.Cl,110
...,...
CN(C)CCCN1C2=C3C(=C(C=C2)[N+](=O)[O-])NC4=C(C3=N1)C=C(C=C4)OC.CS(=O)(=O)O,42
C1CC(OC1CO)N2C=CC(=NC2=O)N,42
C1=CC=C(C(=C1)C(=O)NCCCC(C(=O)O)NC(=O)C2=CC=C(C=C2)NCC3=CN=C4C(=N3)C(=NC(=N4)N)N)C(=O)O,41
C1CCC2(CC1)C(=O)N(C(=O)N2)CCN(CCCl)CCCl,40


In [17]:
# Row labels of df_dna, shuffled in place with a dedicated generator seeded
# with 42 so the global random state is not touched.
indexes = df_dna.index.tolist()
shuffler = rd.Random(42)
shuffler.shuffle(indexes)

In [18]:
# Cut the shuffled list at 80%: everything after the cut is the test set.
cut = round(len(indexes) * 0.8)
test = indexes[cut:]
remainder = indexes[:cut]

# Validation gets as many entries as test; what is left is the training set.
holdout_size = len(test)
val, train = remainder[:holdout_size], remainder[holdout_size:]

In [19]:
# Turn each list of row positions into the matching slice of df_dna.
# The names train/val/test are rebound here from lists to DataFrames.
train, val, test = (
    df_dna.iloc[positions] for positions in (train, val, test)
)

In [20]:
# Report (rows, columns) of each split.
print(f'train: {train.shape}')
print(f'val: {val.shape}')
print(f'test: {test.shape}')

train: (8605, 3)
val: (2869, 3)
test: (2869, 3)


In [21]:
# Per-cell-line row counts for each split, as one-column frames.
train_counts, val_counts, test_counts = (
    pd.DataFrame(split['cell_line'].value_counts())
    for split in (train, val, test)
)

# Inner-join the three count tables on the cell-line index: train with val
# first, then the result with test.
df = (
    train_counts
    .merge(val_counts, left_index=True, right_index=True)
    .merge(test_counts, left_index=True, right_index=True)
)
df.columns = ['train', 'val', 'test']
# Append a 'total' row holding the column sums.
df.loc['total'] = df.sum(axis=0)

In [22]:
# Write each split as a tab-separated file without header row or index column.
# Files are written in the order test, val, train.
split_outputs = (
    (test, '../DrugCell/data_rcellminer/test_DNA.txt'),
    (val, '../DrugCell/data_rcellminer/val_DNA.txt'),
    (train, '../DrugCell/data_rcellminer/train_DNA.txt'),
)
for split_frame, out_path in split_outputs:
    split_frame.to_csv(out_path, sep='\t', header=None, index=None)