In [1]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

# Read nci60byCell Line Table
 Indexes are NSC and columns are Cell Lines' name

In [2]:
nci60Act = pd.read_csv("../data/nci60Act_ccle.csv", index_col=0)
cell2ind = list(pd.read_table("../DrugCell/data/cell2ind.txt", header=None)[1])
nci60Act = nci60Act[list(set(cell2ind) & set(nci60Act.columns))]
nci60Act

Unnamed: 0,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACHN_KIDNEY,U251MG_CENTRAL_NERVOUS_SYSTEM,MALME3M_SKIN,SKOV3_OVARY,MDAMB435S_SKIN,EKVX_LUNG,DU145_PROSTATE,A549_LUNG,CCRFCEM_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,...,OVCAR8_OVARY,SW620_LARGE_INTESTINE,786O_KIDNEY,A498_KIDNEY,PC3_PROSTATE,UACC257_SKIN,CAKI1_KIDNEY,SF539_CENTRAL_NERVOUS_SYSTEM,HCT116_LARGE_INTESTINE,COLO205_LARGE_INTESTINE
1,1.540716,1.657273,-0.266183,-0.233261,-0.868240,-0.386710,-0.264901,-0.434885,-0.826223,1.642456,...,-0.413599,1.294131,-0.519867,-1.648059,-0.403983,-0.407149,-0.269717,-0.380720,0.732038,0.008989
17,1.686372,-0.941890,-0.843055,-0.756130,-0.492953,-0.941890,-0.676210,-0.941890,-0.941890,2.270240,...,-0.572734,-1.029408,0.521200,-0.329639,1.302516,-0.061915,0.717851,1.140166,-0.941890,-0.041714
89,0.549935,-0.144006,-0.093513,0.318373,-0.126557,,-0.063351,,-0.433001,0.061246,...,0.185498,0.658260,0.368433,-0.655707,,-0.070365,0.743888,-0.165433,0.347687,0.152000
185,1.675353,,-0.171509,-0.917027,-2.151708,,-0.147899,,0.500858,1.675353,...,0.095648,0.683279,-0.061612,,,-0.807890,1.675353,0.230402,,0.174485
295,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,...,3.362653,-0.264586,4.822657,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900911,-0.167151,-0.167151,,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,...,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151
900922,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,,-0.169786,...,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786
900964,-0.158754,-0.158754,-0.158754,1.610430,-0.158754,-0.158754,-0.158754,,-0.158754,-0.158754,...,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,
900974,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,...,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453


# Create train data table format

In [3]:
base = pd.DataFrame()
for i in nci60Act.columns:
    tmp = nci60Act[i].reset_index().dropna()
    tmp.columns = ['NSC', 'drug_response']
    tmp['cell_line'] = [i]*len(tmp)
    base = pd.concat([
        base,
        tmp
    ])
base = base.reset_index(drop=True)

In [4]:
len(set(base['NSC']))

24047

# Read class label

In [5]:
class_nsc = pd.read_csv('../DrugCell/data_rcellminer/class_by_nsc.csv')
class_nsc.sort_values('NSC')

Unnamed: 0,NSC,MECHANISM
112,1,Other
1200,17,Other
21579,89,Other
1355,185,Other
2118,295,Other
...,...,...
22699,900911,Other
22877,900922,Other
22701,900964,Other
21608,900974,Other


# Unique number of cell lines for each class
There are 274 cell lines of DNA class

In [6]:
class_nsc.value_counts('MECHANISM')

MECHANISM
Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Ho                13
Methylation       13
PSM               13
BRD               11
Acetalax           4
dtype: int64

# Merge label to main table on NSC

In [7]:
base_label = base.merge(class_nsc, on='NSC')
base_label

Unnamed: 0,NSC,drug_response,cell_line,MECHANISM
0,1,1.540716,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other
1,1,1.657273,ACHN_KIDNEY,Other
2,1,-0.266183,U251MG_CENTRAL_NERVOUS_SYSTEM,Other
3,1,-0.233261,MALME3M_SKIN,Other
4,1,-0.868240,SKOV3_OVARY,Other
...,...,...,...,...
1209388,768570,-0.769549,PC3_PROSTATE,Other
1209389,768570,-0.769549,UACC257_SKIN,Other
1209390,768570,-0.769549,CAKI1_KIDNEY,Other
1209391,768570,0.776115,HCT116_LARGE_INTESTINE,Other


# Merge SMILES to main table on NSC

In [8]:
smiles = pd.read_csv('../data/nsc_cid_smiles.csv')[['NSC', 'SMILES']].drop_duplicates()
base_smiles = base_label.merge(smiles, on='NSC').drop('NSC', axis=1)
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,1.686372,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
1,-0.941890,ACHN_KIDNEY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
2,-0.843055,U251MG_CENTRAL_NERVOUS_SYSTEM,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
3,-0.756130,MALME3M_SKIN,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
4,-0.492953,SKOV3_OVARY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
...,...,...,...,...
877939,-0.310412,786O_KIDNEY,Other,CC(=O)N1C=CC2=C1N=CN=C2Cl
877940,-0.310412,UACC257_SKIN,Other,CC(=O)N1C=CC2=C1N=CN=C2Cl
877941,-0.310412,CAKI1_KIDNEY,Other,CC(=O)N1C=CC2=C1N=CN=C2Cl
877942,-0.310412,HCT116_LARGE_INTESTINE,Other,CC(=O)N1C=CC2=C1N=CN=C2Cl


In [9]:
base_smiles.value_counts('MECHANISM')

MECHANISM
Other        858874
DNA           11820
Kinase         1988
HDAC           1709
TUBB           1679
Ho              615
Apoptosis       610
HSP90           274
PSM             212
Acetalax        163
dtype: int64

In [10]:
set(base_smiles['MECHANISM'])

{'Acetalax',
 'Apoptosis',
 'DNA',
 'HDAC',
 'HSP90',
 'Ho',
 'Kinase',
 'Other',
 'PSM',
 'TUBB'}

# Select DNA 

In [11]:
df_dna = base_smiles[base_smiles['MECHANISM'] == 'DNA'].reset_index(drop=True)[['cell_line', 'SMILES', 'drug_response']]

# This has 11820 datapoints

In [12]:
df_dna.drop_duplicates()

Unnamed: 0,cell_line,SMILES,drug_response
0,MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.598806
1,ACHN_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.499337
2,U251MG_CENTRAL_NERVOUS_SYSTEM,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.351135
3,MALME3M_SKIN,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-0.733147
4,SKOV3_OVARY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.444659
...,...,...,...
11815,UACC257_SKIN,CCC1=C2CN3C(=CC4=C(C3=O)COC(=O)C4(CC)O)C2=NC5=...,-1.155968
11816,CAKI1_KIDNEY,CCC1=C2CN3C(=CC4=C(C3=O)COC(=O)C4(CC)O)C2=NC5=...,0.991410
11817,SF539_CENTRAL_NERVOUS_SYSTEM,CCC1=C2CN3C(=CC4=C(C3=O)COC(=O)C4(CC)O)C2=NC5=...,0.991410
11818,HCT116_LARGE_INTESTINE,CCC1=C2CN3C(=CC4=C(C3=O)COC(=O)C4(CC)O)C2=NC5=...,0.902428


# 55 cell lines and 214 drugs

In [13]:
len(set(df_dna['cell_line']))

55

In [14]:
len(set(df_dna['SMILES']))

214

In [15]:
pd.DataFrame(df_dna['cell_line'].value_counts())

Unnamed: 0,cell_line
OVCAR8_OVARY,222
UACC62_SKIN,222
U251MG_CENTRAL_NERVOUS_SYSTEM,222
OVCAR4_OVARY,222
SKMEL28_SKIN,222
EKVX_LUNG,222
UACC257_SKIN,222
A549_LUNG,222
OVCAR5_OVARY,222
SF295_CENTRAL_NERVOUS_SYSTEM,222


In [16]:
pd.DataFrame(df_dna['SMILES'].value_counts())

Unnamed: 0,SMILES
C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)O,110
CC1C(C(CC(O1)OC2CC(CC3=C2C(=C4C(=C3O)C(=O)C5=C(C4=O)C(=CC=C5)OC)O)(C(=O)C)O)N)O,110
CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=C(C=CC(=C5CN(C)C)O)N=C4C3=C2)O.Cl,110
C1C(C(OC1N2C=NC3=C2NC(=NC3=S)N)CO)O,110
COC1=C(C=CC(=C1)NS(=O)(=O)C)NC2=C3C=CC=CC3=NC4=CC=CC=C42,110
...,...
C1C(C(OC1N2C=NC(=NC2=O)N)CO)O,43
C1CC(OC1CO)N2C=CC(=NC2=O)N,42
C1=CC=C(C(=C1)C(=O)NCCCC(C(=O)O)NC(=O)C2=CC=C(C=C2)NCC3=CN=C4C(=N3)C(=NC(=N4)N)N)C(=O)O,41
C1CCC2(CC1)C(=O)N(C(=O)N2)CCN(CCCl)CCCl,40


In [17]:
indexes = list(df_dna.index)
rd.Random(42).shuffle(indexes)

In [18]:
test = indexes[round(len(indexes)*0.8):]
tmp = indexes[:round(len(indexes)*0.8)]
val = tmp[:len(test)]
train = tmp[len(test):]

In [19]:
train = df_dna.iloc[train]
test = df_dna.iloc[test]
val = df_dna.iloc[val]

In [20]:
print('train: ' + str(train.shape))
print('val: ' + str(val.shape))
print('test: ' + str(test.shape))

train: (7092, 3)
val: (2364, 3)
test: (2364, 3)


In [21]:
df = pd.merge(
    pd.merge(
        pd.DataFrame(train['cell_line'].value_counts()), 
        pd.DataFrame(val['cell_line'].value_counts()), 
        left_index=True, right_index=True
    ), pd.DataFrame(test['cell_line'].value_counts()), 
    left_index=True, right_index=True
)
df.columns = ['train', 'val', 'test']
df.loc['total'] = np.sum(df, axis=0) 

In [22]:
df

Unnamed: 0,train,val,test
NCIH322_LUNG,146,38,36
LOXIMVI_SKIN,145,41,34
HT29_LARGE_INTESTINE,143,43,35
NCIH460_LUNG,142,27,52
A498_KIDNEY,141,36,39
IGROV1_OVARY,141,34,44
EKVX_LUNG,140,36,46
U251MG_CENTRAL_NERVOUS_SYSTEM,139,51,32
CAKI1_KIDNEY,139,42,40
COLO205_LARGE_INTESTINE,138,45,37


In [23]:
test.to_csv(
    '../DrugCell/data_rcellminer/test_DNA.txt', sep='\t', 
    header=None, index=None
)

val.to_csv(
    '../DrugCell/data_rcellminer/val_DNA.txt', sep='\t', 
    header=None, index=None
)

train.to_csv(
    '../DrugCell/data_rcellminer/train_DNA.txt', sep='\t', 
    header=None, index=None
)