In [1]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

# Read nci60byCell Line Table
 Indexes are NSC and columns are Cell Lines' name

In [2]:
nci60Act = pd.read_csv("../data/nci60Act_ccle.csv", index_col=0)
cell2ind = list(pd.read_table("../DrugCell/data/cell2ind.txt", header=None)[1])
nci60Act = nci60Act[list(set(cell2ind) & set(nci60Act.columns))]
nci60Act

Unnamed: 0,CAKI1_KIDNEY,SNB75_CENTRAL_NERVOUS_SYSTEM,OVCAR8_OVARY,T47D_BREAST,NCIH23_LUNG,M14_SKIN,HT29_LARGE_INTESTINE,A549_LUNG,COLO205_LARGE_INTESTINE,OVCAR4_OVARY,...,SR786_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SN12C_KIDNEY,HCT15_LARGE_INTESTINE,IGROV1_OVARY,SF539_CENTRAL_NERVOUS_SYSTEM,OVCAR5_OVARY,TK10_KIDNEY,HCT116_LARGE_INTESTINE,HOP92_LUNG
1,-0.269717,-0.175915,-0.413599,1.934731,-0.600872,-0.508668,-0.918047,-0.826223,0.008989,2.081990,...,0.325208,1.439737,-0.390689,-0.082824,0.149074,-0.380720,-0.593334,-0.379420,0.732038,2.110059
17,0.717851,0.330808,-0.572734,1.509397,-0.925414,-0.941890,-0.941890,-0.941890,-0.041714,0.891796,...,0.621871,1.792720,-0.315533,-0.630920,1.353076,1.140166,-0.422527,-0.761559,-0.941890,-0.078754
89,0.743888,-0.216723,0.185498,,-0.312738,0.430283,-0.487137,-0.433001,0.152000,0.012308,...,-0.222065,1.054381,-0.184805,0.236371,-0.181827,-0.165433,-0.145313,-0.101929,0.347687,-0.068839
185,1.675353,-1.125208,0.095648,,0.170420,-0.533247,-0.283713,0.500858,0.174485,0.487376,...,,0.381192,1.391604,-0.059621,-0.852569,0.230402,-0.853432,-0.904724,,-0.038789
295,-0.264586,-0.264586,3.362653,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,...,2.919066,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900911,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,...,-0.167151,,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,
900922,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,,-0.169786,-0.169786,...,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786
900964,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,,-0.158754,...,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754
900974,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,...,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453


# Create train data table format

In [3]:
base = pd.DataFrame()
for i in nci60Act.columns:
    tmp = nci60Act[i].reset_index().dropna()
    tmp.columns = ['NSC', 'drug_response']
    tmp['cell_line'] = [i]*len(tmp)
    base = pd.concat([
        base,
        tmp
    ])
base = base.reset_index(drop=True)

In [4]:
base

Unnamed: 0,NSC,drug_response,cell_line
0,1,-0.269717,CAKI1_KIDNEY
1,17,0.717851,CAKI1_KIDNEY
2,89,0.743888,CAKI1_KIDNEY
3,185,1.675353,CAKI1_KIDNEY
4,295,-0.264586,CAKI1_KIDNEY
...,...,...,...
1219790,900909,-0.189827,HOP92_LUNG
1219791,900922,-0.169786,HOP92_LUNG
1219792,900964,-0.158754,HOP92_LUNG
1219793,900974,-0.132453,HOP92_LUNG


# Read class label

In [5]:
class_nsc = pd.read_csv('../DrugCell/data_rcellminer/class_by_nsc.csv')
class_nsc

Unnamed: 0,NSC,MECHANISM
0,762419,Other
1,719239,Other
2,777573,Other
3,774651,Other
4,765776,Other
...,...,...
23846,810341,Methylation
23847,808507,Methylation
23848,785299,Methylation
23849,790865,Methylation


# Unique number of cell lines for each class
There are 274 cell lines of DNA class

In [6]:
class_nsc.value_counts('MECHANISM')

MECHANISM
Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Ho                13
Methylation       13
PSM               13
BRD               11
Acetalax           4
dtype: int64

# Merge label to main table on NSC

In [7]:
base_label = base.merge(class_nsc, on='NSC')
base_label

Unnamed: 0,NSC,drug_response,cell_line,MECHANISM
0,1,-0.269717,CAKI1_KIDNEY,Other
1,1,-0.175915,SNB75_CENTRAL_NERVOUS_SYSTEM,Other
2,1,-0.413599,OVCAR8_OVARY,Other
3,1,1.934731,T47D_BREAST,Other
4,1,-0.600872,NCIH23_LUNG,Other
...,...,...,...,...
1209388,709521,-0.462382,SF539_CENTRAL_NERVOUS_SYSTEM,Other
1209389,709521,-1.286388,OVCAR5_OVARY,Other
1209390,709521,-0.237036,TK10_KIDNEY,Other
1209391,709521,0.356641,HCT116_LARGE_INTESTINE,Other


# Get SMILES from pubchem id

In [8]:
def get_SMILES_from_pubchemID(pubchem_id):
    """
    Get the SMILES from pubchem_id using PubChem API

    Parameters
    ----------

    pubchem_id : pd.DataFrame

    Returns
    -------

    df : pd.DataFrame

    """

    df = pd.DataFrame()
    for i in tqdm(
        list(range(500, pubchem_id.shape[0], 500)) + [pubchem_id.shape[0]]
    ):
        cid = ""
        for j in pubchem_id["CID"][i - 500 : i]:
            cid = str(j) + "," + cid

        try:
            df = pd.concat(
                [
                    pd.read_csv(
                        "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
                        + cid
                        + "/property/CanonicalSMILES/CSV"
                    ),
                    df,
                ]
            )
        except HTTPError as e:
            # if we got error, devided by more small number to get where we got the error.
            # Not sure why I didn't get error again though...
            for i in np.array_split(np.array(cid.split(",")), 2):
                t = ""
                for l in i:
                    t += l + ","
                try:
                    df = pd.concat(
                        [
                            pd.read_csv(
                                "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
                                + t
                                + "/property/CanonicalSMILES/CSV"
                            ),
                            df,
                        ]
                    )
                except HTTPError as e:
                    if e.code == 403:
                        print("error")
    df = (
        df.drop_duplicates("CanonicalSMILES")
        .sort_values("CID")
        .reset_index(drop=True)
    )
    df = pubchem_id.reset_index().merge(df, on="CID")[
        ["index", "CanonicalSMILES"]
    ]
    df.columns = ["NSC", "SMILES"]

    return df

In [9]:
pubchem_id = pd.read_csv('../data/nci60PubChemID.csv', index_col=0).dropna()
pubchem_id = pubchem_id.sort_values('PUBCHEM_ID')
pubchem_id.columns = ['CID']

In [10]:
df = get_SMILES_from_pubchemID(pubchem_id)

100%|██████████| 34/34 [00:10<00:00,  3.33it/s]


# Merge SMILES to main table on NSC

In [11]:
base_smiles = base_label.merge(df, on='NSC').drop('NSC', axis=1)
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,0.717851,CAKI1_KIDNEY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
1,0.330808,SNB75_CENTRAL_NERVOUS_SYSTEM,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
2,-0.572734,OVCAR8_OVARY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
3,1.509397,T47D_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
4,-0.925414,NCIH23_LUNG,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
...,...,...,...,...
822755,-0.462382,SF539_CENTRAL_NERVOUS_SYSTEM,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...
822756,-1.286388,OVCAR5_OVARY,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...
822757,-0.237036,TK10_KIDNEY,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...
822758,0.356641,HCT116_LARGE_INTESTINE,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...


In [12]:
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,0.717851,CAKI1_KIDNEY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
1,0.330808,SNB75_CENTRAL_NERVOUS_SYSTEM,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
2,-0.572734,OVCAR8_OVARY,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
3,1.509397,T47D_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
4,-0.925414,NCIH23_LUNG,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
...,...,...,...,...
822755,-0.462382,SF539_CENTRAL_NERVOUS_SYSTEM,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...
822756,-1.286388,OVCAR5_OVARY,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...
822757,-0.237036,TK10_KIDNEY,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...
822758,0.356641,HCT116_LARGE_INTESTINE,Other,COC1=CC=C(C=C1)C2=NN=C3C(=NC4=CC=CC=C4N3C2)C5=...


In [13]:
base_smiles.value_counts('MECHANISM')

MECHANISM
Other        811190
DNA            6535
HDAC           1598
Kinase         1332
TUBB           1272
Ho              363
Apoptosis       313
PSM             102
Acetalax         55
dtype: int64

In [14]:
set(base_smiles['MECHANISM'])

{'Acetalax',
 'Apoptosis',
 'DNA',
 'HDAC',
 'Ho',
 'Kinase',
 'Other',
 'PSM',
 'TUBB'}

In [15]:
base_smiles[
    (base_smiles['MECHANISM'] == 'DNA') |
    (base_smiles['MECHANISM'] == 'HDAC') |
    (base_smiles['MECHANISM'] == 'Kinase') |
    (base_smiles['MECHANISM'] == 'TUBB')
]

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
364,0.541612,CAKI1_KIDNEY,DNA,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...
365,-1.460314,SNB75_CENTRAL_NERVOUS_SYSTEM,DNA,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...
366,0.644535,OVCAR8_OVARY,DNA,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...
367,-1.156158,T47D_BREAST,DNA,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...
368,0.510736,NCIH23_LUNG,DNA,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...
...,...,...,...,...
821679,0.014236,SF539_CENTRAL_NERVOUS_SYSTEM,HDAC,C=C(CN1CCC2=C(C1)NC3=CC=CC=C23)C4=CC=C(C=C4)C(...
821680,-0.678015,OVCAR5_OVARY,HDAC,C=C(CN1CCC2=C(C1)NC3=CC=CC=C23)C4=CC=C(C=C4)C(...
821681,0.109471,TK10_KIDNEY,HDAC,C=C(CN1CCC2=C(C1)NC3=CC=CC=C23)C4=CC=C(C=C4)C(...
821682,0.497262,HCT116_LARGE_INTESTINE,HDAC,C=C(CN1CCC2=C(C1)NC3=CC=CC=C23)C4=CC=C(C=C4)C(...


# Select DNA 

In [16]:
df_dna = base_smiles[base_smiles['MECHANISM'] == 'DNA'].reset_index(drop=True)[['cell_line', 'SMILES', 'drug_response']]

# This has 6779 datapoints

In [17]:
df_dna

Unnamed: 0,cell_line,SMILES,drug_response
0,CAKI1_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.541612
1,SNB75_CENTRAL_NERVOUS_SYSTEM,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.460314
2,OVCAR8_OVARY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.644535
3,T47D_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.156158
4,NCIH23_LUNG,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.510736
...,...,...,...
6530,SF539_CENTRAL_NERVOUS_SYSTEM,CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C3=C2)N=C5C=C(C(...,1.107856
6531,OVCAR5_OVARY,CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C3=C2)N=C5C=C(C(...,-1.271092
6532,TK10_KIDNEY,CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C3=C2)N=C5C=C(C(...,-1.271092
6533,HCT116_LARGE_INTESTINE,CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C3=C2)N=C5C=C(C(...,-0.020776


# 57 cell lines and 122 drugs

In [18]:
len(set(df_dna['cell_line']))

55

In [19]:
len(set(df_dna['SMILES']))

122

In [20]:
pd.DataFrame(df_dna['cell_line'].value_counts())

Unnamed: 0,cell_line
SF539_CENTRAL_NERVOUS_SYSTEM,122
UACC257_SKIN,122
KM12_LARGE_INTESTINE,122
NCIH460_LUNG,122
SN12C_KIDNEY,122
U251MG_CENTRAL_NERVOUS_SYSTEM,122
UACC62_SKIN,122
SF295_CENTRAL_NERVOUS_SYSTEM,122
SKMEL28_SKIN,122
SF268_CENTRAL_NERVOUS_SYSTEM,122


In [21]:
pd.DataFrame(df_dna['SMILES'].value_counts())

Unnamed: 0,SMILES
CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)NC(CCC(=O)O)C(=O)O,55
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C5=CC=CC=C5N=C4C3=C2)C=NNC(=O)C(CC6=CN=CN6)N)O.Cl,55
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C5=CC=CC=C5N=C4C3=C2)C=NC6=CC=C(C=C6)NC7C8COC(=O)C8C(C9=CC1=C(C=C79)OCO1)C1=CC(=C(C(=C1)OC)O)OC)O,55
CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)N,55
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C5=CC=CC=C5N=C4C3=C2)C=NNC(=O)C[N+]6=CC=CC=C6)O.[Cl-],55
...,...
CC1=C(C=CC(=C1)N(CCOS(=O)(=O)C)CCOS(=O)(=O)C)C=O,46
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C3=C2)N=C5C=C(C(=CC5=C4)OC)OC)O,43
C1C(ON=C1Cl)C(C(=O)O)N,43
CC1=C2C=C[N+](=CC2=C(C3=C1NC4=CC=CC=C43)C)C.[I-],43


In [22]:
indexes = list(df_dna.index)
rd.Random(42).shuffle(indexes)

In [23]:
test = indexes[round(len(indexes)*0.8):]
tmp = indexes[:round(len(indexes)*0.8)]
val = tmp[:len(test)]
train = tmp[len(test):]

In [24]:
train = df_dna.iloc[train]
test = df_dna.iloc[test]
val = df_dna.iloc[val]

In [25]:
print('train: ' + str(train.shape))
print('val: ' + str(val.shape))
print('test: ' + str(test.shape))

train: (3921, 3)
val: (1307, 3)
test: (1307, 3)


In [26]:
df = pd.merge(
    pd.merge(
        pd.DataFrame(train['cell_line'].value_counts()), 
        pd.DataFrame(val['cell_line'].value_counts()), 
        left_index=True, right_index=True
    ), pd.DataFrame(test['cell_line'].value_counts()), 
    left_index=True, right_index=True
)
df.columns = ['train', 'val', 'test']
df.loc['total'] = np.sum(df, axis=0) 

In [27]:
df

Unnamed: 0,train,val,test
OVCAR8_OVARY,87,13,22
IGROV1_OVARY,80,23,18
CAKI1_KIDNEY,80,18,23
HCC2998_LARGE_INTESTINE,80,17,23
MOLT4_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,80,13,28
UACC62_SKIN,79,21,22
NCIH460_LUNG,79,21,22
A549_LUNG,78,15,29
SF268_CENTRAL_NERVOUS_SYSTEM,76,27,19
SF295_CENTRAL_NERVOUS_SYSTEM,76,27,19


In [28]:
test.to_csv(
    '../DrugCell/data_rcellminer/test_DNA.txt', sep='\t', 
    header=None, index=None
)

val.to_csv(
    '../DrugCell/data_rcellminer/val_DNA.txt', sep='\t', 
    header=None, index=None
)

train.to_csv(
    '../DrugCell/data_rcellminer/train_DNA.txt', sep='\t', 
    header=None, index=None
)