In [1]:
import pandas as pd
import numpy as np
import random as rd

from urllib.request import HTTPError
from tqdm import tqdm
from joblib import Parallel, delayed

# Read the NCI-60 activity table (compound by cell line)
Rows are indexed by NSC number and columns are cell line names.

In [26]:
# Load the NCI-60 activity matrix; the first CSV column becomes the row index.
nci60Act = pd.read_csv('../data/nci60Act_ccle.csv', index_col=0)
nci60Act

Unnamed: 0,MCF7_BREAST,MDAMB231_BREAST,HS578T_BREAST,BT549_BREAST,T47D_BREAST,SF268_CENTRAL_NERVOUS_SYSTEM,SF295_CENTRAL_NERVOUS_SYSTEM,SF539_CENTRAL_NERVOUS_SYSTEM,SNB19_CENTRAL_NERVOUS_SYSTEM,SNB75_CENTRAL_NERVOUS_SYSTEM,...,SKOV3_OVARY,PC3_PROSTATE,DU145_PROSTATE,786O_KIDNEY,A498_KIDNEY,ACHN_KIDNEY,CAKI1_KIDNEY,RXF393_KIDNEY,SN12C_KIDNEY,TK10_KIDNEY
1,-0.271314,-0.303539,-0.815183,-0.231499,1.934731,-0.357577,-0.719253,-0.380720,-1.281589,-0.175915,...,-0.868240,-0.403983,-0.434885,-0.519867,-1.648059,1.657273,-0.269717,0.002290,-0.390689,-0.379420
17,-0.354110,-0.304675,-0.222024,1.483613,1.509397,0.335572,0.424922,1.140166,-0.941890,0.330808,...,-0.492953,1.302516,-0.941890,0.521200,-0.329639,-0.941890,0.717851,-0.239018,-0.315533,-0.761559
89,,,,,,-0.184194,-1.429903,-0.165433,-1.429903,-0.216723,...,-0.126557,,,0.368433,-0.655707,-0.144006,0.743888,-0.435620,-0.184805,-0.101929
185,,,,,,0.539343,,0.230402,-0.765829,-1.125208,...,-2.151708,,,-0.061612,,,1.675353,0.918722,1.391604,-0.904724
295,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,...,-0.264586,-0.264586,-0.264586,4.822657,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586,-0.264586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900911,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,...,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,-0.167151,1.594059,-0.167151,-0.167151
900922,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,...,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786,-0.169786
900964,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,...,-0.158754,-0.158754,,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754,-0.158754
900974,-0.132453,-0.132453,-0.132453,,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,...,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,-0.132453,,-0.132453,-0.132453


# Create train data table format

In [27]:
# Reshape the wide matrix (rows: NSC, columns: cell lines) into long format:
# one row per (NSC, cell line) pair that has a non-missing response.
frames = []
for cell_line in nci60Act.columns:
    # reset_index() exposes the row index as a column; dropna() removes the
    # compounds that were not measured on this cell line.
    responses = nci60Act[cell_line].reset_index().dropna()
    responses.columns = ['NSC', 'drug_response']
    responses['cell_line'] = cell_line
    frames.append(responses)

# Concatenate once at the end: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration (quadratic in the number of rows).
base = pd.concat(frames).reset_index(drop=True) if frames else pd.DataFrame()

In [4]:
# Display the long-format table (columns: NSC, drug_response, cell_line).
base

Unnamed: 0,NSC,drug_response,cell_line
0,1,-0.271314,MCF7_BREAST
1,17,-0.354110,MCF7_BREAST
2,295,-0.264586,MCF7_BREAST
3,353,0.744838,MCF7_BREAST
4,384,-0.041323,MCF7_BREAST
...,...,...,...
1266165,900911,-0.167151,TK10_KIDNEY
1266166,900922,-0.169786,TK10_KIDNEY
1266167,900964,-0.158754,TK10_KIDNEY
1266168,900974,-0.132453,TK10_KIDNEY


# Read class label

In [5]:
# Load the table that assigns a MECHANISM label to each NSC.
class_nsc = pd.read_csv('../DrugCell/data_rcellminer/class_by_nsc.csv')
class_nsc

Unnamed: 0,NSC,MECHANISM
0,762419,Other
1,719239,Other
2,777573,Other
3,774651,Other
4,765776,Other
...,...,...
23846,810341,Methylation
23847,808507,Methylation
23848,785299,Methylation
23849,790865,Methylation


# Number of compounds (NSC entries) per mechanism class
There are 274 compounds in the DNA class

In [29]:
# Count the rows of class_nsc (one per NSC entry) for each MECHANISM label.
class_nsc.value_counts('MECHANISM')

MECHANISM
Other          22895
Kinase           483
DNA              274
HDAC              46
TUBB              45
Apoptosis         38
HSP90             16
Ho                13
Methylation       13
PSM               13
BRD               11
Acetalax           4
dtype: int64

# Merge label to main table on NSC

In [30]:
# merge() defaults to an inner join, so rows of `base` whose NSC does not
# appear in class_nsc are dropped here.
base_label = base.merge(class_nsc, on='NSC')
base_label

Unnamed: 0,NSC,drug_response,cell_line,MECHANISM
0,1,-0.271314,MCF7_BREAST,Other
1,1,-0.303539,MDAMB231_BREAST,Other
2,1,-0.815183,HS578T_BREAST,Other
3,1,-0.231499,BT549_BREAST,Other
4,1,1.934731,T47D_BREAST,Other
...,...,...,...,...
1255376,626744,1.719118,OVCAR4_OVARY,Other
1255377,626744,-1.697994,OVCAR5_OVARY,Other
1255378,626744,1.419889,OVCAR8_OVARY,Other
1255379,626744,0.681835,SKOV3_OVARY,Other


# Get SMILES from pubchem id

In [31]:
def get_SMILES_from_pubchemID(pubchem_id, batch_size=500):
    """
    Get the SMILES from pubchem_id using PubChem API

    The CIDs are sent to the PubChem PUG REST service in comma-separated
    batches. If a batch request raises an HTTPError, the batch is split in
    two halves and each half is retried once; a half that fails again is
    skipped and reported on stdout. Requires network access.

    Parameters
    ----------

    pubchem_id : pd.DataFrame
        Must contain a "CID" column. The frame's index is returned as the
        "NSC" column of the result.

    batch_size : int, default 500
        Maximum number of CIDs sent in one request.

    Returns
    -------

    df : pd.DataFrame
        Columns "NSC" and "SMILES", one row per row of `pubchem_id` whose CID
        is present in the de-duplicated PubChem response. Empty (with the
        same two columns) when nothing could be fetched.

    Raises
    ------

    ValueError
        If `batch_size` is smaller than 1.

    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    url_prefix = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/"
    url_suffix = "/property/CanonicalSMILES/CSV"

    def _fetch(cid_batch):
        # str.join avoids the trailing comma (and hence an empty CID) that
        # manual string concatenation leaves behind.
        return pd.read_csv(url_prefix + ",".join(cid_batch) + url_suffix)

    cids = [str(cid) for cid in pubchem_id["CID"]]
    frames = []
    # Non-overlapping windows: every CID is requested exactly once.
    for start in tqdm(range(0, len(cids), batch_size)):
        batch = cids[start : start + batch_size]
        try:
            frames.append(_fetch(batch))
        except HTTPError:
            # Retry in two smaller requests so that one bad half does not
            # cost the whole batch.
            mid = (len(batch) + 1) // 2
            for half in (batch[:mid], batch[mid:]):
                if not half:
                    continue
                try:
                    frames.append(_fetch(half))
                except HTTPError as e:
                    # Best effort: report and move on instead of silently
                    # swallowing the failure.
                    print(f"error: HTTP {e.code}; skipped {len(half)} CID(s)")

    if not frames:
        # Empty input or every request failed: nothing to merge.
        return pd.DataFrame(columns=["NSC", "SMILES"])

    # NOTE(review): de-duplicating on CanonicalSMILES (rather than on CID)
    # also drops CIDs that share a SMILES string with another CID, and
    # therefore the NSC rows that map to them -- confirm this is intended.
    # keep="last" retains the CID that appears latest in `pubchem_id`.
    smiles = (
        pd.concat(frames)
        .drop_duplicates("CanonicalSMILES", keep="last")
        .sort_values("CID")
        .reset_index(drop=True)
    )

    # rename_axis makes the index -> "NSC" step independent of whatever name
    # the caller's index carries.
    nsc_to_cid = pubchem_id[["CID"]].rename_axis("NSC").reset_index()
    df = nsc_to_cid.merge(smiles, on="CID")[["NSC", "CanonicalSMILES"]]
    df.columns = ["NSC", "SMILES"]

    return df

In [9]:
# Load the PubChem ID table (first CSV column becomes the index), discard rows
# with missing values and order the remainder by PUBCHEM_ID.
pubchem_id = (
    pd.read_csv('../data/nci60PubChemID.csv', index_col=0)
    .dropna()
    .sort_values('PUBCHEM_ID')
)
# The SMILES lookup function reads the IDs from a column named 'CID'.
pubchem_id.columns = ['CID']

In [10]:
# Query PubChem for the SMILES of every CID in pubchem_id (network access needed).
df = get_SMILES_from_pubchemID(pubchem_id)

100%|██████████| 34/34 [00:10<00:00,  3.26it/s]


# Merge SMILES to main table on NSC

In [11]:
# Attach SMILES by NSC (default inner merge: rows without a SMILES are dropped),
# then remove the NSC key column.
base_smiles = base_label.merge(df, on='NSC').drop('NSC', axis=1)
base_smiles

Unnamed: 0,drug_response,cell_line,MECHANISM,SMILES
0,-0.354110,MCF7_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
1,-0.304675,MDAMB231_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
2,-0.222024,HS578T_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
3,1.483613,BT549_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
4,1.509397,T47D_BREAST,Other,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
...,...,...,...,...
854144,1.719118,OVCAR4_OVARY,Other,CC1C(N(C(CC(=O)N1)C2=CC=CC=C2)N=O)C3=CC=CC=C3
854145,-1.697994,OVCAR5_OVARY,Other,CC1C(N(C(CC(=O)N1)C2=CC=CC=C2)N=O)C3=CC=CC=C3
854146,1.419889,OVCAR8_OVARY,Other,CC1C(N(C(CC(=O)N1)C2=CC=CC=C2)N=O)C3=CC=CC=C3
854147,0.681835,SKOV3_OVARY,Other,CC1C(N(C(CC(=O)N1)C2=CC=CC=C2)N=O)C3=CC=CC=C3


# Select DNA 

In [32]:
# Keep only the rows labelled with the 'DNA' mechanism, restricted to the
# three columns used downstream, and renumber the rows from zero.
is_dna = base_smiles['MECHANISM'] == 'DNA'
selected_columns = ['cell_line', 'SMILES', 'drug_response']
df = base_smiles.loc[is_dna, selected_columns].reset_index(drop=True)

# This has 6779 datapoints

In [34]:
# Display the DNA-class subset.
df

Unnamed: 0,cell_line,SMILES,drug_response
0,MCF7_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.703626
1,MDAMB231_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.219032
2,HS578T_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.892792
3,BT549_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-0.877267
4,T47D_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-1.156158
...,...,...,...
6774,ACHN_KIDNEY,CN(C)CCCN1C2=C3C(=C(C=C2)[N+](=O)[O-])NC4=C(C3...,0.626710
6775,CAKI1_KIDNEY,CN(C)CCCN1C2=C3C(=C(C=C2)[N+](=O)[O-])NC4=C(C3...,-0.085429
6776,RXF393_KIDNEY,CN(C)CCCN1C2=C3C(=C(C=C2)[N+](=O)[O-])NC4=C(C3...,-0.193796
6777,SN12C_KIDNEY,CN(C)CCCN1C2=C3C(=C(C=C2)[N+](=O)[O-])NC4=C(C3...,1.555142


# 57 cell lines and 122 drugs

In [35]:
# Number of distinct cell lines in the DNA subset.
len(set(df['cell_line']))

57

In [36]:
# Number of distinct SMILES strings in the DNA subset.
len(set(df['SMILES']))

122

In [16]:
# Number of rows per cell line.
pd.DataFrame(df['cell_line'].value_counts())

Unnamed: 0,cell_line
SKMEL28_SKIN,122
U251MG_CENTRAL_NERVOUS_SYSTEM,122
OVCAR8_OVARY,122
NIHOVCAR3_OVARY,122
KM12_LARGE_INTESTINE,122
HT29_LARGE_INTESTINE,122
HCT116_LARGE_INTESTINE,122
UACC257_SKIN,122
OVCAR4_OVARY,122
UACC62_SKIN,122


In [17]:
# Number of rows per SMILES string.
pd.DataFrame(df['SMILES'].value_counts())

Unnamed: 0,SMILES
CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C(=O)NC(CCC(=O)O)C(=O)O,57
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C5=CC=CC=C5N=C4C3=C2)C=NNC(=O)C(CC6=CNC7=CC=CC=C76)N)O.Cl,57
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C5=CC=CC=C5N=C4C3=C2)C=NC6=CC=CC=C6NC7C8COC(=O)C8C(C9=CC1=C(C=C79)OCO1)C1=CC(=C(C(=C1)OC)O)OC)O.Cl,57
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C5=CC=CC=C5N=C4C3=C2)C=NC6=CC=C(C=C6)NC7C8COC(=O)C8C(C9=CC1=C(C=C79)OCO1)C1=CC(=C(C(=C1)OC)O)OC)O,57
CCC1(C2=C(COC1=O)C(=O)N3CC4=CC5=CC=CC=C5N=C4C3=C2)N,57
...,...
CC1=C(C=CC(=C1)N(CCOS(=O)(=O)C)CCOS(=O)(=O)C)C=O,48
CCC1(C2=C(COC1=O)C(=O)N3CC4=C(C3=C2)N=C5C=C(C(=CC5=C4)OC)OC)O,45
CC1=C2C=C[N+](=CC2=C(C3=C1NC4=CC=CC=C43)C)C.[I-],45
C1C(ON=C1Cl)C(C(=O)O)N,45


In [18]:
# Shuffle the row labels of df in place with a dedicated, fixed-seed generator
# so that the split below is reproducible.
indexes = df.index.tolist()
shuffler = rd.Random(42)
shuffler.shuffle(indexes)

In [19]:
# The last 20% of the shuffled labels form the test set; from the remaining
# 80%, a validation set of the same size as the test set is taken first and
# whatever is left is used for training.
cut = round(len(indexes)*0.8)
test = indexes[cut:]
tmp = indexes[:cut]
n_held_out = len(test)
val, train = tmp[:n_held_out], tmp[n_held_out:]

In [20]:
# Replace each list of shuffled row positions by the matching rows of df.
# The names are rebound from lists to DataFrames, so this cell can only be
# run once after the split cell.
train, test, val = (df.iloc[rows] for rows in (train, test, val))

In [21]:
# Report the (rows, columns) shape of every split.
for split_name, split_frame in (('train', train), ('val', val), ('test', test)):
    print(split_name + ': ' + str(split_frame.shape))

train: (4067, 3)
val: (1356, 3)
test: (1356, 3)


In [22]:
# Per-cell-line row counts of each split, aligned on the cell line name.
# Inner joins keep only the cell lines that occur in all three splits.
# Note that `df` is rebound here from the DNA subset to this summary table.
count_columns = [
    split_frame['cell_line'].value_counts().rename(split_name)
    for split_name, split_frame in (('train', train), ('val', val), ('test', test))
]
df = count_columns[0].to_frame()
for counts in count_columns[1:]:
    df = df.join(counts, how='inner')
df.columns = ['train', 'val', 'test']
# Append the column totals as a final row.
df.loc['total'] = df.sum(axis=0)

In [25]:
# Display the per-cell-line counts for each split (the 'total' row holds the column sums).
df

Unnamed: 0,train,val,test
EKVX_LUNG,84,17,21
SKMEL2_SKIN,82,15,22
NCIH460_LUNG,82,20,20
M14_SKIN,81,21,17
UACC257_SKIN,77,21,24
SW620_LARGE_INTESTINE,77,23,21
NCIH522_LUNG,77,15,29
ACHN_KIDNEY,76,22,21
DU145_PROSTATE,76,17,18
HT29_LARGE_INTESTINE,75,23,24


In [24]:
# Export is disabled. Uncomment the calls below to write each split as a
# tab-separated file without header or index.
# test.to_csv(
#     '../DrugCell/data_rcellminer/test_rcell_wo_other.txt', sep='\t', 
#     header=None, index=None
# )

# val.to_csv(
#     '../DrugCell/data_rcellminer/val_rcell_wo_other.txt', sep='\t', 
#     header=None, index=None
# )

# train.to_csv(
#     '../DrugCell/data_rcellminer/train_rcell_wo_other.txt', sep='\t', 
#     header=None, index=None
# )