In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create PubChem_id list
pubchem_id = pd.read_csv('../DrugCell/data/pubchem_id_by_nsc.csv', index_col=0).dropna()
pubchem_id
pubchem_id['PUBCHEM_ID'].astype(int).to_csv('../DrugCell/data/pubchem_id.csv', index=None, header=None)
pubchem_id.head()

Unnamed: 0,PUBCHEM_ID
17,219123.0
295,4775.0
353,24180741.0
384,54599265.0
534,16685698.0


In [21]:
#  Get SMILES from PubChem ID  https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi
tmp = pd.read_table('../DrugCell/data/SMILES_from_PubchemID.txt', header=None)
tmp.index = pubchem_id.index
tmp = tmp.drop(0, axis=1)
tmp.shape

(16522, 1)

In [4]:
train = pd.read_table('../DrugCell/data/drugcell_train.txt', header=None)
drugAnnot = pd.DataFrame()

for i in set(train[1]):
    drugAnnot = pd.concat([drugAnnot, tmp[tmp[1] == i]])

drugAnnot = drugAnnot.sort_index()
drugAnnot.columns = ['SMILES']

In [31]:
len(set(tmp[1]) & set(train[1]))

37

In [6]:
nci60Act = pd.read_csv('../DrugCell/data/nci60Act.csv', index_col=0)

# need to change the name to match to the table.
nci60Act = nci60Act.rename(
    columns={
        'BR:HS 578T': 'BR:HS 578 T', 
        'BR:T-47D': 'BR:T47D',
        'CNS:SF-295': 'CNS:SF295',
        'CNS:SF-268': 'CNS:SF268',
        'CNS:SF-539': 'CNS:SF539',
        'CNS:SNB-19': 'CNS:SNB19',
        'CO:HT29': 'CO:HT 29',
        'CO:SW-620': 'SW620',
        'LE:HL-60(TB)': 'LE:HL-60',
        'ME:LOX IMVI': 'ME:LOXIMVI',
        'LC:A549/ATCC': 'LC:A549',
        'OV:IGROV1': 'OV:IGROV 1',
        'RE:RXF 393': 'RE:RXF-393',
        'RE:TK-10': 'RE:TK10' 
    }
)

# These are not listed in the table and train data
nci60Act = nci60Act.drop([
    'ME:MDA-N', 
    'OV:NCI/ADR-RES',
    'RE:UO-31'
], axis=1)

In [7]:
col = []
for i in nci60Act.columns:
    tmp = i.split(':')
    if len(tmp) == 2:
        col.append(tmp[1])
    else:
        col.append(tmp[0])

nci60Act.columns = col

In [8]:
sanger_cell_lines = pd.read_csv('../DrugCell/data/sanger_to_ccle.csv', index_col=0)

In [9]:
ccle_dict = {
    i.upper():j for i,j in zip(
        sanger_cell_lines['Sanger name'], 
        sanger_cell_lines['CCLE Label']
    )
}

In [10]:
nci60Act.columns = [
    ccle_dict[i.replace(' ', '-')] for i in nci60Act.columns
]

In [11]:
sanger_cell_lines[sanger_cell_lines['Sanger name'].str.contains('RX')]

Unnamed: 0,Sanger name,CCLE Label
567,RXF-393,RXF393_KIDNEY


In [12]:
base = pd.DataFrame()
for i in nci60Act.columns:
    tmp = pd.DataFrame((nci60Act[i].dropna()))
    tmp.columns = ['Drug Response']
    tmp['Cell Line'] = [i]*len(tmp)
    tmp = pd.merge(tmp, drugAnnot, left_index=True, right_index=True)
    base = pd.concat([base, tmp])
    
base = base[['Cell Line', 'SMILES', 'Drug Response']]
base = base.reset_index(drop=True)

In [13]:
base.head()

Unnamed: 0,Cell Line,SMILES,Drug Response
0,MCF7_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.703626
1,MCF7_BREAST,CCC(=O)OCN1C(=O)C=CC1=O,-0.496895
2,MCF7_BREAST,CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C...,1.178418
3,MCF7_BREAST,C1=CC(=CC=C1CC2=CC=C(C=C2)[As](=O)(O)O)[As](=O...,1.219245
4,MCF7_BREAST,C1=CC=C2C=C3C(=CC2=C1)NC(=N3)CNC4=CC=C(C=C4)C(...,2.688648


In [14]:
test = pd.DataFrame()
for i in set(train[0]):
    t = (base[base['Cell Line'] == i])
    if len(t) > 0:
        test = pd.concat([test, t])

test = test.reset_index(drop=True)

In [15]:
test.head()

Unnamed: 0,Cell Line,SMILES,Drug Response
0,BT549_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-0.877267
1,BT549_BREAST,CCC(=O)OCN1C(=O)C=CC1=O,-0.403443
2,BT549_BREAST,CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C...,-0.76738
3,BT549_BREAST,C1=CC(=CC=C1CC2=CC=C(C=C2)[As](=O)(O)O)[As](=O...,-0.535425
4,BT549_BREAST,C1=CC=C2C=C3C(=CC2=C1)NC(=N3)CNC4=CC=C(C=C4)C(...,-0.301961


In [16]:
test.to_csv(
    '../DrugCell/data/rcellminer_test.txt', sep='\t', 
    header=None, index=None
)

In [17]:
test

Unnamed: 0,Cell Line,SMILES,Drug Response
0,BT549_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,-0.877267
1,BT549_BREAST,CCC(=O)OCN1C(=O)C=CC1=O,-0.403443
2,BT549_BREAST,CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C...,-0.767380
3,BT549_BREAST,C1=CC(=CC=C1CC2=CC=C(C=C2)[As](=O)(O)O)[As](=O...,-0.535425
4,BT549_BREAST,C1=CC=C2C=C3C(=CC2=C1)NC(=N3)CNC4=CC=C(C=C4)C(...,-0.301961
...,...,...,...
1962,DU145_PROSTATE,COC1=NC(=NC2=C1N=CN2[C@H]3[C@H]([C@@H]([C@H](O...,-0.174699
1963,DU145_PROSTATE,CNC(=O)C1=CC=CC=C1SC2=CC3=C(C=C2)C(=NN3)/C=C/C...,-0.441911
1964,DU145_PROSTATE,CC1=C2[C@H](C(=O)[C@@]3([C@H](C[C@@H]4[C@](C3[...,0.514966
1965,DU145_PROSTATE,COC1=CC(=CC(=C1O)OC)[C@H]2[C@@H]3C(COC3=O)C(C4...,0.345417


In [18]:
stat = (pd.DataFrame([len(set(test['Cell Line'])), len(set(test['SMILES']))]).T)
stat.columns = ['unique cell line', 'unique SMILES']
stat.index = ['num']

In [20]:
print(stat.to_markdown())

|     |   unique cell line |   unique SMILES |
|:----|-------------------:|----------------:|
| num |                 55 |              37 |
