In [1]:
import pandas as pd
import numpy as np

In [2]:
# Create PubChem_id list
pubchem_id = pd.read_csv('../DrugCell/data/pubchem_id_by_nsc.csv', index_col=0).dropna()
pubchem_id
pubchem_id['PUBCHEM_ID'].astype(int).to_csv('../DrugCell/data/pubchem_id.csv', index=None, header=None)
pubchem_id.head()

Unnamed: 0,PUBCHEM_ID
17,219123.0
295,4775.0
353,24180741.0
384,54599265.0
534,16685698.0


In [3]:
#  Get SMILES from PubChem ID  https://pubchem.ncbi.nlm.nih.gov/idexchange/idexchange.cgi
tmp = pd.read_table('../DrugCell/data/SMILES_from_PubchemID.txt', header=None)
tmp.index = pubchem_id.index
tmp = tmp.drop(0, axis=1)
tmp.head()

Unnamed: 0,1
17,CCCCCCCCCCCCCCCC1=C(C=CC(=C1)O)N
295,C1=CC=C(C=C1)CCCC(=O)O
353,CCN(CC)CCCNC1=C2C=C(C=CC2=NC3=C1C=CC(=C3)Cl)OC.Cl
384,CCCCCN(CCCCC)CCCNC1=C2CCCCC2=NC3=C1C=C(C=C3)Cl...
534,CC(=O)O[Hg].C1=CC=C2C(=C1)C=CC3=CC=CC=C32


In [4]:
train = pd.read_table('../DrugCell/data/drugcell_train.txt', header=None)
drugAnnot = pd.DataFrame()

for i in set(train[1]):
    drugAnnot = pd.concat([drugAnnot, tmp[tmp[1] == i]])

drugAnnot = drugAnnot.sort_index()
drugAnnot.columns = ['SMILES']

In [5]:
drugAnnot.head()

Unnamed: 0,SMILES
740,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...
19630,CCC(=O)OCN1C(=O)C=CC1=O
26980,CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C...
48300,C1=CC(=CC=C1CC2=CC=C(C=C2)[As](=O)(O)O)[As](=O...
60043,C1=CC=C2C=C3C(=CC2=C1)NC(=N3)CNC4=CC=C(C=C4)C(...


In [6]:
nci60Act = pd.read_csv('../DrugCell/data/nci60Act.csv', index_col=0)
nci60Act = nci60Act.rename(
    columns={
        'BR:HS 578T': 'BR:HS 578 T', 
        'BR:T-47D': 'BR:T47D',
        'CNS:SF-295': 'CNS:SF295',
        'CNS:SNB-19': 'CNS:SNB19',
        'CO:HT29': 'CO:HT 29',
        'CO:SW-620': 'SW620',
        'LE:HL-60(TB)': 'LE:HL-60',
        'ME:LOX IMVI': 'ME:LOXIMVI',
        'LC:A549/ATCC': 'LC:A549',
        'OV:IGROV1': 'OV:IGROV 1'
    }
)

nci60Act = nci60Act.drop([
    'CNS:SF-268', 'CNS:SF-539', 'CNS:SNB-75', 
    'CO:HCC-2998', 'LE:CCRF-CEM', 'ME:M14',
    'ME:MDA-N', 'LC:EKVX', 'LC:HOP-62', 
    'LC:HOP-92', 'OV:OVCAR-5', 'OV:NCI/ADR-RES',
    'RE:RXF 393', 'RE:SN12C', 'RE:TK-10', 'RE:UO-31'
], axis=1)

In [7]:
col = []
for i in nci60Act.columns:
    tmp = i.split(':')
    if len(tmp) == 2:
        col.append(tmp[1])
    else:
        col.append(tmp[0])

nci60Act.columns = col

In [8]:
sanger_cell_lines = pd.read_table('../DrugCell/data/sanger_cell_lines.txt')[[
    'Sanger name', 'CCLE Label'
]].dropna().reset_index(drop=True)

In [9]:
ccle_dict = {
    i.upper():j for i,j in zip(
        sanger_cell_lines['Sanger name'], 
        sanger_cell_lines['CCLE Label']
    )
}

In [10]:
sanger_cell_lines.head()

Unnamed: 0,Sanger name,CCLE Label
0,DMS-53,DMS53_LUNG
1,SW1116,SW1116_LARGE_INTESTINE
2,UM-UC-3,UMUC3_URINARY_TRACT
3,HOS,HOS_BONE
4,RVH-421,RVH421_SKIN


In [11]:
nci60Act.columns = [
    ccle_dict[i.replace(' ', '-')] for i in nci60Act.columns
]

In [12]:
base = pd.DataFrame()
for i in nci60Act.columns:
    tmp = pd.DataFrame((nci60Act[i].dropna()))
    tmp.columns = ['Drug Response']
    tmp['Cell Line'] = [i]*len(tmp)
    tmp = pd.merge(tmp, drugAnnot, left_index=True, right_index=True)
    base = pd.concat([base, tmp])
    
base = base[['Cell Line', 'SMILES', 'Drug Response']]
base = base.reset_index(drop=True)

In [13]:
base.head()

Unnamed: 0,Cell Line,SMILES,Drug Response
0,MCF7_BREAST,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.703626
1,MCF7_BREAST,CCC(=O)OCN1C(=O)C=CC1=O,-0.496895
2,MCF7_BREAST,CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C...,1.178418
3,MCF7_BREAST,C1=CC(=CC=C1CC2=CC=C(C=C2)[As](=O)(O)O)[As](=O...,1.219245
4,MCF7_BREAST,C1=CC=C2C=C3C(=CC2=C1)NC(=N3)CNC4=CC=C(C=C4)C(...,2.688648


In [14]:
test = pd.DataFrame()
for i in set(train[0]):
    t = (base[base['Cell Line'] == i])
    if len(t) > 0:
        test = pd.concat([test, t])

test = test.reset_index(drop=True)

In [15]:
test.head()

Unnamed: 0,Cell Line,SMILES,Drug Response
0,786O_KIDNEY,CN(CC1=CN=C2C(=N1)C(=NC(=N2)N)N)C3=CC=C(C=C3)C...,0.682664
1,786O_KIDNEY,CCC(=O)OCN1C(=O)C=CC1=O,0.924344
2,786O_KIDNEY,CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C...,0.328549
3,786O_KIDNEY,C1=CC(=CC=C1CC2=CC=C(C=C2)[As](=O)(O)O)[As](=O...,0.945579
4,786O_KIDNEY,C1=CC=C2C=C3C(=CC2=C1)NC(=N3)CNC4=CC=C(C=C4)C(...,-0.301961


In [16]:
test.to_csv(
    '../DrugCell/data/rcellminer_test.txt', sep='\t', 
    header=None, index=None
)