# Process NCI compound data

In [1]:
import pandas as pd
import re
import numpy as np
NSC_REF_DIR = "../data/cancer_cell/nci_nsc_name.tab"
DATA_DIR = "../data/cancer_cell/mmc7.csv"

# Autoreload modules
%load_ext autoreload
%autoreload 2

# Add local path
import sys
sys.path.append("/Users/Jphild/Documents/Papers/_***Thesis/MRP7Pred/")

from mrp7pred.utils import standardize_smiles

import warnings
warnings.filterwarnings("ignore")

#### 1. Load NSC reference data

In [2]:
df_ref = pd.read_csv(NSC_REF_DIR, sep="\t")

In [3]:
df_ref.head()

Unnamed: 0,E_NSC,E_NAME,E_NAMESET(0),E_NAMESET(1),E_NAMESET(2),E_NAMESET(3),E_NAMESET(4),E_NAMESET(5),E_NAMESET(6),E_NAMESET(7),...,E_NAMESET(60),E_NAMESET(61),E_NAMESET(62),E_NAMESET(63),E_NAMESET(64),E_NAMESET(65),E_NAMESET(66),E_NAMESET(67),E_CAS,E_SMILES
0,186,"8-hydroxy-3,4,5-trimethyl-6-oxo-4,6-dihydro-3H...","8-hydroxy-3,4,5-trimethyl-6-oxo-4,6-dihydro-3H...","3H-2-Benzopyran-7-carboxylic acid, 4,6-dihydro...",Antimycin,Citriain,Citrinin,,,,...,,,,,,,,,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C
1,740,"N-(4-(((2,4-diamino-6-pteridinyl)methyl)(methy...","N-(4-(((2,4-diamino-6-pteridinyl)methyl)(methy...",Amethopterin,CL 14377,"EMT 25,299","Glutamic acid, {N-[p-[[(2,} 4-diamino-6-pterid...",HDMTX,"L-Glutamic acid, {N-[4-[[2,} 4-diamino-6-pteri...",Metatrexan,...,,,,,,,,,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...
2,742,O-(1lambda~5~-diazenylideneacetyl)serine,O-(1lambda~5~-diazenylideneacetyl)serine (ACD/...,o-Diazoacetyl-L-serine,"Acetic acid, diazo-, ester with serine",Azaserin,Azaserine(USAN),AZASERINE,AZS,C.I. 337,...,,,,,,,,,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O
3,750,4-((methylsulfonyl)oxy)butyl methanesulfonate,4-((methylsulfonyl)oxy)butyl methanesulfonate ...,AN 33501,Busulfan,Busulphan,Buzulfan,C.B. 2041,Citosulfan,CB 2041,...,,,,,,,,,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O
4,752,2-amino-9H-purin-6-yl hydrosulfide,2-amino-9H-purin-6-yl hydrosulfide (ACD/Name),2-amino-9H-purine-6-thiol (ACD/Name),BW 5071,"Guanine, thio- (VAN)",Lanvis,NSC 752,"Purine-6(1H)-thione, 2-amino-","Purine-6-thiol, 2-amino-",...,,,,,,,,,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2


In [67]:
df_ref.shape

(1413, 72)

In [68]:
df_ref.loc[0, "E_NAMESET(67)"] is np.nan

True

In [69]:
# Find the simplest names
df_ref["NAME_SIMPLE"] = np.nan
for index in range(len(df_ref)):
    name_l = [v for v in df_ref.loc[index, "E_NAMESET(0)":"E_NAMESET(67)"] if (v is not np.nan and v != "???")]
    if len(name_l) < 1: # no name
        simplest_name == f"NSC{df_ref.loc[index, 'E_NSC']}"
    else:
        simplest_name = sorted(name_l, key=len)[0]
    df_ref.loc[index, "NAME_SIMPLE"] = simplest_name
    
df_ref[["E_NSC", "NAME_SIMPLE"]]

Unnamed: 0,E_NSC,NAME_SIMPLE
0,186,Citriain
1,740,MTX
2,742,AZS
3,750,GT 41
4,752,TG
...,...,...
1408,698691,"N-ethyl-5-(2-(2-(5-(ethylamino)-1,3,4-thiadiaz..."
1409,698790,26-(hexopyranosyloxy)-22-methoxyfurost-5-en-3-...
1410,698792,3-((2-O-(6-deoxyhexopyranosyl)-3-O-hexopyranos...
1411,699477,3-((2-O-(6-deoxyhexopyranosyl)-3-O-hexopyranos...


In [81]:
df_ref = df_ref.rename(columns={
    "E_NSC" : "nsc_id",
    "E_SMILES" : "smiles",
    "E_CAS" : "cas",
    "NAME_SIMPLE" : "name"
})
df_ref.head()

Unnamed: 0,nsc_id,E_NAME,E_NAMESET(0),E_NAMESET(1),E_NAMESET(2),E_NAMESET(3),E_NAMESET(4),E_NAMESET(5),E_NAMESET(6),E_NAMESET(7),...,E_NAMESET(61),E_NAMESET(62),E_NAMESET(63),E_NAMESET(64),E_NAMESET(65),E_NAMESET(66),E_NAMESET(67),cas,smiles,name
0,186,"8-hydroxy-3,4,5-trimethyl-6-oxo-4,6-dihydro-3H...","8-hydroxy-3,4,5-trimethyl-6-oxo-4,6-dihydro-3H...","3H-2-Benzopyran-7-carboxylic acid, 4,6-dihydro...",Antimycin,Citriain,Citrinin,,,,...,,,,,,,,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,Citriain
1,740,"N-(4-(((2,4-diamino-6-pteridinyl)methyl)(methy...","N-(4-(((2,4-diamino-6-pteridinyl)methyl)(methy...",Amethopterin,CL 14377,"EMT 25,299","Glutamic acid, {N-[p-[[(2,} 4-diamino-6-pterid...",HDMTX,"L-Glutamic acid, {N-[4-[[2,} 4-diamino-6-pteri...",Metatrexan,...,,,,,,,,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,MTX
2,742,O-(1lambda~5~-diazenylideneacetyl)serine,O-(1lambda~5~-diazenylideneacetyl)serine (ACD/...,o-Diazoacetyl-L-serine,"Acetic acid, diazo-, ester with serine",Azaserin,Azaserine(USAN),AZASERINE,AZS,C.I. 337,...,,,,,,,,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,AZS
3,750,4-((methylsulfonyl)oxy)butyl methanesulfonate,4-((methylsulfonyl)oxy)butyl methanesulfonate ...,AN 33501,Busulfan,Busulphan,Buzulfan,C.B. 2041,Citosulfan,CB 2041,...,,,,,,,,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,GT 41
4,752,2-amino-9H-purin-6-yl hydrosulfide,2-amino-9H-purin-6-yl hydrosulfide (ACD/Name),2-amino-9H-purine-6-thiol (ACD/Name),BW 5071,"Guanine, thio- (VAN)",Lanvis,NSC 752,"Purine-6(1H)-thione, 2-amino-","Purine-6-thiol, 2-amino-",...,,,,,,,,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,TG


#### 2. Load Cancer Cell paper data

This data includes the Pearson correlation coefficients for **gene mRNA expression (dCP)** and **drug sensitivity (dlogGI50)**

Positive means more ABC transporter expression, more sensitive the tested cells become to the compound.

Negative means more ABC transporter expression, less sensitive the cells be, which indicates potential substrates.

In [82]:
df = pd.read_csv(DATA_DIR)

In [83]:
df.head()

Unnamed: 0,NSC No.,ABC-G8,ABC-G5,ABC-G4,ABC-G2,ABC-G1,ABC-F3,ABC-F2,ABC-F1,ABC-E1,...,ABC-A10,ABC-A9,ABC-A8,ABC-A7,ABC-A6,ABC-A5,ABC-A4,ABC-A3,ABC-A2,ABC-A1
0,2013,0.04,0.16,0.0,-0.01,-0.14,0.06,0.11,-0.22,-0.05,...,-0.08,-0.04,-0.03,0.04,-0.12,-0.02,-0.17,0.2,0.02,0.05
1,3053,-0.05,0.04,0.2,0.22,0.24,-0.21,-0.28,-0.19,-0.34,...,-0.17,0.07,0.28,-0.12,0.0,-0.1,0.14,-0.11,-0.13,-0.27
2,3970,-0.07,-0.03,-0.04,0.16,-0.24,-0.1,-0.23,-0.15,-0.39,...,-0.35,0.14,0.06,0.09,0.07,-0.07,-0.08,0.03,-0.08,-0.29
3,5890,-0.01,0.21,-0.04,0.21,-0.28,0.24,-0.03,-0.11,-0.21,...,-0.35,0.14,-0.09,0.1,0.01,0.0,0.02,0.27,0.19,-0.13
4,8120,0.14,0.02,0.05,-0.1,0.15,-0.02,0.1,0.08,0.14,...,0.09,-0.15,-0.12,-0.05,-0.21,0.07,0.07,-0.18,-0.06,0.0


In [84]:
df.shape

(1429, 49)

In [85]:
df_c10 = df[["NSC No.", "ABC-C10"]]
df_c10 = df_c10.rename(columns={"NSC No." : "nsc_id", "ABC-C10" : "abcc10"})
df_c10["nsc_id"] = df_c10["nsc_id"].apply(lambda x: ''.join(re.findall(r"\((\d+)\)", x)) if not x.isnumeric() else x)


In [86]:
df_c10.head()

Unnamed: 0,nsc_id,abcc10
0,2013,-0.21
1,3053,-0.18
2,3970,-0.05
3,5890,0.01
4,8120,-0.01


In [87]:
# Test if all nsc_id's are numeric
all(df_c10['nsc_id'].str.isnumeric())

True

#### 3. Map compound info (name, smiles) to nsc_id

In [158]:
df_join = df_ref\
    .set_index("nsc_id")\
    .join(
        df_c10.set_index("nsc_id")
    )[["name", "cas", "smiles", "abcc10"]]

In [159]:
df_join.head()

Unnamed: 0_level_0,name,cas,smiles,abcc10
nsc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
186,Citriain,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,
740,MTX,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,
742,AZS,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,
750,GT 41,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,
752,TG,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,


In [160]:
df_join = df_join.reset_index()
df_join.head()

Unnamed: 0,nsc_id,name,cas,smiles,abcc10
0,186,Citriain,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,
1,740,MTX,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,
2,742,AZS,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,
3,750,GT 41,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,
4,752,TG,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,


In [162]:
d = df_c10.set_index("nsc_id").to_dict("index")
df_join["abcc10"] = df_join["nsc_id"].apply(lambda x: d[str(x)].get("abcc10"))

In [163]:
df_join.shape

(1413, 5)

In [164]:
df_join.head()

Unnamed: 0,nsc_id,name,cas,smiles,abcc10
0,186,Citriain,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,-0.23
1,740,MTX,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,0.12
2,742,AZS,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,-0.03
3,750,GT 41,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,0.04
4,752,TG,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,0.08


In [165]:
df_join.to_csv("../data/joined_abcc10_nsc.csv")

#### 4. Identify potential substrates/non-substrates

In [4]:
THRESHOLD = -0.20

df_join = pd.read_csv("../data/cancer_cell/joined_abcc10_nsc_nscid-name.csv", index_col=0)
df_join.head()

Unnamed: 0,nsc_id,name,cas,smiles,abcc10,nsc_id_name
0,186,Citriain,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,-0.23,NSC186
1,740,MTX,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,0.12,NSC740
2,742,AZS,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,-0.03,NSC742
3,750,GT 41,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,0.04,NSC750
4,752,TG,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,0.08,NSC752


In [5]:
df_substrate = df_join[df_join["abcc10"]<=THRESHOLD]
df_nonsubstrate = df_join[df_join["abcc10"]>THRESHOLD]

In [6]:
len(df_substrate), len(df_nonsubstrate)

(101, 1312)

Count the results:

|Threshold|Substrate|Non-substrate|
|---|---|---|
|-0.20|103|1326|
|-0.25|51|1362|
|-0.30|19|1394|

#### 5. Check duplicates with previous dataset

Use standardized smiles as keys

In [7]:
df_previous = pd.read_csv("../data/manual/merged.csv", index_col=0)
df_previous.head()

Unnamed: 0,name,smiles,label
0,paclitaxel,CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)...,1
1,vincristine,CCC1(CC2CC(C3=C(CCN(C2)C1)C4=CC=CC=C4N3)(C5=C(...,1
2,LTC4,CCCCCC=CCC=CC=CC=CC(C(CCCC(=O)O)O)SCC(C(=O)NCC...,1
3,E217bG,CC12CCC3C(C1CCC2OC4C(C(C(C(O4)C(=O)O)O)O)O)CCC...,1
4,gemcitabine,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,1


In [23]:
df_nci = pd.read_csv("../data/cancer_cell/joined_abcc10_nsc_nscid-name.csv", index_col=0)
df_nci.head()

Unnamed: 0,nsc_id,name,cas,smiles,abcc10,nsc_id_name
0,186,Citriain,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,-0.23,NSC186
1,740,MTX,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,0.12,NSC740
2,742,AZS,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,-0.03,NSC742
3,750,GT 41,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,0.04,NSC750
4,752,TG,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,0.08,NSC752


In [24]:
df_nci = df_nci.drop("name", axis=1).rename(columns={"nsc_id_name" : "name"})
df_nci.head()

Unnamed: 0,nsc_id,cas,smiles,abcc10,name
0,186,518-75-2,CC2=C1[C@H]([C@H](OC=C1C(=C(C(=O)O)C2=O)O)C)C,-0.23,NSC186
1,740,59-05-2,NC1=C2C(=NC(=N1)N)N=CC(=N2)CN(C)C3=CC=C(C(N[C@...,0.12,NSC740
2,742,115-02-6,C([C@H](COC(=O)C=[N+]=[N-])N)(=O)O,-0.03,NSC742
3,750,55-98-1,C[S](=O)(=O)OCCCCO[S](C)(=O)=O,0.04,NSC750
4,752,154-42-7,SC1=C2C(=NC(=N1)N)[NH]C=N2,0.08,NSC752


In [25]:
THRESHOLD = -0.2
df_nci["label"] = df_nci["abcc10"].apply(lambda x: 1 if x <= THRESHOLD else 0)

In [26]:
df_all = pd.concat([df_previous, df_nci[["name", "smiles", "label"]]])
df_all.head()

Unnamed: 0,name,smiles,label
0,paclitaxel,CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)...,1
1,vincristine,CCC1(CC2CC(C3=C(CCN(C2)C1)C4=CC=CC=C4N3)(C5=C(...,1
2,LTC4,CCCCCC=CCC=CC=CC=CC(C(CCCC(=O)O)O)SCC(C(=O)NCC...,1
3,E217bG,CC12CCC3C(C1CCC2OC4C(C(C(C(O4)C(=O)O)O)O)O)CCC...,1
4,gemcitabine,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,1


In [27]:
df_all.shape

(1529, 3)

In [28]:
df_all["std_smiles"] = df_all["smiles"].apply(standardize_smiles)
df_all.head()

RDKit ERROR: [08:45:01] Explicit valence for atom # 7 Sn, 5, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 28 N, 4, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 1 Sn, 5, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 25 B, 7, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 18 N, 4, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 31 N, 4, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 32 N, 4, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 0 Pb, 5, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 0 Tl, 7, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence for atom # 2 Ga, 9, is greater than permitted
RDKit ERROR: [08:45:01] Explicit valence fo

Unnamed: 0,name,smiles,label,std_smiles
0,paclitaxel,CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)...,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...
1,vincristine,CCC1(CC2CC(C3=C(CCN(C2)C1)C4=CC=CC=C4N3)(C5=C(...,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...
2,LTC4,CCCCCC=CCC=CC=CC=CC(C(CCCC(=O)O)O)SCC(C(=O)NCC...,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...
3,E217bG,CC12CCC3C(C1CCC2OC4C(C(C(C(O4)C(=O)O)O)O)O)CCC...,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...
4,gemcitabine,C1=CN(C(=O)N=C1N)C2C(C(C(O2)CO)O)(F)F,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1


In [29]:
# Count smiles cannot be standardized
len(df_all[df_all["std_smiles"]=="error"])

23

In [30]:
df_all[df_all["std_smiles"]=="error"]

Unnamed: 0,name,smiles,label,std_smiles
477,NSC353527,[NH+](CC)(CC)CC.[Sn]([O+]=C([NH2+]C1=CC=C(OCC)...,0,error
643,NSC623527,CCCCCCCCCCCCCCCCCCOC[C@@H](COCCOCC[N]1(CCCC1)C...,0,error
654,NSC624659,[Br-][Sn]2([Br-])([C-]1=CC=CC=C1)[C-]4=C([NH+]...,0,error
745,NSC633558,[S@+]12CC[S@+]3CC[S@+]4CC[S@@+]5CC[S@+]6CC[S@@...,0,error
829,NSC639966,C(CCCCCCCCCO[P](=O)(O)OC1CC[N](CC1)(C)C)CCCCCCCC,0,error
885,NSC643826,O(C[C@@H](N(C)CCCCCCCCCCCCCCCCCC)CO[P](=O)(O)O...,0,error
886,NSC643827,N(C[C@@H](O[P](=O)(O)OCC[N](C)(C)C)COC)(CCCCCC...,0,error
887,NSC643828,[C@H](OC)(CN(CCCCCCCCCCCCCCCCCC)C)CO[P](=O)(O)...,0,error
899,NSC644908,[Pb]4([C-]1=CC=CC=C1)([C-]2=CC=CC=C2)([C-]3=CC...,0,error
900,NSC644916,[Tl+3]1([O+]=C([CH-]C(=[O+]1)C2=CC=CC=C2)C3=CC...,0,error


In [31]:
df_all = df_all[df_all["std_smiles"]!="error"]
df_all.shape

(1506, 4)

In [32]:
df_all = df_all.drop(["smiles"], axis=1)

In [33]:
df_all.head()

Unnamed: 0,name,label,std_smiles
0,paclitaxel,1,CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(...
1,vincristine,1,CCC1(O)CC2CN(CCc3c([nH]c4ccccc34)C(C(=O)OC)(c3...
2,LTC4,1,CCCCCC=CCC=CC=CC=CC(SCC(NC(=O)CCC(N)C(=O)O)C(=...
3,E217bG,1,CC12CCC3c4ccc(O)cc4CCC3C1CCC2OC1OC(C(=O)O)C(O)...
4,gemcitabine,1,Nc1ccn(C2OC(CO)C(O)C2(F)F)c(=O)n1


In [34]:
df_all.to_csv("../data/all_compounds_with_std_smiles.csv")