# Create the csv files containing CID, SMILES notations and labels

In [1]:
import pandas as pd 

# Show every column when a frame is rendered, so wide tables are not truncated
pd.set_option('display.max_columns', None)

# Read the PubChem BioAssay AID 2732 export (semicolon-separated file)
# Source: https://pubchem.ncbi.nlm.nih.gov/bioassay/2732
df = pd.read_csv('input/pubchem_CHOP_bioassay.csv', sep=';')

# Report the dimensions, then preview the first rows
print('Shape of the data frame: ', df.shape)

df.head()

Shape of the data frame:  (219165, 10)


Unnamed: 0,PUBCHEM_RESULT_TAG,SID,CID,SMILES,target,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Pct Inhibition,Luminescence Value
0,1,17416326,2840784.0,CCCCC(=O)NC1=C(C2=C(S1)C(=C(CC2)C=O)Cl)C(=O)OCC,Active,100,,,116.56,35520
1,2,14741113,2010180.0,CC1=CC(=CC=C1)CS(=O)(=O)C2=NN=C(O2)[C@H](CC3=C...,Active,99,,,115.34,69880
2,3,14742004,3696529.0,CC1=CC=C(C=C1)C(=O)C2=C(OC3=C2C(=O)C(=O)C4=CC=...,Active,99,,,115.21,39160
3,4,863128,664366.0,CC[C@@H](C)[C@@H](C1=NN=C(O1)S(=O)(=O)CC2=C(C=...,Active,99,,,114.88,144040
4,5,17409458,6418635.0,C1=CC=C(C=C1)C2=NC(=C(N=N2)C(F)(F)F)SC3=CC=C(C...,Active,98,,,114.24,17400


In [2]:
# Summarise the column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219165 entries, 0 to 219164
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PUBCHEM_RESULT_TAG         219165 non-null  int64  
 1   SID                        219165 non-null  int64  
 2   CID                        219164 non-null  float64
 3   SMILES                     219164 non-null  object 
 4   target                     219165 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE     219165 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL       0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT  0 non-null       float64
 8   Pct Inhibition             219165 non-null  object 
 9   Luminescence Value         219165 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 16.7+ MB


In [3]:
# Remove rows that are exact duplicates across all columns.
# drop_duplicates() returns a new frame rather than modifying df in place,
# so the result has to be assigned back for the removal to take effect.
df = df.drop_duplicates()
df.shape  # a row count equal to the one printed after loading means no duplicates

(219165, 10)

In [4]:
# Discard records that have no compound identifier (CID is NaN)
df = df.loc[df['CID'].notna()]
df.shape

(219164, 10)

In [5]:
# Turn the CID column from float into integer.
# The frame is a filtered selection of an earlier one, so take an explicit
# copy first; the column assignment below then targets an independent frame.
df = df.copy()
# No missing CIDs remain at this point, so the cast to int64 is possible
df['CID'] = df['CID'].astype('int64') 
# Re-check dtypes and non-null counts after the cast
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 219164 entries, 0 to 219164
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PUBCHEM_RESULT_TAG         219164 non-null  int64  
 1   SID                        219164 non-null  int64  
 2   CID                        219164 non-null  int64  
 3   SMILES                     219164 non-null  object 
 4   target                     219164 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE     219164 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL       0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT  0 non-null       float64
 8   Pct Inhibition             219164 non-null  object 
 9   Luminescence Value         219164 non-null  int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 18.4+ MB


In [6]:
# Remove compounds whose CID appears more than once.
# keep=False discards every row of a repeated CID, not just the extra copies.
# NOTE(review): presumably intentional (e.g. to avoid conflicting labels) - confirm.
df = df.drop_duplicates(subset=['CID'], keep=False) 
df.shape  # 988 rows removed (219,164 -> 218,176)

(218176, 10)

In [7]:
# Keep only the identifier, the structure string and the activity label
df = df[['CID', 'SMILES', 'target']]
df.head()

Unnamed: 0,CID,SMILES,target
0,2840784,CCCCC(=O)NC1=C(C2=C(S1)C(=C(CC2)C=O)Cl)C(=O)OCC,Active
1,2010180,CC1=CC(=CC=C1)CS(=O)(=O)C2=NN=C(O2)[C@H](CC3=C...,Active
2,3696529,CC1=CC=C(C=C1)C(=O)C2=C(OC3=C2C(=O)C(=O)C4=CC=...,Active
3,664366,CC[C@@H](C)[C@@H](C1=NN=C(O1)S(=O)(=O)CC2=C(C=...,Active
4,6418635,C1=CC=C(C=C1)C2=NC(=C(N=N2)C(F)(F)F)SC3=CC=C(C...,Active


In [8]:
# Count how many compounds carry each label in the 'target' column
df['target'].value_counts()

target
Inactive    209952
Active        8224
Name: count, dtype: int64

In [9]:
# Encode the activity label as a binary target: 1 for active inhibitors, 0 for others
df = df.copy()

# Lookup from label name to binary code
target = {'Active' : 1,
          'Inactive' : 0}

# Fail loudly on any label missing from the lookup, so that an unexpected value
# cannot silently become NaN in the vectorised mapping below
unexpected = set(df['target'].unique()) - set(target)
if unexpected:
    raise KeyError(f'Unexpected labels in target column: {sorted(map(str, unexpected))}')

# Replace the label names with their binary codes in one vectorised pass
df['target'] = df['target'].map(target)

# Confirm that only the two binary codes remain in the 'target' column
df['target'].unique()

array([1, 0], dtype=int64)

In [10]:
# Randomise the row order. frac=1 draws every row, the fixed random_state
# makes the permutation repeatable, and the old index is discarded.
df = (
    df
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# Report the dimensions, then preview the first rows
print('Shape of the data frame: ', df.shape)

df.head()

Shape of the data frame:  (218176, 3)


Unnamed: 0,CID,SMILES,target
0,2553149,CC1=CC(=NN1C2=CC=C(C=C2)C(=O)OCC(=O)C3=C(N(C(=...,0
1,1842208,COC1=CC=CC=C1N2C(=C(C3=CC=CC=C3C2=O)C=NCC4=CC=...,0
2,2081879,CC1=CC(=NO1)NC(=O)CSC2=NC3=CC=CC=C3C(=O)N2C4=C...,0
3,1072534,CCOC(=O)C1=C(C2=C(S1)N=C(C3=C2CC(N(C3)C)(C)C)N...,0
4,1728284,CC1=CC=C(C=C1)C(=O)NC(=S)NNC(=O)COC2=CC=CC=C2C,1


In [11]:
# Save the cleaned, shuffled table (CID, SMILES, target) to disk.
# The row index is written as well, because to_csv's index option is left at its default.
df.to_csv('CHOP_full_dataset_for_CID_list.csv')

[<a href="#content">Back to top</a>]

## Create csv files <a name="3"></a>

In [12]:
# Part 1 of 4: read df1.csv (its first column is used as the index) and
# attach the labels by joining on both CID and SMILES.
# Only rows present in both tables survive the inner join.
df_1 = pd.read_csv('df1.csv', index_col=[0])
df1 = df.merge(df_1, how='inner', on=['CID', 'SMILES'])
print(df1.shape)
df1.head()

(54899, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,2553149,CC1=CC(=NN1C2=CC=C(C=C2)C(=O)OCC(=O)C3=C(N(C(=...,0,"[2-[1-(2-methoxyethyl)-2,5-dimethylpyrrol-3-yl..."
1,1728284,CC1=CC=C(C=C1)C(=O)NC(=S)NNC(=O)COC2=CC=CC=C2C,1,4-methyl-N-[[[2-(2-methylphenoxy)acetyl]amino]...
2,2878770,COC1=CC=C(C=C1)C2=CC(=O)CC(C2)C3=CC=CC=C3,0,3-(4-methoxyphenyl)-5-phenylcyclohex-2-en-1-one
3,1823283,C1=CC=C2C(=C1)C(=O)/C(=C/C3=CC=CS3)/S2,0,(2Z)-2-(thiophen-2-ylmethylidene)-1-benzothiop...
4,1438309,CC1=CC=C(C=C1)C(=O)C2CCN(CC2)CC(=O)NC3=NC(=CS3)C,0,2-[4-(4-methylbenzoyl)piperidin-1-yl]-N-(4-met...


In [13]:
# Keep only the columns needed downstream: the 'UPAC' text column
# (it appears to hold IUPAC names; the header spelling comes from df1.csv)
# and the binary label
df1 = df1.loc[:, ['UPAC', 'target']]
print(df1.shape)
df1.head()

(54899, 2)


Unnamed: 0,UPAC,target
0,"[2-[1-(2-methoxyethyl)-2,5-dimethylpyrrol-3-yl...",0
1,4-methyl-N-[[[2-(2-methylphenoxy)acetyl]amino]...,1
2,3-(4-methoxyphenyl)-5-phenylcyclohex-2-en-1-one,0
3,(2Z)-2-(thiophen-2-ylmethylidene)-1-benzothiop...,0
4,2-[4-(4-methylbenzoyl)piperidin-1-yl]-N-(4-met...,0


In [14]:
# Write the part-1 name/label table to df1_targets.csv
# (the row index is written too, since to_csv's index option is left at its default)
df1.to_csv('df1_targets.csv')

In [15]:
# Part 2 of 4: read df2.csv (its first column is used as the index) and
# attach the labels by joining on both CID and SMILES.
# Only rows present in both tables survive the inner join.
df_2 = pd.read_csv('df2.csv', index_col=[0])
df2 = df.merge(df_2, how='inner', on=['CID', 'SMILES'])
print(df2.shape)
df2.head()

(54821, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,1842208,COC1=CC=CC=C1N2C(=C(C3=CC=CC=C3C2=O)C=NCC4=CC=...,0,3-hydroxy-2-(2-methoxyphenyl)-4-(pyridin-2-ylm...
1,2081879,CC1=CC(=NO1)NC(=O)CSC2=NC3=CC=CC=C3C(=O)N2C4=C...,0,2-[3-(3-chloro-4-methoxyphenyl)-4-oxoquinazoli...
2,5007365,CC1=C(C(=O)N=C2C=CC=CC2=N1)NNC3=CC=C(C=C3)[N+]...,0,"4-methyl-3-[2-(4-nitrophenyl)hydrazinyl]-1,5-b..."
3,5523524,COC1=CC(=C(C=C1)C(=O)N/N=C\2/N(C(=O)CS2)CC3=CC...,0,"N-[(Z)-[3-(1,3-benzodioxol-5-ylmethyl)-4-oxo-1..."
4,801109,C1CCN(CC1)C2=NC=NC3=C2C=NN3CC4=CC=CC=C4,0,"1-benzyl-4-piperidin-1-ylpyrazolo[3,4-d]pyrimi..."


In [16]:
# Keep only the columns needed downstream: the 'UPAC' text column
# (it appears to hold IUPAC names; the header spelling comes from df2.csv)
# and the binary label
df2 = df2.loc[:, ['UPAC', 'target']]
print(df2.shape)
df2.head()

(54821, 2)


Unnamed: 0,UPAC,target
0,3-hydroxy-2-(2-methoxyphenyl)-4-(pyridin-2-ylm...,0
1,2-[3-(3-chloro-4-methoxyphenyl)-4-oxoquinazoli...,0
2,"4-methyl-3-[2-(4-nitrophenyl)hydrazinyl]-1,5-b...",0
3,"N-[(Z)-[3-(1,3-benzodioxol-5-ylmethyl)-4-oxo-1...",0
4,"1-benzyl-4-piperidin-1-ylpyrazolo[3,4-d]pyrimi...",0


In [17]:
# Write the part-2 name/label table to df2_targets.csv
# (the row index is written too, since to_csv's index option is left at its default)
df2.to_csv('df2_targets.csv')

In [18]:
# Part 3 of 4: read df3.csv (its first column is used as the index) and
# attach the labels by joining on both CID and SMILES.
# Only rows present in both tables survive the inner join.
df_3 = pd.read_csv('df3.csv', index_col=[0])
df3 = df.merge(df_3, how='inner', on=['CID', 'SMILES'])
print(df3.shape)
df3.head()

(54835, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,1094870,CC1=C(C(=NO1)C2=C(C=CC=C2Cl)Cl)C(=O)N3CCN(CC3)...,0,"[3-(2,6-dichlorophenyl)-5-methyl-1,2-oxazol-4-..."
1,1247043,C1CCC2=C(C1)C=C(S2)C(=O)NC3=NOC=C3,0,"N-(1,2-oxazol-3-yl)-4,5,6,7-tetrahydro-1-benzo..."
2,820933,C1=CC=C(C=C1)C2=C(N3C=CC=CC3=N2)C=O,0,"2-phenylimidazo[1,2-a]pyridine-3-carbaldehyde"
3,3126076,CC1=CC(=C(C=C1)C(=O)C)OC(=O)C2=CC(=CC=C2)F,0,(2-acetyl-5-methylphenyl) 3-fluorobenzoate
4,2547792,CC1=C(C=C(C=C1)S(=O)(=O)N2CCOCC2)C(=O)NNC(=O)C...,0,2-methyl-N'-(2-methyl-5-morpholin-4-ylsulfonyl...


In [19]:
# Keep only the columns needed downstream: the 'UPAC' text column
# (it appears to hold IUPAC names; the header spelling comes from df3.csv)
# and the binary label
df3 = df3.loc[:, ['UPAC', 'target']]
print(df3.shape)
df3.head()

(54835, 2)


Unnamed: 0,UPAC,target
0,"[3-(2,6-dichlorophenyl)-5-methyl-1,2-oxazol-4-...",0
1,"N-(1,2-oxazol-3-yl)-4,5,6,7-tetrahydro-1-benzo...",0
2,"2-phenylimidazo[1,2-a]pyridine-3-carbaldehyde",0
3,(2-acetyl-5-methylphenyl) 3-fluorobenzoate,0
4,2-methyl-N'-(2-methyl-5-morpholin-4-ylsulfonyl...,0


In [20]:
# Write the part-3 name/label table to df3_targets.csv
# (the row index is written too, since to_csv's index option is left at its default)
df3.to_csv('df3_targets.csv')

In [21]:
# Part 4 of 4: read df4.csv (its first column is used as the index) and
# attach the labels by joining on both CID and SMILES.
# Only rows present in both tables survive the inner join.
df_4 = pd.read_csv('df4.csv', index_col=[0])
df4 = df.merge(df_4, how='inner', on=['CID', 'SMILES'])
print(df4.shape)
df4.head()

(53538, 4)


Unnamed: 0,CID,SMILES,target,UPAC
0,1072534,CCOC(=O)C1=C(C2=C(S1)N=C(C3=C2CC(N(C3)C)(C)C)N...,0,"ethyl 1-amino-7,8,8-trimethyl-5-morpholin-4-yl..."
1,2340492,C1=CC=C2C(=C1)C=C(C(=O)O2)C(=O)NCCC3=CNC4=CC=C...,0,N-[2-(1H-indol-3-yl)ethyl]-2-oxochromene-3-car...
2,5389480,CC1=C(C(=O)OC2=C(C(=C(C=C12)Cl)O)CN3CCCCC3)C,0,"6-chloro-7-hydroxy-3,4-dimethyl-8-(piperidin-1..."
3,4330067,CN(C)C1(CCCCC1)CNC(=O)C2=CC(=C(C=C2)Cl)S(=O)(=...,0,4-chloro-N-[[1-(dimethylamino)cyclohexyl]methy...
4,2221349,CCCNC(=O)NCC1=CC=C(C=C1)N2CCCC2=O,0,1-[[4-(2-oxopyrrolidin-1-yl)phenyl]methyl]-3-p...


In [22]:
# Keep only the columns needed downstream: the 'UPAC' text column
# (it appears to hold IUPAC names; the header spelling comes from df4.csv)
# and the binary label
df4 = df4.loc[:, ['UPAC', 'target']]
print(df4.shape)
df4.head()

(53538, 2)


Unnamed: 0,UPAC,target
0,"ethyl 1-amino-7,8,8-trimethyl-5-morpholin-4-yl...",0
1,N-[2-(1H-indol-3-yl)ethyl]-2-oxochromene-3-car...,0
2,"6-chloro-7-hydroxy-3,4-dimethyl-8-(piperidin-1...",0
3,4-chloro-N-[[1-(dimethylamino)cyclohexyl]methy...,0
4,1-[[4-(2-oxopyrrolidin-1-yl)phenyl]methyl]-3-p...,0


In [23]:
# Write the part-4 name/label table to df4_targets.csv
# (the row index is written too, since to_csv's index option is left at its default)
df4.to_csv('df4_targets.csv')

[<a href="#content">Back to top</a>]