# Generation of the initial list with compounds out of the PubChem bioassay dataset

### Content   <a name="content"></a>

1. [Load PubChem BioAssay activators of TTR transcriptors ](#1)
2. [Extract the necessary columns](#2)
3. [Reduce the samples labled 0](#3)
4. [Create csv files](#4)

## Load PubChem BioAssay TTR receptor <a name="1"></a>

In [1]:
import pandas as pd 

# load the BioAssay PubChem AID 2732 dataset
# https://pubchem.ncbi.nlm.nih.gov/bioassay/2732
df = pd.read_csv('pubchem_CHOP_bioassay.csv', sep=';', low_memory=False)

# To avoid truncation of some columns during data frame display
pd.set_option('display.max_columns', None) 

# Display the data frame
print('Shape of the data frame: ', df.shape)

df.head()

Shape of the data frame:  (219165, 10)


Unnamed: 0,PUBCHEM_RESULT_TAG,SID,CID,SMILES,target,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Pct Inhibition,Luminescence Value
0,1,17416326,2840784.0,CCCCC(=O)NC1=C(C2=C(S1)C(=C(CC2)C=O)Cl)C(=O)OCC,Active,100,,,116.56,35520
1,2,14741113,2010180.0,CC1=CC(=CC=C1)CS(=O)(=O)C2=NN=C(O2)[C@H](CC3=C...,Active,99,,,115.34,69880
2,3,14742004,3696529.0,CC1=CC=C(C=C1)C(=O)C2=C(OC3=C2C(=O)C(=O)C4=CC=...,Active,99,,,115.21,39160
3,4,863128,664366.0,CC[C@@H](C)[C@@H](C1=NN=C(O1)S(=O)(=O)CC2=C(C=...,Active,99,,,114.88,144040
4,5,17409458,6418635.0,C1=CC=C(C=C1)C2=NC(=C(N=N2)C(F)(F)F)SC3=CC=C(C...,Active,98,,,114.24,17400


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219165 entries, 0 to 219164
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PUBCHEM_RESULT_TAG         219165 non-null  int64  
 1   SID                        219165 non-null  int64  
 2   CID                        219164 non-null  float64
 3   SMILES                     219164 non-null  object 
 4   target                     219165 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE     219165 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL       0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT  0 non-null       float64
 8   Pct Inhibition             219165 non-null  object 
 9   Luminescence Value         219165 non-null  int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 16.7+ MB


In [3]:
# Remove duplicates in general
df.drop_duplicates()
df.shape 

(219165, 10)

In [4]:
# Remove duplicates in CIDs 
df = df.drop_duplicates(subset=['CID'], keep=False) 
df.shape  # 6,446 are duplicates 

(218177, 10)

In [5]:
# Remove the compounds missing CID-s
df = df[df['CID'].notna()]
df.shape

(218176, 10)

In [6]:
# Turn CID float data type into integer
df['CID'] = df['CID'].astype(int) 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 218176 entries, 0 to 219164
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PUBCHEM_RESULT_TAG         218176 non-null  int64  
 1   SID                        218176 non-null  int64  
 2   CID                        218176 non-null  int32  
 3   SMILES                     218176 non-null  object 
 4   target                     218176 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE     218176 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL       0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT  0 non-null       float64
 8   Pct Inhibition             218176 non-null  object 
 9   Luminescence Value         218176 non-null  int64  
dtypes: float64(2), int32(1), int64(4), object(3)
memory usage: 17.5+ MB


In [7]:
# Check for isomers , i.e. same CIDs but different SIDs  
boolean = not df["CID"].is_unique 
boolean

False

[<a href="#content">Back to top</a>]

## Extract the necessary columns and edit data appropriately <a name="2"></a>

In [8]:
df_target = df [['CID', 'SMILES', 'target']]
df_target.head()

Unnamed: 0,CID,SMILES,target
0,2840784,CCCCC(=O)NC1=C(C2=C(S1)C(=C(CC2)C=O)Cl)C(=O)OCC,Active
1,2010180,CC1=CC(=CC=C1)CS(=O)(=O)C2=NN=C(O2)[C@H](CC3=C...,Active
2,3696529,CC1=CC=C(C=C1)C(=O)C2=C(OC3=C2C(=O)C(=O)C4=CC=...,Active
3,664366,CC[C@@H](C)[C@@H](C1=NN=C(O1)S(=O)(=O)CC2=C(C=...,Active
4,6418635,C1=CC=C(C=C1)C2=NC(=C(N=N2)C(F)(F)F)SC3=CC=C(C...,Active


In [9]:
# Counting of values in column PUBCHEM_ACTIVITY_OUTCOME
df_target['target'].value_counts()

target
Inactive    209952
Active        8224
Name: count, dtype: int64

In [10]:
# Set the binary targets to "1" for an active inhibitors and "0" for others 
df_target = df_target.copy()

# Create a dictionary
target = {'Active' : 1,
          'Inactive' : 0}

# Replace the names with binary values set up in the dictionary targets
df_target.target = [target[item] for item in df_target.target]

# Rename "unique combination" into "target"
# df_target = df_target.rename(columns={'OUTCOME': 'target'})

# Check how many unique values there are in the 'target' column
df_target['target'].unique()

array([1, 0], dtype=int64)

In [11]:
# Create a data set only with compounds that are active inhibitors 
df_target_1= df_target[df_target['target']==1]

# Display the data set from 10 samples of active inhibitors 
print('Shape of df_target_1: ', df_target_1.shape)
df_target_1.head()

Shape of df_target_1:  (8224, 3)


Unnamed: 0,CID,SMILES,target
0,2840784,CCCCC(=O)NC1=C(C2=C(S1)C(=C(CC2)C=O)Cl)C(=O)OCC,1
1,2010180,CC1=CC(=CC=C1)CS(=O)(=O)C2=NN=C(O2)[C@H](CC3=C...,1
2,3696529,CC1=CC=C(C=C1)C(=O)C2=C(OC3=C2C(=O)C(=O)C4=CC=...,1
3,664366,CC[C@@H](C)[C@@H](C1=NN=C(O1)S(=O)(=O)CC2=C(C=...,1
4,6418635,C1=CC=C(C=C1)C2=NC(=C(N=N2)C(F)(F)F)SC3=CC=C(C...,1


[<a href="#content">Back to top</a>]

## Reduce the samples labled 0 <a name="3"></a>

In [12]:
# Load data from PubChem BioAssay AID 1996 
df_solubility = pd.read_csv('pubchem_solubility.csv')

# Display the data frame
print('Shape of df_solubility: ', df_solubility.shape)
df_solubility.head()

Shape of df_solubility:  (57859, 30)


Unnamed: 0,PUBCHEM_RESULT_TAG,SID,CID,PUBCHEM_EXT_DATASOURCE_SMILES,OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Solubility at pH 7.4_Qualifier_Mean,Solubility at pH 7.4_Mean,Test Concentration_1,Solubility at pH 7.4_Qualifier_1,Solubility at pH 7.4_1,Solubility at pH 7.4_Comment_1,Solubility of Astemizole at pH 7.4_Qualifier_1,Solubility of Astemizole at pH 7.4_1,Solubility of Sulfamethizole at pH 7.4_Qualifier_1,Solubility of Sulfamethizole at pH 7.4_1,Solubility of Imipramine HCl at pH 7.4_Qualifier_1,Solubility of Imipramine HCl at pH 7.4_1,Test Concentration_2,Solubility at pH 7.4_Qualifier_2,Solubility at pH 7.4_2,Solubility at pH 7.4_Comment_2,Solubility of Astemizole at pH 7.4_Qualifier_2,Solubility of Astemizole at pH 7.4_2,Solubility of Sulfamethizole at pH 7.4_Qualifier_2,Solubility of Sulfamethizole at pH 7.4_2,Solubility of Imipramine HCl at pH 7.4_Qualifier_2,Solubility of Imipramine HCl at pH 7.4_2
0,1,24826444,2374148,COC1=CC=C(C=C1)OCC2=NNC(=S)N2N,Active,40,,,=,10.85,200,<,0.1,Below LOQ,=,14.2,>,40.5,>,47.5,200.0,=,21.6,,=,27.7,>,40.5,>,47.5
1,2,49669186,5295761,CC1=CC(=C(C=C1)NC2=NC3=CC=CC=C3N4C2=NN=C4)Cl,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,12.3,>,40.5,>,47.5,,,,,,,,,,
2,3,49669768,22431387,CC1=CC(=CC=C1)CCNC2=NC3=C(C=C(C=C3)C)N4C2=NN=C4,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,12.3,>,40.5,>,47.5,,,,,,,,,,
3,4,8139962,14296,CC1=C(N=C(C(=N1)C)C)C,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,11.4,>,40.5,>,47.5,,,,,,,,,,
4,5,24803435,865684,CC1=CC2=NC(=C(C=C2C=C1)C#N)NC3=CC=CC=C3OC,Inactive,0,,,<,0.2,200,<,0.2,Below LOQ,=,11.4,>,40.5,>,47.5,,,,,,,,,,


In [13]:
# Create the desired data frame with CID and SID for sifting 
df_solubility = df_solubility[ 'CID']

In [14]:
# Sift the main data frame with targets using the solubility data frame 
df = pd.merge(df_target, df_solubility, on="CID")

In [15]:
df['target'].value_counts()

target
0    24185
1      291
Name: count, dtype: int64

In [16]:
# Samples labled 0
df_0 = df[df['target']==0]

# Extract every 2th row of samples labled 0   
n = 2
df_0 = df_0[df_0.index % n == 0] 
df_0.shape

(12092, 3)

In [17]:
# Concatenate the sifted target 0 with the original target 1
df = pd.concat([df_0, df_target_1])

# Count the target
df['target'].value_counts()

target
0    12092
1     8224
Name: count, dtype: int64

[<a href="#content">Back to top</a>]

## Create csv files <a name="4"></a>

In [18]:
# Create a dataframe with CID and SMILES columns
df_smiles = df[['CID', 'SMILES']]

df_smiles = df_smiles.sample(
    frac = 1,        # Return entire dataframe
    random_state=1   # Make result reproducible
    ).reset_index(drop=True)

# Create a csv file from the resulted dataframe
df_smiles.to_csv('smiles.csv', index=False)  

In [19]:
# Create a dataframe with the targets
df = df[['SMILES', 'target']]

# Create a csv file with the targets
df.to_csv('targets.csv', index=False)

[<a href="#content">Back to top</a>]