# Generation of the initial list with compounds out of the PubChem bioassay dataset

### Content   <a name="content"></a>

1. [Load PubChem BioAssay D1 receptor](#1)
2. [Extract the necessary columns and edit data appropriately](#2)
3. [Create csv files](#3)

## Load PubChem BioAssay D1 receptor <a name="1"></a>

In [1]:
import pandas as pd 

# load the BioAssay PubChem AID 504652 dataset
# https://pubchem.ncbi.nlm.nih.gov/bioassay/504652
df_d1 = pd.read_csv('bioassy_PubChem_D1.csv')

# To avoid truncation of some columns during data frame display
pd.set_option('display.max_columns', None) 

# Display the data frame
print('Shape of the data frame: ', df_d1.shape)

df_d1.head()

Shape of the data frame:  (359035, 10)


Unnamed: 0,PUBCHEM_RESULT_TAG,SID,CID,SMILES,PUBCHEM_ACTIVITY_OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Activity at 10.0 uM,Compound QC
0,1,14737807,9551164.0,C1CCC2=C(C1)C3=C(S2)N=C4C(=CC=CN4C3=O)C(=O)NCC...,Active,50,,,-112.471,
1,2,855827,4917.0,CN1CCN(CC1)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl,Active,50,,,-111.528,
2,3,56463488,4350931.0,CN1CCC2=CC=CC=C2CC3=C(CC1)C4=CC=CC=C4N3,Active,50,,,-106.814,
3,4,56463458,11957685.0,CC1=CC(=CC=C1)C2CN(CCC3=C(C(=C(C=C23)O)O)Cl)C.Br,Active,50,,,-106.293,
4,5,855962,6602611.0,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,Active,50,,,-106.287,


In [2]:
df_d1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359035 entries, 0 to 359034
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PUBCHEM_RESULT_TAG         359035 non-null  int64  
 1   SID                        359035 non-null  int64  
 2   CID                        359034 non-null  float64
 3   SMILES                     359034 non-null  object 
 4   PUBCHEM_ACTIVITY_OUTCOME   359035 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE     359035 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL       0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT  0 non-null       float64
 8   Activity at 10.0 uM        359035 non-null  float64
 9   Compound QC                0 non-null       float64
dtypes: float64(5), int64(3), object(2)
memory usage: 27.4+ MB


In [3]:
# Remove duplicates in general
df_d1.drop_duplicates()
df_d1.shape  # 359,035 means no duplicates

(359035, 10)

In [4]:
# Remove duplicates in CIDs 
df_d1 = df_d1.drop_duplicates(subset=['CID'], keep=False) 
df_d1.shape  # 6,446 are duplicates 

(352589, 10)

In [5]:
# Remove the compounds missing CID-s
df_d1 = df_d1[df_d1['CID'].notna()]
df_d1.shape

(352588, 10)

In [6]:
# Turn CID float data type into integer
df_d1['CID'] = df_d1['CID'].astype(int) 
df_d1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 352588 entries, 0 to 359034
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   PUBCHEM_RESULT_TAG         352588 non-null  int64  
 1   SID                        352588 non-null  int64  
 2   CID                        352588 non-null  int32  
 3   SMILES                     352588 non-null  object 
 4   PUBCHEM_ACTIVITY_OUTCOME   352588 non-null  object 
 5   PUBCHEM_ACTIVITY_SCORE     352588 non-null  int64  
 6   PUBCHEM_ACTIVITY_URL       0 non-null       float64
 7   PUBCHEM_ASSAYDATA_COMMENT  0 non-null       float64
 8   Activity at 10.0 uM        352588 non-null  float64
 9   Compound QC                0 non-null       float64
dtypes: float64(4), int32(1), int64(3), object(2)
memory usage: 28.2+ MB


In [7]:
# Check for isomers , i.e. same CIDs but different SIDs  
boolean = not df_d1["CID"].is_unique 
boolean

False

[<a href="#content">Back to top</a>]

## Extract the necessary columns and edit data appropriately <a name="2"></a>

In [8]:
df_target = df_d1 [['CID', 'SMILES', 'PUBCHEM_ACTIVITY_OUTCOME']]
df_target.head()

Unnamed: 0,CID,SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,9551164,C1CCC2=C(C1)C3=C(S2)N=C4C(=CC=CN4C3=O)C(=O)NCC...,Active
1,4917,CN1CCN(CC1)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl,Active
2,4350931,CN1CCC2=CC=CC=C2CC3=C(CC1)C4=CC=CC=C4N3,Active
3,11957685,CC1=CC(=CC=C1)C2CN(CCC3=C(C(=C(C=C23)O)O)Cl)C.Br,Active
4,6602611,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,Active


In [9]:
# Counting of values in column PUBCHEM_ACTIVITY_OUTCOME
df_target['PUBCHEM_ACTIVITY_OUTCOME'].value_counts()

PUBCHEM_ACTIVITY_OUTCOME
Inactive    341362
Active       11226
Name: count, dtype: int64

In [10]:
# Set the binary targets to "1" for an active inhibitors and "0" for others 
df_target = df_target.copy()

# Create a dictionary
target = {'Active' : 1,
          'Inactive' : 0}

# Replace the names with binary values set up in the dictionary targets
df_target.PUBCHEM_ACTIVITY_OUTCOME = [target[item] for item in df_target.PUBCHEM_ACTIVITY_OUTCOME]

# Rename "unique combination" into "target"
df_target = df_target.rename(columns={'PUBCHEM_ACTIVITY_OUTCOME': 'target'})

# Check how many unique values there are in the 'target' column
df_target['target'].unique()

array([1, 0], dtype=int64)

In [11]:
# Create a data set only with compounds that are active inhibitors 
df_target_1 = df_target[df_target['target']==1]

# Display the data set from 10 samples of active inhibitors 
print('Shape of df_target_1: ', df_target_1.shape)
df_target_1.head()

Shape of df_target_1:  (11226, 3)


Unnamed: 0,CID,SMILES,target
0,9551164,C1CCC2=C(C1)C3=C(S2)N=C4C(=CC=CN4C3=O)C(=O)NCC...,1
1,4917,CN1CCN(CC1)CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)Cl,1
2,4350931,CN1CCC2=CC=CC=C2CC3=C(CC1)C4=CC=CC=C4N3,1
3,11957685,CC1=CC(=CC=C1)C2CN(CCC3=C(C(=C(C=C23)O)O)Cl)C.Br,1
4,6602611,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,1


In [12]:
# Load data from PubChem BioAssay AID 1996 
# https://pubchem.ncbi.nlm.nih.gov/bioassay/1996
df_solubility = pd.read_csv('pubchem_solubility.csv')

# Display the data frame
print('Shape of df_solubility: ', df_solubility.shape)
df_solubility.head()

Shape of df_solubility:  (57859, 30)


Unnamed: 0,PUBCHEM_RESULT_TAG,SID,CID,PUBCHEM_EXT_DATASOURCE_SMILES,OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Solubility at pH 7.4_Qualifier_Mean,Solubility at pH 7.4_Mean,Test Concentration_1,Solubility at pH 7.4_Qualifier_1,Solubility at pH 7.4_1,Solubility at pH 7.4_Comment_1,Solubility of Astemizole at pH 7.4_Qualifier_1,Solubility of Astemizole at pH 7.4_1,Solubility of Sulfamethizole at pH 7.4_Qualifier_1,Solubility of Sulfamethizole at pH 7.4_1,Solubility of Imipramine HCl at pH 7.4_Qualifier_1,Solubility of Imipramine HCl at pH 7.4_1,Test Concentration_2,Solubility at pH 7.4_Qualifier_2,Solubility at pH 7.4_2,Solubility at pH 7.4_Comment_2,Solubility of Astemizole at pH 7.4_Qualifier_2,Solubility of Astemizole at pH 7.4_2,Solubility of Sulfamethizole at pH 7.4_Qualifier_2,Solubility of Sulfamethizole at pH 7.4_2,Solubility of Imipramine HCl at pH 7.4_Qualifier_2,Solubility of Imipramine HCl at pH 7.4_2
0,1,24826444,2374148,COC1=CC=C(C=C1)OCC2=NNC(=S)N2N,Active,40,,,=,10.85,200,<,0.1,Below LOQ,=,14.2,>,40.5,>,47.5,200.0,=,21.6,,=,27.7,>,40.5,>,47.5
1,2,49669186,5295761,CC1=CC(=C(C=C1)NC2=NC3=CC=CC=C3N4C2=NN=C4)Cl,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,12.3,>,40.5,>,47.5,,,,,,,,,,
2,3,49669768,22431387,CC1=CC(=CC=C1)CCNC2=NC3=C(C=C(C=C3)C)N4C2=NN=C4,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,12.3,>,40.5,>,47.5,,,,,,,,,,
3,4,8139962,14296,CC1=C(N=C(C(=N1)C)C)C,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,11.4,>,40.5,>,47.5,,,,,,,,,,
4,5,24803435,865684,CC1=CC2=NC(=C(C=C2C=C1)C#N)NC3=CC=CC=C3OC,Inactive,0,,,<,0.2,200,<,0.2,Below LOQ,=,11.4,>,40.5,>,47.5,,,,,,,,,,


In [13]:
# Create the desired data frame with CID and SID for sifting 
df_solubility = df_solubility[ 'CID']

In [14]:
# Sift the main data frame with targets using the solubility data frame 
df = pd.merge(df_target, df_solubility, on="CID")

In [15]:
df['target'].value_counts()

target
0    48382
1     1362
Name: count, dtype: int64

In [16]:
# Separate only target 1 
df_1 = df[df['target']==1]

# Shuffle the samples labled 1 
df_1 = df_1.sample(
    frac = 1,        # Return entire dataframe
    random_state=1   # Make result reproducible
    ).reset_index(drop=True)

# Extract 1154 samples from the target 0 dataset
df_target_1 = df_1.iloc[:1154]

In [17]:
# Separate only target 0 
df_0 = df[df['target']==0]

# Shuffle the resulting data set
df_0 = df_0.sample(
    frac = 1,        # Return entire dataframe
    random_state=1   # Make result reproducible
    ).reset_index(drop=True)

# Extract 2823 samples from the target 1 dataset
df_target_0 = df_0.iloc[:2023]

In [18]:
# Concatenate the sifted target 0 with the original target 1
df = pd.concat([df_target_0, df_target_1])

# Count the target
df['target'].value_counts()

target
0    2023
1    1154
Name: count, dtype: int64

[<a href="#content">Back to top</a>]

## Create csv files <a name="3"></a>

In [19]:
# Create a dataframe with CID and SMILES columns
df_smiles = df[['CID', 'SMILES']]

df_smiles = df_smiles.sample(
    frac = 1,        # Return entire dataframe
    random_state=1   # Make result reproducible
    ).reset_index(drop=True)

# Create a csv file from the resulted dataframe
df_smiles.to_csv('smiles.csv', index=False)  

In [20]:
# Create a dataframe with the targets
df = df[['CID', 'target', 'SMILES']]

# Create a csv file with the targets
df.to_csv('targets.csv', index=False)

[<a href="#content">Back to top</a>]