# Generation of the initial list with compounds out of the PubChem bioassay dataset

### Content   <a name="content"></a>

1. [Load PubChem BioAssay activators of TTR transcriptors ](#1)
2. [Extract the necessary columns](#2)
3. [Reduce the samples labled 0](#3)
4. [Create csv files](#4)

## Load PubChem BioAssay TTR receptor <a name="1"></a>

In [1]:
import pandas as pd 

# load the BioAssay PubChem AID 1117267 dataset
# https://pubchem.ncbi.nlm.nih.gov/bioassay/1117267
df = pd.read_csv('pubchem_AID_1117267.csv')

# To avoid truncation of some columns during data frame display
pd.set_option('display.max_columns', None) 

# Display the data frame
print('Shape of the data frame: ', df.shape)

df.head()

Shape of the data frame:  (91943, 5)


Unnamed: 0,SID,CID,SMILES,OUTCOME,Activation at 16.7 uM
0,26728960,2800385.0,COC1=C(C(=C(C=C1)[N+](=O)[O-])OC)C(=O)NC(=O)NC...,Active,126.49
1,26663938,2012002.0,CCCCNC1=NC2=C(C3=C(S2)CN(CC3)C)C(=O)N1C4=CC(=C...,Active,101.81
2,26671452,3447219.0,COC1=CC=C(C=C1)C2CC3=C(C(N(C4=CC=CC=C4N3)C(=O)...,Active,101.52
3,17402808,9633471.0,C1=CC(=CC=C1C#N)OC2=NC=C(C=C2)/C=N/O,Active,80.77
4,17434061,891452.0,COC1=C(C=C(C=C1)C(=O)NC2=CC=CC=C2C(=O)O)Cl,Active,78.36


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91943 entries, 0 to 91942
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   SID                    91943 non-null  int64  
 1   CID                    91942 non-null  float64
 2   SMILES                 91942 non-null  object 
 3   OUTCOME                91943 non-null  object 
 4   Activation at 16.7 uM  91943 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.5+ MB


In [3]:
# Remove duplicates in general
df.drop_duplicates()
df.shape 

(91943, 5)

In [4]:
# Remove duplicates in CIDs 
df = df.drop_duplicates(subset=['CID'], keep=False) 
df.shape  # 6,446 are duplicates 

(91877, 5)

In [5]:
# Remove the compounds missing CID-s
df = df[df['CID'].notna()]
df.shape

(91876, 5)

In [6]:
# Turn CID float data type into integer
df['CID'] = df['CID'].astype(int) 
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91876 entries, 0 to 91942
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   SID                    91876 non-null  int64  
 1   CID                    91876 non-null  int32  
 2   SMILES                 91876 non-null  object 
 3   OUTCOME                91876 non-null  object 
 4   Activation at 16.7 uM  91876 non-null  float64
dtypes: float64(1), int32(1), int64(1), object(2)
memory usage: 3.9+ MB


In [7]:
# Check for isomers , i.e. same CIDs but different SIDs  
boolean = not df["CID"].is_unique 
boolean

False

[<a href="#content">Back to top</a>]

## Extract the necessary columns and edit data appropriately <a name="2"></a>

In [8]:
df_target = df [['CID', 'SID', 'OUTCOME']]
df_target.head()

Unnamed: 0,CID,SID,OUTCOME
0,2800385,26728960,Active
1,2012002,26663938,Active
2,3447219,26671452,Active
3,9633471,17402808,Active
4,891452,17434061,Active


In [9]:
# Counting of values in column PUBCHEM_ACTIVITY_OUTCOME
df_target['OUTCOME'].value_counts()

OUTCOME
Inactive    90722
Active       1154
Name: count, dtype: int64

In [10]:
# Set the binary targets to "1" for an active inhibitors and "0" for others 
df_target = df_target.copy()

# Create a dictionary
target = {'Active' : 1,
          'Inactive' : 0}

# Replace the names with binary values set up in the dictionary targets
df_target.OUTCOME = [target[item] for item in df_target.OUTCOME]

# Rename "unique combination" into "target"
df_target = df_target.rename(columns={'OUTCOME': 'target'})

# Check how many unique values there are in the 'target' column
df_target['target'].unique()

array([1, 0], dtype=int64)

In [11]:
# Create a data set only with compounds that are active inhibitors 
df_target_1= df_target[df_target['target']==1]

# Display the data set from 10 samples of active inhibitors 
print('Shape of df_target_1: ', df_target_1.shape)
df_target_1.head()

Shape of df_target_1:  (1154, 3)


Unnamed: 0,CID,SID,target
0,2800385,26728960,1
1,2012002,26663938,1
2,3447219,26671452,1
3,9633471,17402808,1
4,891452,17434061,1


[<a href="#content">Back to top</a>]

## Reduce the samples labled 0 <a name="3"></a>

In [12]:
# Load data from PubChem BioAssay AID 1996 
df_solubility = pd.read_csv('pubchem_solubility.csv')

# Display the data frame
print('Shape of df_solubility: ', df_solubility.shape)
df_solubility.head()

Shape of df_solubility:  (57859, 30)


Unnamed: 0,PUBCHEM_RESULT_TAG,SID,CID,PUBCHEM_EXT_DATASOURCE_SMILES,OUTCOME,PUBCHEM_ACTIVITY_SCORE,PUBCHEM_ACTIVITY_URL,PUBCHEM_ASSAYDATA_COMMENT,Solubility at pH 7.4_Qualifier_Mean,Solubility at pH 7.4_Mean,Test Concentration_1,Solubility at pH 7.4_Qualifier_1,Solubility at pH 7.4_1,Solubility at pH 7.4_Comment_1,Solubility of Astemizole at pH 7.4_Qualifier_1,Solubility of Astemizole at pH 7.4_1,Solubility of Sulfamethizole at pH 7.4_Qualifier_1,Solubility of Sulfamethizole at pH 7.4_1,Solubility of Imipramine HCl at pH 7.4_Qualifier_1,Solubility of Imipramine HCl at pH 7.4_1,Test Concentration_2,Solubility at pH 7.4_Qualifier_2,Solubility at pH 7.4_2,Solubility at pH 7.4_Comment_2,Solubility of Astemizole at pH 7.4_Qualifier_2,Solubility of Astemizole at pH 7.4_2,Solubility of Sulfamethizole at pH 7.4_Qualifier_2,Solubility of Sulfamethizole at pH 7.4_2,Solubility of Imipramine HCl at pH 7.4_Qualifier_2,Solubility of Imipramine HCl at pH 7.4_2
0,1,24826444,2374148,COC1=CC=C(C=C1)OCC2=NNC(=S)N2N,Active,40,,,=,10.85,200,<,0.1,Below LOQ,=,14.2,>,40.5,>,47.5,200.0,=,21.6,,=,27.7,>,40.5,>,47.5
1,2,49669186,5295761,CC1=CC(=C(C=C1)NC2=NC3=CC=CC=C3N4C2=NN=C4)Cl,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,12.3,>,40.5,>,47.5,,,,,,,,,,
2,3,49669768,22431387,CC1=CC(=CC=C1)CCNC2=NC3=C(C=C(C=C3)C)N4C2=NN=C4,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,12.3,>,40.5,>,47.5,,,,,,,,,,
3,4,8139962,14296,CC1=C(N=C(C(=N1)C)C)C,Inactive,0,,,<,0.1,200,<,0.1,Below LOQ,=,11.4,>,40.5,>,47.5,,,,,,,,,,
4,5,24803435,865684,CC1=CC2=NC(=C(C=C2C=C1)C#N)NC3=CC=CC=C3OC,Inactive,0,,,<,0.2,200,<,0.2,Below LOQ,=,11.4,>,40.5,>,47.5,,,,,,,,,,


In [13]:
# Create the desired data frame with CID and SID for sifting 
df_solubility = df_solubility[[ 'CID', 'SID']]

In [14]:
# Sift the main data frame with targets using the solubility data frame 
df = pd.merge(df_target, df_solubility, on=["CID", 'SID'])

In [15]:
df['target'].value_counts()

target
0    16160
1      170
Name: count, dtype: int64

In [16]:
# Samples labled 0
df_0 = df[df['target']==0]

# Extract every nth row of samples labled 0   
n = 8
df_0 = df_0[df_0.index % n == 0] 
df_0.shape

(2020, 3)

In [17]:
# Concatenate the sifted target 0 with the original target 1
df = pd.concat([df_0, df_target_1])

# Count the target
df['target'].value_counts()

target
0    2020
1    1154
Name: count, dtype: int64

[<a href="#content">Back to top</a>]

## Create csv files <a name="4"></a>

In [18]:
# Create a dataframe with CID and SMILES columns
df_smiles = df[['CID','SID', 'target']]

df_smiles = df_smiles.sample(
    frac = 1,        # Return entire dataframe
    random_state=1   # Make result reproducible
    ).reset_index(drop=True)

# Create a csv file from the resulted dataframe
df_smiles.to_csv('CID_SID_targets.csv', index=False)  

[<a href="#content">Back to top</a>]