# Generate Binding Affinity Dataset

In [1]:
# standard imports
import numpy as np
import pandas as pd
import random
from datetime import datetime

# import custom modules
import sys
sys.path.append('../..')
import project_config

Read in the [ExCAPE-DB dataset](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-017-0203-5), published in the *Journal of Cheminformatics* in 2017, and generate a binding affinity dataset from it. The data were retrieved from Zenodo (DOI: [10.5281/zenodo.2543724](https://doi.org/10.5281/zenodo.2543724)).

In [10]:
# load the ExCAPE-DB TSV export from the project data directory
excape_db_path = project_config.DATA_DIR / "pubchem.chembl.dataset4publication_inchi_smiles_v2.tsv"
excape_db = pd.read_csv(
    excape_db_path,
    sep='\t',         # the file is tab-separated
    header=0,         # the first row holds the column names
    index_col=False,  # do not use any file column as the row index
)

# preview the first rows to check that the columns were parsed as expected
excape_db.head()

Unnamed: 0,Ambit_InchiKey,Original_Entry_ID,Entrez_ID,Activity_Flag,pXC50,DB,Original_Assay_ID,Tax_ID,Gene_Symbol,Ortholog_Group,InChI,SMILES,updated
0,AAAAZQPHATYWOK-YRBRRWAQNA-N,11399331,2064,A,7.19382,pubchem,248914,9606,ERBB2,1346,InChI=1/C32H29ClN6O3S/c1-4-41-28-16-25-22(15-2...,ClC=1C=C(NC=2C=3C(N=CC2C#N)=CC(OCC)=C(NC(=O)/C...,
1,AAAAZQPHATYWOK-YRBRRWAQNA-N,CHEMBL175513,1956,A,6.73,chembl20,312997,9606,EGFR,1260,InChI=1/C32H29ClN6O3S/c1-4-41-28-16-25-22(15-2...,C1=2C(=C(C#N)C=NC1=CC(=C(C2)NC(/C=C/CN(C)C)=O)...,
2,AAABHMIRDIOYOK-NPVYFSBINA-N,CHEMBL1527551,10919,N,4.55,chembl20,737344,9606,EHMT2,6822,InChI=1/C18H14N6O3/c1-23-10-15(24(26)27)16(22-...,O=C(NC=1C=C2N=C(NC2=CC1)C=3C=CC=CC3)C4=NN(C=C4...,
3,AAABHMIRDIOYOK-NPVYFSBINA-N,CHEMBL1527551,19885,A,5.35,chembl20,688759,10090,RORC,3770,InChI=1/C18H14N6O3/c1-23-10-15(24(26)27)16(22-...,O=C(NC=1C=C2N=C(NC2=CC1)C=3C=CC=CC3)C4=NN(C=C4...,
4,AAABHMIRDIOYOK-NPVYFSBINA-N,CHEMBL1527551,216,N,4.4,chembl20,688238,9606,ALDH1A1,143,InChI=1/C18H14N6O3/c1-23-10-15(24(26)27)16(22-...,O=C(NC=1C=C2N=C(NC2=CC1)C=3C=CC=CC3)C4=NN(C=C4...,


From the ExCAPE-DB abstract:
> "Chemogenomics data generally refers to the activity data of chemical compounds on an array of protein targets and represents an important source of information for building in silico target prediction models. The increasing volume of chemogenomics data offers exciting opportunities to build models based on Big Data. Preparing a high quality data set is a vital step in realizing this goal and this work aims to compile such a comprehensive chemogenomics dataset. This dataset comprises over 70 million SAR data points from publicly available databases (PubChem and ChEMBL) including structure, target information and activity annotations. Our aspiration is to create a useful chemogenomics resource reflecting industry-scale data not only for building predictive models of in silico polypharmacology and off-target effects but also for the validation of cheminformatics approaches in general."

We inspect the data and generate a binding affinity dataset for protein targets of interest.

In [27]:
# target of interest: dopamine receptor D2 (DRD2), matched on its gene symbol
target = 'DRD2'

# keep every record measured against the target (both active and inactive ones)
target_df = excape_db.loc[excape_db['Gene_Symbol'] == target]

# keep a single record per SMILES string; keep='first' (the pandas default)
# retains the first occurrence and discards later rows with the same SMILES
target_df = target_df.drop_duplicates(subset=['SMILES'], keep='first')

# split on the activity flag stored in the dataset ('A' = active, 'N' = inactive);
# no pXC50 threshold is applied here, the pre-assigned flag is used as-is
target_active = target_df.loc[target_df['Activity_Flag'] == 'A']
target_inactive = target_df.loc[target_df['Activity_Flag'] == 'N']

# report the class sizes
print("Number of active molecules: ", len(target_active))
print("Number of inactive molecules: ", len(target_inactive))

Number of active molecules:  8323
Number of inactive molecules:  343206


Generate the binding affinity dataset by combining all active molecules with a random sample of the inactive molecules.

In [28]:
# Build the labelled dataset: every active molecule plus a reproducible random
# subset of the inactive molecules, shuffled, with the label encoded as 1/0.
N_INACTIVE_SAMPLE = 100000  # maximum number of inactive molecules to keep
RANDOM_SEED = 42            # shared seed for the subsample and the shuffle

# Cap the request at the number of available inactive molecules:
# DataFrame.sample (without replacement) raises ValueError when n exceeds the
# number of rows, which would break this cell for targets with fewer than
# N_INACTIVE_SAMPLE inactive molecules.
n_inactive = min(N_INACTIVE_SAMPLE, len(target_inactive))
target_inactive_sample = target_inactive.sample(n=n_inactive, random_state=RANDOM_SEED)

# name of the label column, derived from the target (e.g. 'DRD2_activity')
activity_name = target + '_activity'

target_training = (
    # combine active and inactive molecules
    pd.concat([target_active, target_inactive_sample])
    # map active and inactive to 1 and 0
    .assign(Activity_Flag=lambda df: df['Activity_Flag'].map({'A': 1, 'N': 0}))
    # scramble the dataset so the classes are interleaved
    .sample(frac=1, random_state=RANDOM_SEED)
    # keep only the structure and its label
    .loc[:, ['SMILES', 'Activity_Flag']]
    # rename the label column after the target
    .rename(columns={'Activity_Flag': activity_name})
    # replace the shuffled original index with a clean 0..n-1 index
    .reset_index(drop=True)
)
target_training.head()

Unnamed: 0,SMILES,DRD2_activity
0,ClC1=CC(NC(=O)CSC2=NN(C3=CC=CC=C3)C=N2)=CC=C1OC,0
1,ClC=1C(=CC(NC(=O)CN(C(O)C2=CN(C(O)=C2)CC=3OC=C...,0
2,O\1C=2C(C=C(/C1=N/CC3=CC=CC=C3)C(=O)N)=CC=CC2,0
3,N1(CCN(CC1)C=2C=CC=C(C2)C(F)(F)F)CCCOC3=CC=CC=...,0
4,C=1(Cl)C(=CC=2[C@@H]3[C@@H](N(CCC2C1)C)CCC4=C3...,1


Save file to data directory.

In [29]:
# write the dataset to the data directory as '<target>_binding_data.csv',
# omitting the row index so the file holds only the SMILES and label columns
output_path = project_config.DATA_DIR / (target + '_binding_data.csv')
target_training.to_csv(output_path, index=False)

# report how many molecules carry each label in the saved dataset
label_counts = target_training[activity_name].value_counts()
print(label_counts)

0    99998
1     8037
Name: DRD2_activity, dtype: int64
