# Computational Drug Discovery

## Descriptor Calculation and Dataset Preparation

We will calculate molecular descriptors, which are essentially quantitative descriptions of the compounds in the dataset. Finally, we will combine these descriptors with the bioactivity values into a dataset for subsequent model building.

### Download PaDEL-Descriptor

From: search GitHub for PaDEL-Descriptor, and also download the files listed below



fingerprints xml file

### Load bioactivity data

In [1]:
import pandas as pd

In [2]:
# Load the bioactivity table (3 classes, with pIC50) from the working directory.
df3 = pd.read_csv(filepath_or_buffer='RP1ab_04_bioactivity_data_3class_pIC50.csv')

In [3]:
# Display the loaded bioactivity table.
df3

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,canonical_smiles,class,MW,LogP,NumHDonors,NumHAcceptors,pIC50
0,0,CHEMBL480,Cc1c(OCC(F)(F)F)ccnc1C[S+]([O-])c1nc2ccccc2[nH]1,active,369.368,3.51522,1.0,4.0,6.408935
1,1,CHEMBL178459,Cc1c(-c2cnccn2)ssc1=S,active,226.351,3.30451,0.0,5.0,6.677781
2,2,CHEMBL3545157,O=c1sn(-c2cccc3ccccc23)c(=O)n1Cc1ccccc1,active,334.400,3.26220,0.0,5.0,7.096910
3,3,CHEMBL297453,O=C(O[C@@H]1Cc2c(O)cc(O)cc2O[C@@H]1c1cc(O)c(O)...,intermediate,458.375,2.23320,8.0,11.0,5.801343
4,4,CHEMBL4303595,O=C1C=Cc2cc(Br)ccc2C1=O,active,237.052,2.22770,0.0,2.0,7.397940
...,...,...,...,...,...,...,...,...,...
1131,1131,CHEMBL5286307,CC(C)(C)[C@H](NC(=O)C(F)(F)F)C(=O)N1[C@@H]2CCC...,active,499.534,1.38378,3.0,5.0,7.743282
1132,1132,CHEMBL5282079,CC(C)(C)[C@H](NS(=O)(=O)C(F)(F)F)C(=O)N1C[C@H]...,active,535.589,0.85788,3.0,6.0,7.649364
1133,1133,CHEMBL5275584,Cn1cnc2c1c(=O)n(CC(=O)Nc1ccc(S(=O)(=O)Nc3ncccn...,intermediate,470.471,-0.33680,2.0,11.0,5.093665
1134,1134,CHEMBL5281103,Cn1c(=O)c2c(nc3n(CC#Cc4ccc(Cl)c(Cl)c4)ccn23)n(...,intermediate,402.241,2.04490,0.0,7.0,5.327902


### Calculate fingerprint descriptors

In [4]:
import glob

# Collect the descriptor XML files in alphabetical order so the listing is
# stable from run to run.
xml_files = sorted(glob.glob("fingerprints_xml/*.xml"))
xml_files

['fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'fingerprints_xml\\EStateFingerprinter.xml',
 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'fingerprints_xml\\Fingerprinter.xml',
 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'fingerprints_xml\\MACCSFingerprinter.xml',
 'fingerprints_xml\\PubchemFingerprinter.xml',
 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'fingerprints_xml\\SubstructureFingerprinter.xml']

In [5]:
# Short fingerprint names, one per descriptor XML file.
# The order matters: entries are paired positionally with the alphabetically
# sorted XML file names, so keep both sequences in step.
FP_list = [
    'AtomPairs2DCount', 'AtomPairs2D',
    'EState',
    'CDKextended', 'CDK', 'CDKgraphonly',
    'KlekotaRothCount', 'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount', 'Substructure',
]

In [6]:
# zip() silently stops at the shorter input, so a missing or extra XML file
# would mis-pair or drop fingerprint names without any error. Fail loudly.
if len(FP_list) != len(xml_files):
    raise ValueError(
        "Expected {} fingerprint XML files but found {}".format(
            len(FP_list), len(xml_files)
        )
    )

# Map each fingerprint name to its descriptor XML file. The pairing is
# positional, so it relies on FP_list being written in the same order as the
# sorted file names.
fp = dict(zip(FP_list, xml_files))
fp

{'AtomPairs2DCount': 'fingerprints_xml\\AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': 'fingerprints_xml\\AtomPairs2DFingerprinter.xml',
 'EState': 'fingerprints_xml\\EStateFingerprinter.xml',
 'CDKextended': 'fingerprints_xml\\ExtendedFingerprinter.xml',
 'CDK': 'fingerprints_xml\\Fingerprinter.xml',
 'CDKgraphonly': 'fingerprints_xml\\GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': 'fingerprints_xml\\KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': 'fingerprints_xml\\KlekotaRothFingerprinter.xml',
 'MACCS': 'fingerprints_xml\\MACCSFingerprinter.xml',
 'PubChem': 'fingerprints_xml\\PubchemFingerprinter.xml',
 'SubstructureCount': 'fingerprints_xml\\SubstructureFingerprintCount.xml',
 'Substructure': 'fingerprints_xml\\SubstructureFingerprinter.xml'}

In [7]:
# Spot-check the mapping: look up the XML file registered for one fingerprint name.
fp['AtomPairs2D']

'fingerprints_xml\\AtomPairs2DFingerprinter.xml'

In [8]:
from padelpy import padeldescriptor

# Fingerprint family to compute; the name is a key of the `fp` mapping.
fingerprint = 'Substructure'

# The output CSV is named after the fingerprint, e.g. Substructure.csv.
fingerprint_output_file = fingerprint + '.csv'
fingerprint_descriptortypes = fp[fingerprint]

# Gather the PaDEL options in one place, then run the calculation.
# NOTE(review): 'molecule.smi' is not created in the cells shown here, so it
# must already exist in the working directory — confirm where it comes from.
padel_options = dict(
    mol_dir='molecule.smi',
    d_file=fingerprint_output_file,
    descriptortypes=fingerprint_descriptortypes,
    fingerprints=True,
    detectaromaticity=True,
    standardizenitro=True,
    standardizetautomers=True,
    removesalt=True,
    threads=2,
    log=True,
)
padeldescriptor(**padel_options)

In [9]:
# Read back the fingerprint table written by the PaDEL step and display it.
descriptors = pd.read_csv(filepath_or_buffer=fingerprint_output_file)
descriptors

Unnamed: 0,Name,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,CHEMBL480,1,0,0,0,0,0,0,0,1,...,1,1,1,1,1,0,0,0,0,1
1,CHEMBL178459,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,1
2,CHEMBL3545157,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,CHEMBL297453,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,CHEMBL4303595,0,0,0,0,1,0,0,0,0,...,0,0,1,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,CHEMBL5286307,1,1,1,1,0,0,0,0,1,...,0,0,1,1,1,0,0,0,0,1
1132,CHEMBL5282079,1,1,1,1,0,0,0,0,1,...,0,0,1,1,1,0,0,0,0,1
1133,CHEMBL5275584,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1134,CHEMBL5281103,0,0,0,0,0,1,0,0,0,...,0,0,1,1,1,0,0,0,0,1


In [10]:
# Read the descriptor table from the file the PaDEL step actually wrote, rather
# than a hard-coded name, so changing `fingerprint` cannot load a stale CSV.
df3_X = pd.read_csv(fingerprint_output_file)
df3_X

Unnamed: 0,Name,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,CHEMBL480,1,0,0,0,0,0,0,0,1,...,1,1,1,1,1,0,0,0,0,1
1,CHEMBL178459,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,1
2,CHEMBL3545157,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,CHEMBL297453,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,CHEMBL4303595,0,0,0,0,1,0,0,0,0,...,0,0,1,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,CHEMBL5286307,1,1,1,1,0,0,0,0,1,...,0,0,1,1,1,0,0,0,0,1
1132,CHEMBL5282079,1,1,1,1,0,0,0,0,1,...,0,0,1,1,1,0,0,0,0,1
1133,CHEMBL5275584,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1134,CHEMBL5281103,0,0,0,0,0,1,0,0,0,...,0,0,1,1,1,0,0,0,0,1


### Preparing the X and Y Data Matrices

X data matrix

In [11]:
# Keep only the fingerprint bit columns; 'Name' holds the molecule identifier
# and is not a feature.
# errors='ignore' keeps the cell idempotent: re-running it after 'Name' has
# already been removed no longer raises KeyError.
df3_X = df3_X.drop(columns=['Name'], errors='ignore')
df3_X

Unnamed: 0,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,SubFP10,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,1,0,0,0,0,0,0,0,1,0,...,1,1,1,1,1,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,1,1,1,1,0,0,0,0,1,0,...,0,0,1,1,1,0,0,0,0,1
1132,1,1,1,1,0,0,0,0,1,0,...,0,0,1,1,1,0,0,0,0,1
1133,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1134,0,0,0,0,0,1,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1


Y variable

Use the pIC50 values (already converted from IC50 in the loaded bioactivity file) as the Y variable

In [12]:
# The target vector is the pIC50 column of the bioactivity table.
df3_Y = df3.loc[:, 'pIC50']
df3_Y

0       6.408935
1       6.677781
2       7.096910
3       5.801343
4       7.397940
          ...   
1131    7.743282
1132    7.649364
1133    5.093665
1134    5.327902
1135    5.195179
Name: pIC50, Length: 1136, dtype: float64

Combining X and Y variable

In [13]:
# concat(axis=1) aligns rows on the index. If the descriptor table and the
# target vector do not share the same index, the unmatched rows are padded
# with NaN instead of raising, so verify the alignment first.
if not df3_X.index.equals(df3_Y.index):
    raise ValueError(
        "Descriptor rows ({}) and pIC50 rows ({}) are not aligned".format(
            len(df3_X), len(df3_Y)
        )
    )

# Put the fingerprint columns and the pIC50 target side by side.
dataset3 = pd.concat([df3_X, df3_Y], axis=1)
dataset3

Unnamed: 0,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,SubFP10,...,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307,pIC50
0,1,0,0,0,0,0,0,0,1,0,...,1,1,1,1,0,0,0,0,1,6.408935
1,1,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,0,0,0,1,6.677781
2,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,7.096910
3,0,1,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,5.801343
4,0,0,0,0,1,0,0,0,0,0,...,0,1,1,0,1,0,0,0,1,7.397940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1131,1,1,1,1,0,0,0,0,1,0,...,0,1,1,1,0,0,0,0,1,7.743282
1132,1,1,1,1,0,0,0,0,1,0,...,0,1,1,1,0,0,0,0,1,7.649364
1133,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,5.093665
1134,0,0,0,0,0,1,0,0,0,0,...,0,1,1,1,0,0,0,0,1,5.327902


Save csv file

In [14]:
# Export is currently disabled: the line below is commented out, so running
# this cell writes nothing.
# NOTE(review): the file name says "pubchem_fp", but the fingerprint selected
# earlier in this notebook is 'Substructure' — confirm the intended name before
# re-enabling.
#dataset3.to_csv('RP1ab_06_bioactivity_data_3class_pIC50_pubchem_fp.csv', index=False)