In [1]:
# import packages
import pandas as pd
import numpy as np
from rdkit import Chem

In [2]:
# Read the semicolon-delimited ChEMBL potency export into a DataFrame.
# NOTE(review): a previous comment here said "line 513 is skipped", but this
# call passes no option that skips or tolerates malformed rows — confirm
# whether that row still needs special handling.
path = "chEMBL_potency.csv"
data = pd.read_csv(path, delimiter=';')

# Sanity check: show only the first three rows.
display(data.head(n=3))

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,...,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value
0,CHEMBL389516,,,446.33,0.0,3.32,SID7971969,O=C(CCN1C(=O)c2ccccc2C1=O)N1CCN(Cc2ccc(Cl)cc2C...,Potency,'=',...,SINGLE PROTEIN,CHEMBL1201862,7,PubChem BioAssays,,,,,,
1,CHEMBL1488260,,,251.29,0.0,3.55,SID14720854,Cc1ccc(C(=O)c2oc3ccccc3c2N)cc1,Potency,'=',...,SINGLE PROTEIN,CHEMBL1201862,7,PubChem BioAssays,,,,,,
2,CHEMBL1512440,,,564.5,3.0,2.21,SID26749736,O=c1c(O)cc(C2Oc3cc(O)cc(O)c3CC2O)cc2c(C3Oc4cc(...,Potency,'=',...,SINGLE PROTEIN,CHEMBL1201862,7,PubChem BioAssays,,,,,,


In [3]:
# Keep only the identifier, AlogP, SMILES string and activity comment.
columns_of_interest = ['Molecule ChEMBL ID', 'AlogP', 'Smiles', 'Comment']
dataset = data[columns_of_interest]
display(dataset)


Unnamed: 0,Molecule ChEMBL ID,AlogP,Smiles,Comment
0,CHEMBL389516,3.32,O=C(CCN1C(=O)c2ccccc2C1=O)N1CCN(Cc2ccc(Cl)cc2C...,Inconclusive
1,CHEMBL1488260,3.55,Cc1ccc(C(=O)c2oc3ccccc3c2N)cc1,Inconclusive
2,CHEMBL1512440,2.21,O=c1c(O)cc(C2Oc3cc(O)cc(O)c3CC2O)cc2c(C3Oc4cc(...,Active
3,CHEMBL1575899,4.82,O=C(CSc1nnc(COc2ccccc2)n1-c1ccccc1)c1ccccc1,Inconclusive
4,CHEMBL3145060,3.66,Cc1[nH]n(-c2ccccc2)c(=O)c1N=Nc1ccc2c(c1)OCCO2,Active
...,...,...,...,...
49804,CHEMBL1331234,2.19,CCN(CC)CC.O=C1NC(=S)S/C1=C1\Sc2ccccc2N1CCCS(=O...,Inconclusive
49805,CHEMBL3191399,3.16,COc1ccccc1NS(=O)(=O)c1cccc(C(=O)N/N=C/c2ccc(C)...,Inconclusive
49806,CHEMBL1489005,6.21,COc1ccc(C=C(c2nc3ccccc3s2)c2nc3ccccc3s2)cc1O,Inconclusive
49807,CHEMBL1360098,3.12,Cc1cccc(OCC(=O)Nc2nnc(-c3ccco3)s2)c1,Inconclusive


In [4]:
# Build the inactive set: rows whose Comment is exactly 'Inconclusive',
# with the Comment column removed afterwards.
# NOTE(review): 'Inconclusive' is being used as the inactive label here —
# confirm that this is the intended definition of "inactive".
inactives = (
    dataset
    .loc[dataset['Comment'].eq('Inconclusive')]
    .drop(columns='Comment')
)
display(inactives)

Unnamed: 0,Molecule ChEMBL ID,AlogP,Smiles
0,CHEMBL389516,3.32,O=C(CCN1C(=O)c2ccccc2C1=O)N1CCN(Cc2ccc(Cl)cc2C...
1,CHEMBL1488260,3.55,Cc1ccc(C(=O)c2oc3ccccc3c2N)cc1
3,CHEMBL1575899,4.82,O=C(CSc1nnc(COc2ccccc2)n1-c1ccccc1)c1ccccc1
5,CHEMBL3144968,2.85,Cc1[nH]n(C2=NCCS2)c(=O)c1N=Nc1ccccc1
6,CHEMBL1497520,4.55,Cc1nn(Cc2ccccc2)c(C)c1NC(=O)c1noc2c1CCc1ccccc1-2
...,...,...,...
49804,CHEMBL1331234,2.19,CCN(CC)CC.O=C1NC(=S)S/C1=C1\Sc2ccccc2N1CCCS(=O...
49805,CHEMBL3191399,3.16,COc1ccccc1NS(=O)(=O)c1cccc(C(=O)N/N=C/c2ccc(C)...
49806,CHEMBL1489005,6.21,COc1ccc(C=C(c2nc3ccccc3s2)c2nc3ccccc3s2)cc1O
49807,CHEMBL1360098,3.12,Cc1cccc(OCC(=O)Nc2nnc(-c3ccco3)s2)c1


In [5]:
# Build the active set: rows whose Comment is exactly "Active",
# with the Comment column removed afterwards.
actives = (
    dataset
    .loc[dataset['Comment'].eq("Active")]
    .drop(columns='Comment')
)
display(actives)

Unnamed: 0,Molecule ChEMBL ID,AlogP,Smiles
2,CHEMBL1512440,2.21,O=c1c(O)cc(C2Oc3cc(O)cc(O)c3CC2O)cc2c(C3Oc4cc(...
4,CHEMBL3145060,3.66,Cc1[nH]n(-c2ccccc2)c(=O)c1N=Nc1ccc2c(c1)OCCO2
7,CHEMBL1470408,5.21,O=C(O)CC/C(=C\c1ccc(N2CCCC2)c([N+](=O)[O-])c1)...
21,CHEMBL1465049,4.58,Oc1ccc(NCc2ccccc2OCc2ccccc2)cc1
38,CHEMBL1353526,3.68,N#C/C(=C1\C(=O)Nc2ccccc21)c1nc2ccccc2s1
...,...,...,...
49741,CHEMBL1400055,4.24,O=C(Nc1nc2ccc(F)cc2s1)c1nc(S(=O)(=O)Cc2ccccc2F...
49748,CHEMBL1310877,2.67,Cc1cc2ccccn2c1C(=O)c1ccc(NN)c([N+](=O)[O-])c1
49772,CHEMBL1564004,2.86,CC(=O)Oc1cc2oc(=O)c3c(c2cc1Cl)CCC3
49777,CHEMBL3214310,4.14,C/C(=N\OCc1nc2ccccc2c(=O)n1/N=C/c1cc(C)cc([N+]...


In [6]:
# Remove actives whose SMILES string RDKit cannot parse.
def _is_parsable_smiles(smiles):
    """Return True when `smiles` is a string that RDKit parses into a molecule.

    Non-string entries (e.g. NaN for a missing SMILES) are treated as invalid
    rather than handed to RDKit. `Chem.MolFromSmiles` signals an unparsable
    string by returning None, so no exception handling is needed here.
    """
    if not isinstance(smiles, str):
        return False
    return Chem.MolFromSmiles(smiles) is not None


# Boolean mask aligned with `actives`; `astype(bool)` keeps the mask boolean
# even when `actives` is empty, so the selection below never degrades into a
# column lookup.
smiles_is_valid = actives['Smiles'].map(_is_parsable_smiles).astype(bool)

# Selecting with the mask keeps the original index, columns and dtypes, so
# `valid_actives['Smiles']` still exists even if no row survives the filter.
valid_actives = actives.loc[smiles_is_valid].copy()
print("Number of valid actives: ", len(valid_actives))

Number of valid actives:  5811


In [10]:
# Write both compound sets to CSV files, without the DataFrame index column.
# NOTE(review): `valid_actives` was filtered through RDKit, whereas `inactives`
# is written as extracted — confirm that unvalidated inactives are acceptable.
valid_actives.to_csv(path_or_buf='actives.csv', index=False)
inactives.to_csv(path_or_buf='inactives.csv', index=False)

In [11]:
# Pull out just the SMILES column of the validated actives, show it, and
# write it as a bare one-column file (no index, no header row).
smiles_strings = valid_actives.loc[:, 'Smiles']
display(smiles_strings)
smiles_strings.to_csv(path_or_buf='actives_list.csv', header=False, index=False)

2        O=c1c(O)cc(C2Oc3cc(O)cc(O)c3CC2O)cc2c(C3Oc4cc(...
4            Cc1[nH]n(-c2ccccc2)c(=O)c1N=Nc1ccc2c(c1)OCCO2
7        O=C(O)CC/C(=C\c1ccc(N2CCCC2)c([N+](=O)[O-])c1)...
21                         Oc1ccc(NCc2ccccc2OCc2ccccc2)cc1
38                 N#C/C(=C1\C(=O)Nc2ccccc21)c1nc2ccccc2s1
                               ...                        
49741    O=C(Nc1nc2ccc(F)cc2s1)c1nc(S(=O)(=O)Cc2ccccc2F...
49748        Cc1cc2ccccn2c1C(=O)c1ccc(NN)c([N+](=O)[O-])c1
49772                   CC(=O)Oc1cc2oc(=O)c3c(c2cc1Cl)CCC3
49777    C/C(=N\OCc1nc2ccccc2c(=O)n1/N=C/c1cc(C)cc([N+]...
49796                  N#C/C(=C\c1ccc(N2CCOCC2)o1)c1ccccc1
Name: Smiles, Length: 5811, dtype: object