Data is extracted from chembl_24.db (from [ChEMBL database](https://www.ebi.ac.uk/chembl/downloads)). We then parse the data and save data into pickle file for use in deep learning of the structures.

Import packages

In [1]:
import numpy as np
import pandas as pd
import os
import sqlite3
import pickle
import rdkit.Chem as chem

from random import shuffle

Connect database using sqlite3 package

In [2]:
db = sqlite3.connect('chembl_24.db')
c = db.cursor()

Import doc_id from chembl_24.db, then use doc_id to extract molregno (unique internal Chembl compound identifier)

In [3]:
categories = ['%toxin%', '%fungicidal%', '%nematicidal%', '%herbicidal%', '%insecticidal%']

In [4]:
molregno = dict.fromkeys(categories, None)
for cat in categories:
    # extract doc_id from assays that contain agrochemical and non-agrochemical keywords
    doc_id = c.execute("SELECT doc_id FROM assays where description like '%s'" %cat).fetchall()
    doc_id = [i[0] for i in doc_id]
    
    # extract unique compound identifier from doc_id
    molregno[cat] = c.execute("SELECT molregno FROM compound_records WHERE doc_id IN " + str(tuple(doc_id))).fetchall()
    molregno[cat] = [i[0] for i in molregno[cat]]
    print ("%s" %cat[1:-1], ":", len(molregno[cat]))

toxin : 541388
fungicidal : 4678
nematicidal : 555
herbicidal : 3715
insecticidal : 5987


Dispose data that overlaps 

In [5]:
%%time
for i, cat_a in enumerate(categories):
    for j, cat_b in enumerate(categories):
        if (i < j):
            intersection = list(set(molregno[cat_a])&set(molregno[cat_b]))
            molregno[cat_a] = [x for x in molregno[cat_a] if x not in intersection]

CPU times: user 6.19 s, sys: 48.9 ms, total: 6.23 s
Wall time: 6.24 s


Shuffle toxin data and then take as many toxin (non-agrochemical) data as the other data added together

In [6]:
shuffle(molregno['%toxin%'])
molregno['%toxin%'] = molregno["%toxin%"][:15000]

Then, we get canonical smiles string and compound properties from molregno compound identifier. We check to make sure the smiles strings and properties are mapped exactly.

In [7]:
smiles_string = dict.fromkeys(categories, None)
mw_freebase_dict = dict.fromkeys(categories, None)
alogp_dict = dict.fromkeys(categories, None)
hba_dict = dict.fromkeys(categories, None)
hbd_dict = dict.fromkeys(categories, None)
psa_dict = dict.fromkeys(categories, None)
rtb_dict = dict.fromkeys(categories, None)
acd_logp_dict = dict.fromkeys(categories, None)
acd_logd_dict = dict.fromkeys(categories, None)
full_mwt_dict = dict.fromkeys(categories, None)
aromatic_rings_dict = dict.fromkeys(categories, None)
heavy_atoms_dict = dict.fromkeys(categories, None)
qed_weighted_dict = dict.fromkeys(categories, None)
mw_monoisotopic_dict = dict.fromkeys(categories, None)
hba_lipinski_dict = dict.fromkeys(categories, None)
hbd_lipinski_dict = dict.fromkeys(categories, None)

In [8]:
for cat in molregno:
    smiles_string[cat], mw_freebase_dict[cat], alogp_dict[cat], hba_dict[cat], hbd_dict[cat], psa_dict[cat] = [], [], [], [], [], []
    rtb_dict[cat], acd_logp_dict[cat], acd_logd_dict[cat], full_mwt_dict[cat], aromatic_rings_dict[cat], heavy_atoms_dict[cat] = [], [], [], [], [], []
    qed_weighted_dict[cat], mw_monoisotopic_dict[cat], hba_lipinski_dict[cat], hbd_lipinski_dict[cat] = [], [], [], []
    for num in molregno[cat]:
        smile = c.execute("SELECT canonical_smiles FROM compound_structures WHERE molregno = " + str(num)).fetchall()
        properties = c.execute("SELECT * FROM compound_properties WHERE molregno = " + str(num)).fetchall()
        if not smile or not properties:
            molregno[cat].remove(num)
        else:
            properties = properties[0]
            smiles_string[cat].append(smile[0])
            
            # assign properties to corresponding dictionaries
            mw_freebase_dict[cat].append(properties[1])
            alogp_dict[cat].append(properties[2])
            hba_dict[cat].append(properties[3])
            hbd_dict[cat].append(properties[4])
            psa_dict[cat].append(properties[5])
            rtb_dict[cat].append(properties[6])
            acd_logp_dict[cat].append(properties[11])
            acd_logd_dict[cat].append(properties[12])
            full_mwt_dict[cat].append(properties[14])
            aromatic_rings_dict[cat].append(properties[15])
            heavy_atoms_dict[cat].append(properties[16])
            qed_weighted_dict[cat].append(properties[17])
            mw_monoisotopic_dict[cat].append(properties[18])
            hba_lipinski_dict[cat].append(properties[20])
            hbd_lipinski_dict[cat].append(properties[21])

Convert smiles string and the properties into a long list and then create another list containing their corresponding categorical name

In [9]:
canonical_smiles, label = [], []
mw_freebase, alogp, hba, hbd, psa, rtb, acd_logp, acd_logd, full_mwt, aromatic_rings, heavy_atoms, qed_weighted, mw_monoisotopic, hba_lipinski, hbd_lipinski = [],[],[],[],[],[],[],[],[],[],[],[],[],[],[],
for cat in smiles_string:
    canonical_smiles += smiles_string[cat]
    label += [cat[1:-1]]*len(smiles_string[cat])
    
    mw_freebase += mw_freebase_dict[cat]
    alogp += alogp_dict[cat]
    hba += hba_dict[cat]
    hbd += hbd_dict[cat]
    psa += psa_dict[cat]
    rtb += rtb_dict[cat]
    acd_logp += acd_logp_dict[cat]
    acd_logd += acd_logd_dict[cat]
    full_mwt += full_mwt_dict[cat]
    aromatic_rings += aromatic_rings_dict[cat]
    heavy_atoms += heavy_atoms_dict[cat]
    qed_weighted += qed_weighted_dict[cat]
    mw_monoisotopic += mw_monoisotopic_dict[cat]
    hba_lipinski += hba_lipinski_dict[cat]
    hbd_lipinski += hbd_lipinski_dict[cat]
    

Stack the two lists together

In [10]:
data = np.column_stack((canonical_smiles, label, mw_freebase, alogp, hba, hbd, psa, rtb, acd_logp, acd_logd, full_mwt, aromatic_rings, heavy_atoms, qed_weighted, mw_monoisotopic, hba_lipinski, hbd_lipinski))

Convert data into pandas dataframe

In [11]:
data = pd.DataFrame(data, columns=['smiles', 'category', 'mw_freebase', 'alogp', 'hba', 'hbd', 'psa', 'rtb', 'acd_logp', 'acd_logd', 'full_mwt', 'aromatic_rings', 'heavy_atoms', 'qed_weighted', 'mw_monoisotopic', 'hba_lipinski', 'hbd_lipinski'])

Add a column containing RDKit Molecule class

In [12]:
%%time
data['mol'] = data['smiles'].apply(chem.MolFromSmiles)

CPU times: user 10.1 s, sys: 397 ms, total: 10.5 s
Wall time: 10.6 s


Add another column that differentiates non-agrochemical and agrochemical category

In [13]:
def agrochemical(x):
    if x == 'toxin':
        return 0
    return 1

In [14]:
%%time
data['agrochemical'] = data['category'].apply(agrochemical)

CPU times: user 13.9 ms, sys: 1.46 ms, total: 15.3 ms
Wall time: 14.5 ms


Remove null values

In [15]:
data.dropna(axis=0, inplace=True)

Count number of compounds in agrochemical and non-agrochemical category

In [16]:
data['agrochemical'].value_counts()

0    14676
1    13587
Name: agrochemical, dtype: int64

In [17]:
data

Unnamed: 0,smiles,category,mw_freebase,alogp,hba,hbd,psa,rtb,acd_logp,acd_logd,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,hba_lipinski,hbd_lipinski,mol,agrochemical
0,Cl.O=C(NCc1ccncc1)[C@@H]2CCCN2C(=O)[C@@H]3CCCN3,toxin,302.38,0.44,4,2,74.33,4,1.04,-0.77,338.84,1,22,0.85,302.174,6,2,<rdkit.Chem.rdchem.Mol object at 0x114c4ec60>,0
1,CCN(CC)S(=O)(=O)c1ccc(N\N=C\c2cc(OC)c(OC)cc2OC...,toxin,422.51,2.58,8,1,102.35,10,5.19,5.19,422.51,2,29,0.46,422.162,9,1,<rdkit.Chem.rdchem.Mol object at 0x114c4ef80>,0
2,Cc1ccc(cc1)S(=O)(=O)NCC2(CCOCC2)c3ccccc3,toxin,345.46,3.02,3,1,55.4,5,3.31,3.31,345.46,2,24,0.91,345.14,4,1,<rdkit.Chem.rdchem.Mol object at 0x10fde58f0>,0
3,NC(=O)COC(=O)c1c2CCCc2nc3ccccc13,toxin,270.29,1.37,4,1,82.28,3,1.63,1.63,270.29,2,20,0.85,270.1,5,2,<rdkit.Chem.rdchem.Mol object at 0x10fde5b70>,0
4,CCCCc1ccc(nc1)c2nn[nH]n2,toxin,203.25,1.6,4,1,67.35,4,2.58,0.58,203.25,2,15,0.82,203.117,5,1,<rdkit.Chem.rdchem.Mol object at 0x10fde5cb0>,0
5,NC(=O)c1cn(nc1C2=Cc3ccccc3OC2=O)c4ccccc4,toxin,331.33,2.74,5,1,91.12,3,1.89,1.89,331.33,4,25,0.58,331.096,6,2,<rdkit.Chem.rdchem.Mol object at 0x10fde5940>,0
6,NNC(=O)c1cc(nc2ccc(Br)cc12)c3ccc(N)cc3,toxin,357.21,2.85,4,3,94.03,2,2.02,2.02,357.21,3,22,0.28,356.027,5,5,<rdkit.Chem.rdchem.Mol object at 0x114dcf080>,0
7,CCOC(=O)C(C(=O)OCC)S(=O)(=O)C,toxin,238.26,-0.47,6,0,86.74,5,1.01,0.46,238.26,0,15,0.47,238.051,6,0,<rdkit.Chem.rdchem.Mol object at 0x114dcf3f0>,0
8,Brc1ccccc1C(=O)Nc2nnc(s2)S(=O)(=O)N3CCCCCC3,toxin,445.36,3.12,6,1,92.26,4,3.47,2.77,445.36,2,25,0.73,443.993,7,1,<rdkit.Chem.rdchem.Mol object at 0x114dcf300>,0
9,CC\N=C(\S)/N\N=C\c1ccccc1[N+](=O)[O-],toxin,252.3,1.82,4,2,79.89,4,2.42,-0.08,252.3,1,17,0.28,252.068,6,1,<rdkit.Chem.rdchem.Mol object at 0x114dcf030>,0


Save data as pickle file

In [18]:
data = data.to_pickle("./data.pkl")