### Search enumerated structures within the Drugbank database

In [2]:
import pandas as pd
from rdkit import Chem
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp

from tqdm.notebook import tqdm

from tqdm.contrib.concurrent import process_map



### Load structures from DrugBank (any other structure database can be substituted)
The SDF file was downloaded from https://go.drugbank.com/releases/latest#structures

In [3]:
# Read the DrugBank SDF and keep only the entries the supplier returned as
# usable molecules; falsy entries are dropped.
drugs = [mol for mol in Chem.SDMolSupplier("./drugbank_structures.sdf") if mol]

# cleanup: run RDKit sanitization on every molecule that was kept
for d in drugs:
    Chem.SanitizeMol(d)

# Kekulization removed. The call that used to run inside the loop was:
#     Chem.Kekulize(d,clearAromaticFlags=True)

RDKit ERROR: [19:38:52] Explicit valence for atom # 0 Cl, 5, is greater than permitted
RDKit ERROR: [19:38:52] ERROR: Could not sanitize molecule ending on line 288741
RDKit ERROR: [19:38:52] ERROR: Explicit valence for atom # 0 Cl, 5, is greater than permitted
RDKit ERROR: [19:38:52] Explicit valence for atom # 39 N, 5, is greater than permitted
RDKit ERROR: [19:38:52] ERROR: Could not sanitize molecule ending on line 327780
RDKit ERROR: [19:38:52] ERROR: Explicit valence for atom # 39 N, 5, is greater than permitted
RDKit ERROR: [19:38:52] Explicit valence for atom # 19 O, 3, is greater than permitted
RDKit ERROR: [19:38:52] ERROR: Could not sanitize molecule ending on line 398763
RDKit ERROR: [19:38:52] ERROR: Explicit valence for atom # 19 O, 3, is greater than permitted
RDKit ERROR: [19:38:52] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [19:38:52] ERROR: Could not sanitize molecule ending on line 522182
RDKit ERROR: [19:38:52] ERROR: Explicit valence

In [4]:
# Load the table of enumerated SMILES; the first CSV column becomes the index.
# The `smiles` column of this frame is what gets searched against DrugBank below.
data = pd.read_csv("./data_files/smiles_min_dist_natoms.csv",index_col=0)

### A function to count motif matches in the DrugBank database

In [8]:
def rowfunc(smiles):
    """Count how many structures in ``drugs`` contain ``smiles`` as a substructure.

    This is the count-only variant: it returns just the number of matching
    database structures, not the structures themselves.

    Parameters
    ----------
    smiles : str
        SMILES string of the motif to search for.

    Returns
    -------
    int
        Number of molecules in the notebook-level ``drugs`` list for which
        ``HasSubstructMatch`` reports a match.

    Raises
    ------
    ValueError
        If ``smiles`` cannot be parsed into a molecule.

    Notes
    -----
    ``drugs`` is read from the notebook's global namespace rather than passed
    in, so the cell that loads it must have been run before this is called.
    """
    substruct = Chem.MolFromSmiles(smiles)
    if substruct is None:
        # An unparsable SMILES yields None; report the offending input here
        # instead of letting SanitizeMol fail with an opaque argument error.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # SanitizeMol will cause aromatic bonds to be encoded as such.
    Chem.SanitizeMol(substruct)

    # The query used to be kekulized here, but that is currently not applied:
    #     Chem.Kekulize(substruct,clearAromaticFlags=True)

    # Search the motif through all the structures in the database.
    return sum(
        1 for database_mol in drugs if database_mol.HasSubstructMatch(substruct)
    )

### Run the search

In [12]:
# Runtime note: about 20 minutes at chunksize = 10000, max_workers = 24.
# Every SMILES in `data` is handed to rowfunc through process_map; `result`
# collects one match count per SMILES.
result = process_map(
    rowfunc,
    data["smiles"].tolist(),
    max_workers=24,
    chunksize=10000,
)


  0%|          | 0/222739 [00:00<?, ?it/s]



In [13]:
# Attach the match counts as a new column.
# Assumes `result` is ordered like the data.smiles list passed to process_map — confirm.
data["drugbank_matches"] = result

In [15]:
# Preview the first rows to sanity-check the new drugbank_matches column.
data.head()

Unnamed: 0,smiles,min_dist_all,natoms,drugbank_matches
0,COON,6.0,4,0
1,CONO,6.0,4,18
2,CON=O,7.0,4,19
3,NOCO,4.0,4,4
4,C1ONO1,5.0,4,0


In [16]:
# Save the table, now including drugbank_matches, to CSV as a backup of the
# long-running search.
data.to_csv("./data_files/smiles_min_dist_dbank.csv")