### Search enumerated structures within the Drugbank database

In [1]:
import pandas as pd
from rdkit import Chem
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp

from tqdm.notebook import tqdm

### Load structures from Drugbank - can be replaced by any other database.
Downloaded from https://go.drugbank.com/releases/latest#structures.

In [2]:
# Read the SD file and keep only the truthy entries
# (records RDKit could not parse are presumably returned as None - TODO confirm).
drugs = [mol for mol in Chem.SDMolSupplier("./drugbank_1000.sdf") if mol]

# cleanup: sanitize every loaded molecule in place
for mol in drugs:
    Chem.SanitizeMol(mol)


In [3]:
# Table of enumerated SMILES; the first CSV column is used as the index.
data = pd.read_csv(
    "./data_files/smiles_min_dist_natoms.csv",
    index_col=0,
)

### A function to search for motif matches in the DrugBank database

In [4]:
# Count-only variant: returns just the number of matching structures - used for the entire DrugBank
def rowfunc(smiles):
    """Count the molecules in ``drugs`` that contain ``smiles`` as a substructure.

    Parameters
    ----------
    smiles : str
        SMILES string of the query motif.

    Returns
    -------
    int
        Number of molecules in the module-level ``drugs`` list for which
        ``HasSubstructMatch`` reports a match with the query.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    substruct = Chem.MolFromSmiles(smiles)
    if substruct is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a message naming the input instead of inside SanitizeMol.
        raise ValueError(f"could not parse SMILES: {smiles!r}")
    Chem.SanitizeMol(substruct)

    # The query used to be Kekulized, but this is currently not done;
    # SanitizeMol will cause aromatic bonds to be encoded as such.
    # Chem.Kekulize(substruct, clearAromaticFlags=True)

    # Search the query through all the structures in the database.
    return sum(
        1 for database_mol in drugs if database_mol.HasSubstructMatch(substruct)
    )

### Run the search

In [5]:
# 10 minutes on single core

# Run rowfunc on every SMILES in the table, showing a progress bar.
result = [rowfunc(query) for query in tqdm(data.smiles)]


  0%|          | 0/57230 [00:00<?, ?it/s]

In [6]:
# Attach the per-SMILES match counts as a new column.
data = data.assign(drugbank_matches=result)

In [7]:
# Quick look at the first five rows of the table
data.head(n=5)

Unnamed: 0,smiles,min_dist_all,natoms,drugbank_matches
0,COON,7.0,4,0
1,CONO,7.0,4,2
2,CON=O,8.0,4,2
3,NOCO,5.0,4,0
4,C1ONO1,6.0,4,0


In [8]:
# Write the table to CSV as a backup
data.to_csv(
    "./data_files/smiles_min_dist_dbank.csv",
)