In [2]:
import sys
import csv

import apsw

from rdkit import Chem

def chembl(path, limit=None):
    '''Parse the ChEMBLdb CSV format and return the chembl_id, smiles fields'''

    with open(path, 'rt') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t', skipinitialspace=True)
        next(reader) # skip header line
        
        counter = 0
        for chembl_id, smiles, inchi, inchi_key in reader:
            
            # skip problematic compounds
            if len(smiles) > 300: continue
            smiles = smiles.replace('=N#N','=[N+]=[N-]')
            smiles = smiles.replace('N#N=','[N-]=[N+]=')
            if not Chem.MolFromSmiles(smiles): continue
            
            yield chembl_id, smiles
            counter += 1
            if counter == limit:
                break

def createdb(chemicalite_path, chembl_path):
    '''Initialize a database schema and load the ChEMBLdb data'''

    connection = apsw.Connection('chembldb.sql')
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()
    
    cursor.execute("PRAGMA page_size=4096")

    cursor.execute("CREATE TABLE chembl(id INTEGER PRIMARY KEY, "
                   "chembl_id TEXT, smiles TEXT, molecule MOL)")

    cursor.execute("SELECT create_molecule_rdtree('chembl', 'molecule')")

    cursor.execute("BEGIN")
    
    for chembl_id, smiles in chembl(chembl_path):
        cursor.execute("INSERT INTO chembl(chembl_id, smiles, molecule) "
                       "VALUES(?, ?, mol(?))", (chembl_id, smiles, smiles))

    cursor.execute("COMMIT")
    


In [7]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/chembl_21_chemreps.txt"
chembldb_sql = "/home/server2/Documents/malaria/chembldb.sql"
createdb(chemicalite_path, chembl_path)

In [12]:
from __future__ import print_function

import sys
import csv

import apsw

def createbfp(chemicalite_path, chembldb_path):
    '''Create indexed virtual tables containing the bfp data'''

    connection = apsw.Connection(chembldb_path)
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()
    
    # sorry for the hard-coded bfp sizes in bytes (128, 64). 
    # I will fix this
    cursor.execute("CREATE VIRTUAL TABLE torsion USING rdtree(id, bfp bytes(128))");
    cursor.execute("CREATE VIRTUAL TABLE morgan USING rdtree(id, bfp bytes(64))");
    cursor.execute("CREATE VIRTUAL TABLE feat_morgan USING rdtree(id, bfp bytes(64))");

    cursor.execute("INSERT INTO torsion(id, bfp) SELECT id, mol_topological_torsion_bfp(molecule) FROM chembl")
    cursor.execute("INSERT INTO morgan(id, bfp) SELECT id, mol_morgan_bfp(molecule, 2) FROM chembl")
    cursor.execute("INSERT INTO feat_morgan(id, bfp) SELECT id, mol_feat_morgan_bfp(molecule, 2) FROM chembl")



In [14]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/chembl_21_chemreps.txt"
chembldb_sql = "/home/server2/Documents/malaria/chembldb.sql"
createbfp(chemicalite_path, chembldb_sql)

In [15]:
from __future__ import print_function

import sys
import time

import apsw

def search(c, target, threshold):
    t1 = time.time()
    rs = c.execute(
        "SELECT c.chembl_id, c.smiles, "
        "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2), mol_morgan_bfp(?, 2)) as t "
        "FROM "
        "chembl as c JOIN "
        "(SELECT id FROM morgan WHERE "
        "id match rdtree_tanimoto(mol_morgan_bfp(?, 2), ?)) as idx "
        "USING(id) ORDER BY t DESC",
        (target, target, threshold)).fetchall()
    t2 = time.time()
    return rs, t2-t1

def tanimoto_search(chemicalite_path, chembldb_sql, target, threshold):
    connection = apsw.Connection(chembldb_sql)
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()

    print('searching for target:', target)

    matches, t = search(cursor, target, float(threshold))
    for match in matches:
        print(match[0], match[1], match[2])
    print('Found {0} matches in {1} seconds'.format(len(matches), t))




In [17]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/chembl_21_chemreps.txt"
chembldb_sql = "/home/server2/Documents/malaria/chembldb.sql"
target = "Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl"
threshold = 0.5
tanimoto_search(chemicalite_path, chembldb_sql, target, threshold)

searching for target: Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl
CHEMBL534824 Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl 1.0
CHEMBL579606 Cl.Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl 0.95
CHEMBL529285 Cc1nc(N)nc(N)c1OCCCOc2ccccc2Br 0.720930232558
CHEMBL528653 COc1cccc(OC)c1OCCCOc2c(C)nc(N)nc2N 0.697674418605
CHEMBL533051 Cl.Cc1nc(N)nc(N)c1OCCCOc2ccccc2Br 0.688888888889
CHEMBL533255 Cl.COc1cccc(OC)c1OCCCOc2c(C)nc(N)nc2N 0.666666666667
CHEMBL529099 Cc1nc(N)nc(N)c1CCCOc2ccccc2Cl 0.608695652174
CHEMBL581298 Cc1nc(N)nc(N)c1OCCCCOc2ccccc2 0.577777777778
CHEMBL547451 Cc1nc(N)nc(N)c1OCCCCCCOc2ccccc2 0.565217391304
CHEMBL548345 Cl.Cc1nc(N)nc(N)c1OCCCCCCOc2ccccc2 0.541666666667
CHEMBL582553 O.Cl.Cc1nc(N)nc(N)c1OCCCCOc2ccccc2 0.541666666667
CHEMBL528695 Cc1nc(N)nc(N)c1CCCOc2ccccc2Br 0.541666666667
CHEMBL580655 Cc1nc(N)nc(N)c1OCCCOc2ccc(Br)cc2CC=C 0.528301886792
CHEMBL586460 Cc1ccccc1OCCCc2c(C)nc(N)nc2N 0.520833333333
CHEMBL529701 COc1ccccc1COc2c(C)nc(N)nc2N 0.510204081633
CHEMBL549004 CCc1nc(N)nc(N)c1CCCOc2ccccc2Cl 0.5

In [18]:
import pandas as pd
df = pd.read_csv("GAMO_PFdata_200115.csv")


In [21]:
smiles = df['smiles'] #13403
smiles[0]

'Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl'

In [None]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/chembl_21_chemreps.txt"
chembldb_sql = "/home/server2/Documents/malaria/chembldb.sql"
target = "Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl"
threshold = 0.5
results = []
for i in range(13404):
    matrix = tanimoto_search(chemicalite_path, chembldb_sql, smiles[i], threshold)
    results.append(matrix)
    
    

searching for target: Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl
CHEMBL534824 Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl 1.0
CHEMBL579606 Cl.Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl 0.95
CHEMBL529285 Cc1nc(N)nc(N)c1OCCCOc2ccccc2Br 0.720930232558
CHEMBL528653 COc1cccc(OC)c1OCCCOc2c(C)nc(N)nc2N 0.697674418605
CHEMBL533051 Cl.Cc1nc(N)nc(N)c1OCCCOc2ccccc2Br 0.688888888889
CHEMBL533255 Cl.COc1cccc(OC)c1OCCCOc2c(C)nc(N)nc2N 0.666666666667
CHEMBL529099 Cc1nc(N)nc(N)c1CCCOc2ccccc2Cl 0.608695652174
CHEMBL581298 Cc1nc(N)nc(N)c1OCCCCOc2ccccc2 0.577777777778
CHEMBL547451 Cc1nc(N)nc(N)c1OCCCCCCOc2ccccc2 0.565217391304
CHEMBL548345 Cl.Cc1nc(N)nc(N)c1OCCCCCCOc2ccccc2 0.541666666667
CHEMBL582553 O.Cl.Cc1nc(N)nc(N)c1OCCCCOc2ccccc2 0.541666666667
CHEMBL528695 Cc1nc(N)nc(N)c1CCCOc2ccccc2Br 0.541666666667
CHEMBL580655 Cc1nc(N)nc(N)c1OCCCOc2ccc(Br)cc2CC=C 0.528301886792
CHEMBL586460 Cc1ccccc1OCCCc2c(C)nc(N)nc2N 0.520833333333
CHEMBL529701 COc1ccccc1COc2c(C)nc(N)nc2N 0.510204081633
CHEMBL549004 CCc1nc(N)nc(N)c1CCCOc2ccccc2Cl 0.5