In [1]:
import sys
import csv

import apsw

from rdkit import Chem

def chembl(path, limit=None):
    '''Parse the ChEMBLdb CSV format and return the chembl_id, smiles fields'''

    with open(path, 'rt') as inputfile:
        reader = csv.reader(inputfile, delimiter='\t', skipinitialspace=True)
        next(reader) # skip header line
        
        counter = 0
        #for chembl_id, smiles in reader:
        for chembl_id, smiles, inchi, inchi_key in reader:
            
            # skip problematic compounds
            if len(smiles) > 300: continue
            smiles = smiles.replace('=N#N','=[N+]=[N-]')
            smiles = smiles.replace('N#N=','[N-]=[N+]=')
            if not Chem.MolFromSmiles(smiles): continue
            
            yield chembl_id, smiles
            counter += 1
            if counter == limit:
                break

def createdb(chemicalite_path, chembl_path):
    '''Initialize a database schema and load the ChEMBLdb data'''

    connection = apsw.Connection('paul_chembldb.sql')
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()
    
    cursor.execute("PRAGMA page_size=4096")

    cursor.execute("CREATE TABLE chembl(id INTEGER PRIMARY KEY, "
                   "chembl_id TEXT, smiles TEXT, molecule MOL)")

    cursor.execute("SELECT create_molecule_rdtree('chembl', 'molecule')")

    cursor.execute("BEGIN")
    
    for chembl_id, smiles in chembl(chembl_path):
        cursor.execute("INSERT INTO chembl(chembl_id, smiles, molecule) "
                       "VALUES(?, ?, mol(?))", (chembl_id, smiles, smiles))

    cursor.execute("COMMIT")
    


In [20]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/paul_standardized_chembl_21.txt"
chembldb_sql = "/home/server2/Documents/malaria/paul_chembldb.sql"
createdb(chemicalite_path, chembl_path)

In [21]:
from __future__ import print_function

import sys
import csv

import apsw

def createbfp(chemicalite_path, chembldb_path):
    '''Create indexed virtual tables containing the bfp data'''

    connection = apsw.Connection(chembldb_path)
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()
    
    # sorry for the hard-coded bfp sizes in bytes (128, 64). 
    # I will fix this
    cursor.execute("CREATE VIRTUAL TABLE torsion USING rdtree(id, bfp bytes(128))");
    cursor.execute("CREATE VIRTUAL TABLE morgan USING rdtree(id, bfp bytes(64))");
    cursor.execute("CREATE VIRTUAL TABLE feat_morgan USING rdtree(id, bfp bytes(64))");

    cursor.execute("INSERT INTO torsion(id, bfp) SELECT id, mol_topological_torsion_bfp(molecule) FROM chembl")
    cursor.execute("INSERT INTO morgan(id, bfp) SELECT id, mol_morgan_bfp(molecule, 2) FROM chembl")
    cursor.execute("INSERT INTO feat_morgan(id, bfp) SELECT id, mol_feat_morgan_bfp(molecule, 2) FROM chembl")



In [22]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/paul_standardized_chembl_21.txt"
chembldb_sql = "/home/server2/Documents/malaria/paul_chembldb.sql"
createbfp(chemicalite_path, chembldb_sql)

In [6]:
from __future__ import print_function

import sys
import time

import apsw

def search(c, target, threshold):
    t1 = time.time()
    rs = c.execute(
        "SELECT c.chembl_id, c.smiles, "
        "bfp_tanimoto(mol_morgan_bfp(c.molecule, 2), mol_morgan_bfp(?, 2)) as t "
        "FROM "
        "chembl as c JOIN "
        "(SELECT id FROM morgan WHERE "
        "id match rdtree_tanimoto(mol_morgan_bfp(?, 2), ?)) as idx "
        "USING(id) ORDER BY t DESC",
        (target, target, threshold)).fetchall()
    t2 = time.time()
    return rs, t2-t1

def tanimoto_search(chemicalite_path, chembldb_sql, target, threshold):
    connection = apsw.Connection(chembldb_sql)
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()

    print('searching for target:', target)

    matches, t = search(cursor, target, float(threshold))
    for match in matches:
        print(match[0], match[1], match[2])
    print('Found {0} matches in {1} seconds'.format(len(matches), t))




In [None]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/paul_standardized_chembl_21.txt"
chembldb_sql = "/home/server2/Documents/malaria/paul_chembldb.sql"
target = "Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl"
threshold = 0.9
tanimoto_search(chemicalite_path, chembldb_sql, target, threshold)

searching for target: Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl


In [2]:
import pandas as pd
df = pd.read_csv("GAMO_PFdata_200115.csv")


In [3]:
smiles = df['smiles'] #13403
orig_TCMDCID = df['ORIG_TCMDCID']

In [39]:
chemicalite_path = "/home/server2/Documents/malaria/chembl_21_sqlite/chemicalite/chemicalite.so"
chembl_path = "/home/server2/Documents/malaria/paul_standardized_chembl_21.txt"
chembldb_sql = "/home/server2/Documents/malaria/paul_chembldb.sql"
target = "Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl"
threshold = 0.90

def tanimoto_search(chemicalite_path, chembldb_sql, target, threshold):
    connection = apsw.Connection(chembldb_sql)
    connection.enableloadextension(True)
    connection.loadextension(chemicalite_path)
    connection.enableloadextension(False)

    cursor = connection.cursor()

    print('searching for target:', target)

    matches, t = search(cursor, target, float(threshold))
    for match in matches:
        print(match[0], match[1], match[2])
    print('Found {0} matches in {1} seconds'.format(len(matches), t))



results = []
for i in range(10):
    matrix = tanimoto_search(chemicalite_path, chembldb_sql, smiles[i], threshold)
    print(matrix)
    
    

searching for target: Cc1nc(N)nc(N)c1OCCCOc1cccc(Cl)c1Cl
CHEMBL534824 Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl 1.0
CHEMBL579606 Cl.Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl 0.95
Found 2 matches in 0.0659239292145 seconds
None
searching for target: CC1(C)NC(N)=NC(=N)N1OCCCOc1c(Cl)cc(Cl)cc1Cl
CHEMBL530506 CC1(C)NC(=NC(=N)N1OCCCOc2c(Cl)cc(Cl)cc2Cl)N 1.0
CHEMBL532396 Br.CC1(C)NC(=NC(=N)N1OCCCOc2c(Cl)cc(Cl)cc2Cl)N 0.953488372093
Found 2 matches in 0.0416920185089 seconds
None
searching for target: CC1(C)NC(N)=NC(=N)N1OCCOc1cccc(c1)C(F)(F)F
CHEMBL580973 CC1(C)NC(=NC(=N)N1OCCOc2cccc(c2)C(F)(F)F)N 1.0
CHEMBL548336 Br.CC1(C)NC(=NC(=N)N1OCCOc2cccc(c2)C(F)(F)F)N 0.96
Found 2 matches in 0.0294108390808 seconds
None
searching for target: Cc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1C
CHEMBL533166 Cc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1C 1.0
CHEMBL530088 Cl.Cc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1C 0.952380952381
Found 2 matches in 0.0576150417328 seconds
None
searching for target: CCC(C)(C)n1c(C)cc2c1ccc1nc(N)nc(N)c21
CHEMBL424170 CCC(

In [110]:
%cd 0.7

/home/server2/Documents/malaria/0.7


In [4]:
connection = apsw.Connection(chembldb_sql)
connection.enableloadextension(True)
connection.loadextension(chemicalite_path)
connection.enableloadextension(False)

cursor = connection.cursor()
#target = "CCc1cccc(OCCCON2C(=N)N=C(N)NC2(C)C)c1"
threshold = 0.6
print('searching for target:', target)

matches, t = search(cursor, target, float(threshold))
results = []

for i in range(13403):
    #matrix = tanimoto_search(chemicalite_path, chembldb_sql, smiles[i], threshold)
    matches, t = search(cursor, smiles[i], float(threshold))
    #data_frame = pd.DataFrame(results[i], columns=["CHEMBL_ID", "SMILES", "Similarity"])
    results.append(matches)


NameError: name 'chembldb_sql' is not defined

In [28]:
%cd ChEMBL_FPdata_standardized_Tanimoto_0.6/

/home/server2/Documents/malaria/ChEMBL_FPdata_standardized_Tanimoto_0.7


In [29]:
import pandas as pd
import numpy as np

csvs = []

for i in range(13403):
    data_frame = pd.DataFrame(results[i], columns=["CHEMBL_ID", "SMILES", "Similarity"]) 
    ##data_frame.to_csv(str(smiles[i].split("\\/")[0]) + ".csv", sep = '\t', index = False)
    data_frame.to_csv(orig_TCMDCID[i] + ".csv", sep = '\t', index = False)
    #csvs.append(data_frame)
    #smiles[1].split("\\")
    #df.to_csv(data_frame + ".csv", sep='\t')
    

In [108]:
for i in range(10):
    

Unnamed: 0,CHEMBL_ID,SMILES,Similarity
0,CHEMBL533166,Cc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1C,1.0
1,CHEMBL530088,Cl.Cc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1C,0.952381
2,CHEMBL548854,COc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1OC,0.765957
3,CHEMBL527580,Cl.COc1ccc(CON2C(=N)N=C(N)NC2(C)C)cc1OC,0.734694
4,CHEMBL582777,CC1(C)NC(=NC(=N)N1OCc2ccc(cc2)c3ccccc3)N,0.717391
5,CHEMBL528095,Cc1cc(C)c(CON2C(=N)N=C(N)NC2(C)C)cc1C,0.717391


In [91]:
pd.DataFrame(results[0], columns=["CHEMBL_ID", "SMILES", "Similarity"])

Unnamed: 0,CHEMBL_ID,SMILES,Similarity
0,CHEMBL534824,Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl,1.0
1,CHEMBL579606,Cl.Cc1nc(N)nc(N)c1OCCCOc2cccc(Cl)c2Cl,0.95
2,CHEMBL529285,Cc1nc(N)nc(N)c1OCCCOc2ccccc2Br,0.72093


In [142]:
words = smiles[759].split("-")[0]  

In [143]:
words

'COCCNCc1cccc(c1)'

In [150]:
%cd ..

/home/server2/Documents/malaria


In [151]:
df_2 = pd.read_csv("ChEMBL_PFdata-all_200115.csv")

In [154]:
smiles_2 = df_2['smiles'] #31307
ORIG_chemblID = df_2['ORIG_CHEMBLID']


In [155]:
connection = apsw.Connection(chembldb_sql)
connection.enableloadextension(True)
connection.loadextension(chemicalite_path)
connection.enableloadextension(False)

cursor = connection.cursor()
#target = "CCc1cccc(OCCCON2C(=N)N=C(N)NC2(C)C)c1"
threshold = 0.7
#print('searching for target:', target)

matches, t = search(cursor, target, float(threshold))
results = []

for i in range(31307):
    #matrix = tanimoto_search(chemicalite_path, chembldb_sql, smiles[i], threshold)
    matches, t = search(cursor, smiles_2[i], float(threshold))
    #data_frame = pd.DataFrame(results[i], columns=["CHEMBL_ID", "SMILES", "Similarity"])
    results.append(matches)
    



In [None]:
#%cd ChEMBL_PFData_Tanimoto_0.7_cutoff/
import pandas as pd
import numpy as np

csvs = []

for i in range(31307):
    data_frame = pd.DataFrame(results[i], columns=["CHEMBL_ID", "SMILES", "Similarity"]) 
    ##data_frame.to_csv(str(smiles[i].split("\\/")[0]) + ".csv", sep = '\t', index = False)
    data_frame.to_csv(ORIG_chemblID[i] + ".csv", sep = '\t', index = False)
    #csvs.append(data_frame)
    #smiles[1].split("\\")
    #df.to_csv(data_frame + ".csv", sep='\t')

In [156]:
%cd ChEMBL_PFData_Tanimoto_0.7_cutoff/

/home/server2/Documents/malaria/ChEMBL_PFData_Tanimoto_0.7_cutoff


In [157]:
#%cd ChEMBL_PFData_Tanimoto_0.7_cutoff/
import pandas as pd
import numpy as np

csvs = []

for i in range(31307):
    data_frame = pd.DataFrame(results[i], columns=["CHEMBL_ID", "SMILES", "Similarity"]) 
    ##data_frame.to_csv(str(smiles[i].split("\\/")[0]) + ".csv", sep = '\t', index = False)
    data_frame.to_csv(ORIG_chemblID[i] + ".csv", sep = '\t', index = False)
    #csvs.append(data_frame)
    #smiles[1].split("\\")
    #df.to_csv(data_frame + ".csv", sep='\t')