In [1]:
import os
import gc
from multiprocessing import Pool
from functools import partial
import numpy as np
import glob

from rdkit import Chem

from SmilesPE.pretokenizer import atomwise_tokenizer
from SmilesPE.pretokenizer import kmer_tokenizer

import string
import tqdm
import pandas as pd

import requests

In [2]:
def fetch_smiles(processed_query):
    URL = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/" + processed_query + "/property/CanonicalSMILES/json"
    r = requests.get(URL) 
    cid_smiles = {}
    try:
        for entry in r.json()['PropertyTable']['Properties']:
            cid_smiles[entry["CID"]] = entry["CanonicalSMILES"]
    except:
        print (r)
    return (cid_smiles)

def get_smiles_from_cid(query):
    assert(type(query) == int or type(query) == list or type(query) == dict)
    
    
    if type(query) == list or type(query) == dict:
        output_dict = {}
        processed_query = ""
        loop = tqdm.tqdm(enumerate(query), total=len(query),leave=False)
        for i,cid in loop:
            try:
                cid = int(cid)
            except:
                return ("Error in CID = " + str(cid))
            processed_query += str(cid) + ","
            
            if (i + 1) % 500 == 0 or (i + 1) == len(query):
                processed_query = processed_query[:-1]
                fetched_dict = fetch_smiles(processed_query)
                output_dict.update(fetched_dict)
                processed_query = ""
                
        return output_dict

    if type(query) == int:
        processed_query = str(query)
        for entry in r.json()['PropertyTable']['Properties']:
            cid_smiles =  entry["CanonicalSMILES"]
            
        return cid_smiles

In [3]:
# To remove rdkit warning
from rdkit import RDLogger
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

input_file = "cid_cluster_5338.txt"
open_file = open(input_file,"r").readlines()
cid_from_file = []
for i,entry in enumerate(open_file):
    cid_from_file.append(entry.split()[0])
cid_smiles_web = get_smiles_from_cid(cid_from_file)

                                                    

In [4]:
input_file = "ML_input_5338.txt"
open_file = open(input_file,"r").readlines()
smiles_from_file = []
for i,entry in enumerate(open_file):
    smiles_from_file.append(entry.split()[0])

In [5]:
len(cid_smiles_web),len(smiles_from_file)

(5223, 5223)

In [6]:
def sanitize_molecule(output_type=None,smiles=None):
    molecule = Chem.MolFromSmiles(smiles,sanitize=False)
    if molecule is None:
        return None
    else:
        try:
            Chem.SanitizeMol(molecule)
            if output_type == "canonical":
                return Chem.MolToSmiles(molecule)
            else:
                return smiles
        except:
            return None

        
# List or dataframe with header ("Smiles,Label") as input for sanity check
# output_type=None (canonical - for canonical smiles output)
# Number_of_workers(1) = to run in pool of threads
def sanity_check(df,output_type = None,Number_of_workers = 1):
    if type(df) == list:
        func = partial(sanitize_molecule,output_type)
        
        p = Pool(Number_of_workers)
        clean_smiles = list(tqdm.tqdm(p.imap(func, df), total=len(df),leave=False))
        p.close()
        
        return clean_smiles
    
    else:
        labels = []
        for label in df.groupby('Label'):
            labels.append(label[0])
        
        clean_smiles_list = []
        label_array = []
        
        for label in labels:
            
            canonical_smiles = df[df['Label'] == label]['Smiles'].to_list()
            
            func = partial(sanitize_molecule,output_type)
            
            p = Pool(Number_of_workers)
            clean_smiles = list(tqdm.tqdm(p.imap(func, canonical_smiles), total=len(canonical_smiles),leave=False))
            clean_smiles_list.extend(clean_smiles)
            p.close() 
            
            label_array.extend(label * np.ones(len(clean_smiles),dtype=int))
            
        output_df = pd.DataFrame(columns=["Smiles","Label"]) 
        output_df["Smiles"] = clean_smiles_list
        output_df["Label"]  = label_array
    
        return  output_df

In [7]:
smiles_from_file = sanity_check(smiles_from_file,output_type = "canonical",Number_of_workers = 8)

                                                      

In [8]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AtomPairs import Pairs
from rdkit import DataStructs

In [9]:
def smiles_fingerprint(smiles,ftype,radius=None,bits=2048):
    m1 = Chem.MolFromSmiles(smiles)
    if ftype == "morgan":
        fp1 = AllChem.GetMorganFingerprintAsBitVect(m1,radius,nBits=bits)
        
    if ftype == "topological":
        fp1 = Chem.RDKFingerprint(m1)
        
    if ftype == "MACCS":
        fp1 = MACCSkeys.GenMACCSKeys(m1)
        
    if ftype == "atompairs":
        fp1 = Pairs.GetAtomPairFingerprint(m1)
        
    #bits = fp1.ToBitString()
    return (fp1)

In [10]:
smiles_web_fp = []
for entry in cid_smiles_web:
    smiles_web_fp.append(smiles_fingerprint(cid_smiles_web[entry],"topological"))

In [11]:
smiles_file_fp = []
for entry in smiles_from_file:
    smiles_file_fp.append(smiles_fingerprint(entry,"topological"))

In [12]:
count = 0
for i,fp1 in enumerate(smiles_web_fp):
    found = False
    for fp2 in smiles_file_fp:
        if (DataStructs.FingerprintSimilarity(fp1,fp2)) == 1:
            count += 1
            found = True
            break
    if not found:
        print (smiles_from_file[i],list(cid_smiles_web.values())[i],list(cid_smiles_web.keys())[i])

N#[N+]C=C([O-])CCC(N)C(=O)O C(CC(=O)C=[N+]=[N-])C(C(=O)O)N 164
N#[N+]C=C([O-])OCC(N)C(=O)O C(C(C(=O)O)N)OC(=O)C=[N+]=[N-] 830
N#[N+][O-] [N-]=[N+]=O 948
CCC1(CCC(C)C)C(=O)N=C([O-])NC1=O CCC1(C(=O)NC(=O)N=C1[O-])CCC(C)C 2163
O=C1CN(N=Cc2ccc(-c3ccc([N+](=O)[O-])cc3)o2)C([O-])=N1 C1C(=NC(=O)N1N=CC2=CC=C(O2)C3=CC=C(C=C3)[N+](=O)[O-])[O-] 2951
COC1C(OC(N)=O)C(O)C(Oc2ccc3c(=O)c(NC(=O)c4ccc(O)c(CC=C(C)C)c4)c([O-])oc3c2C)OC1(C)C CC1=C(C=CC2=C1OC(=O)C(=C2[O-])NC(=O)C3=CC(=C(C=C3)O)CC=C(C)C)OC4C(C(C(C(O4)(C)C)OC)OC(=O)N)O 4545
CCOC([O-])=C[N+]#N CCOC(=O)C=[N+]=[N-] 12192
CCN(CC)c1ccc2nc3c4ccccc4c(=[NH2+])cc-3oc2c1.[Cl-] CC[N+](=C1C=CC2=NC3=C(C=C(C4=CC=CC=C43)N)OC2=C1)CC.[Cl-] 16938


In [13]:
count

5215

In [105]:
import sqlite3
connect = sqlite3.connect('/mnt/external-images-pvc/quantmap/qm_chem_fix.sqlite')
d = connect.cursor()
d.execute("select cid,smiles from stitch_chem where cid = 830;")

<sqlite3.Cursor at 0x7f8505a53ea0>

In [106]:
print (d.fetchall()) 

[(830, 'C(C(C(=O)O)N)OC(=C[N+]#N)[O-]'), (830, 'C(C(C(=O)O)N)OC(=C[N+]#N)[O-]')]


In [127]:
#[N-]=[N+]=O
molecule = "[N-]=[N+]=O"
Chem.MolToSmiles(Chem.MolFromSmiles(molecule),canonical=True)

'[N-]=[N+]=O'

In [125]:
Chem.MolToSmiles(Chem.Kekulize(Chem.MolFromSmiles(molecule)),canonical=True)

ArgumentError: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)

In [22]:
input_file = "cid_cluster_5338.txt"
open_file = open(input_file,"r").readlines()

clusters = {}
for entry in open_file:
    clus = entry.split()[1]
    if clus in clusters:
        clusters[clus] += 1
    else:
        clusters[clus] = 1
print ((clusters))

{'8': 208, '3': 1466, '2': 306, '16': 171, '19': 501, '18': 268, '5': 718, '12': 36, '6': 130, '0': 292, '14': 67, '1': 325, '13': 182, '7': 142, '10': 49, '4': 167, '11': 130, '9': 21, '20': 30, '15': 4, '17': 10}


In [None]:
clusters = {}
for entry in open_file:
    clus = entry.split()[1]
    if clus in clusters:
        clusters[clus] += 1
    else:
        clusters[clus] = 1
print ((clusters))