# Reverse engineer molecules

#### Apr 2024 version

## Enumerate signatures and molecules from Morgan fingerprints

In [5]:
# packages
import numpy
import os
import pandas as pd
import sys
import time
# Print numpy arrays in full instead of summarising them with "...".
numpy.set_printoptions(threshold=sys.maxsize)

# Root of the local clone of the signature repository. The working directory is
# moved there so the `src.*` imports and the relative './datasets/...' paths used
# by the notebook can be found.
# Set the SIGNATURE_REPO_DIR environment variable to override the location without
# editing the notebook, or CHANGE THE FALLBACK BELOW TO YOUR DIRECTORY PATH.
path_directory = os.environ.get(
    "SIGNATURE_REPO_DIR",
    "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature",
)
os.chdir(path_directory)

from rdkit.Chem.Descriptors import ExactMolWt

# Reverse engineer molecules from Morgan
from src.imports import *
from src.utils import read_csv
from src.enumerate_utils import ReducedFingerprint
from src.enumerate_signature import EnumerateMoleculeFromSignature, EnumerateSignatureFromMorgan, SignatureNeighbor
from src.signature_alphabet import SignatureAlphabet, SignatureFromSmiles, LoadAlphabet, MorganVectorFromSignature

In [6]:
# paths (resolved against the current working directory)
file_smiles = './datasets/MetaNetX_weight_500_radius_3_size_1000'
file_alphabet = './datasets/MetaNetX_weight_500_Alphabet_radius_3_nBits_2048'

# Load Smiles file: H holds the column names, D the data rows (column 0 = SMILES)
H, D = read_csv(file_smiles)
print(H, D.shape[0])
# De-duplicate while keeping first-seen order. Going through set() gave an
# ordering that depends on string hashing and therefore changes between kernel
# sessions; dict.fromkeys() yields the same unique values in a reproducible order.
Smiles = np.asarray(list(dict.fromkeys(D[:, 0])))
print(f'Number of smiles: {len(Smiles)}')

# Get alphabet
Alphabet = LoadAlphabet(file_alphabet)
Alphabet.printout()

['SMILES', 'SIG', 'SIG-NBIT'] 1000
Number of smiles: 1000
filename: ./datasets/MetaNetX_weight_500_Alphabet_radius_3_nBits_2048.npz
radius: 3
nBits: 2048
splitcomponent: False
isomericSmiles: False
formalCharge: True
atomMapping: False
kekuleSmiles: False
allHsExplicit: False
maxvalence: 4
alphabet length: 551908


- ECFP reduced => Signature

In [7]:
# Result columns for the "ECFP reduced => Signature" experiment.
# Seven independent empty lists, filled by the enumeration loop.
(
    list_i,
    list_smi,
    list_nsig,
    list_foundsig,
    list_ct,
    list_ct_solve,
    list_timeout,
) = ([] for _ in range(7))

In [8]:
# ECFP reduced => signature(s).
# For each of the first molecules of D: build its reference neighbor signature and
# reduced Morgan fingerprint, call EnumerateSignatureFromMorgan on the fingerprint,
# and record whether the reference signature is among the returned ones.
Alphabet.nBits = 2048
max_nbr_partition = int(1e5)
n_molecules = 5  # number of rows of D to process

# Reset the accumulators here so that re-running this cell does not append
# duplicate rows to the lists used to build the summary DataFrame.
(
    list_i,
    list_smi,
    list_nsig,
    list_foundsig,
    list_ct,
    list_ct_solve,
    list_timeout,
) = ([] for _ in range(7))

# min() guards against a dataset with fewer rows than requested.
for i in range(min(n_molecules, D.shape[0])):
    smi = D[i, 0]
    print(i, "|", smi)
    # SignatureFromSmiles also returns a SMILES string, which replaces `smi`
    # for the rest of the iteration.
    sig, mol, smi = SignatureFromSmiles(smi, Alphabet, neighbor=True, verbose=False)
    sign = SignatureNeighbor(sig)
    morgan = ReducedFingerprint(smi, radius=Alphabet.radius, useFeatures=False)

    # perf_counter() is monotonic and high-resolution, which makes it the
    # appropriate clock for elapsed-time measurements (time.time() can jump
    # and is coarse on some platforms).
    st = time.perf_counter()
    Ssig, bool_timeout, ct_solve = EnumerateSignatureFromMorgan(
        morgan, Alphabet, max_nbr_partition=max_nbr_partition, verbose=False
    )
    foundsig = sign in Ssig
    ft = time.perf_counter() - st

    print(f'{i} | {smi} | {len(Ssig)} | {int(foundsig)} | {ft:.4f} | {ct_solve:.4f}| {bool_timeout}')

    list_i.append(i)
    list_smi.append(smi)
    list_nsig.append(len(Ssig))
    list_foundsig.append(foundsig)
    list_ct.append(ft)
    list_ct_solve.append(ct_solve)
    list_timeout.append(bool_timeout)

0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O | 1 | 1 | 8.2098 | 0.0030| False
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O | 1 | 1 | 7.7199 | 0.0020| False
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O | 1 | 1 | 13.0907 | 0.0030| False
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C | 1 | 1 | 21.3744 | 0.0060| False
4 | COC(=O)c1sc(=S)n(-c2ccccc2)c1N
4 | COC(=O)c1sc(=S)n(-c2ccccc2)c1N | 1 | 1 | 7.8169 | 0.0010| False


In [9]:
# Summary table of the "ECFP reduced => Signature" run: one row per molecule,
# one column per accumulator list (column order as listed below).
column_names = (
    "ID",
    "smi",
    "Nsig",
    "FoundSig",
    "CPU-time enum ecfp_red to sig",
    "CPU_time solve by partitions",
    "Partitions timeout",
)
column_values = (
    list_i,
    list_smi,
    list_nsig,
    list_foundsig,
    list_ct,
    list_ct_solve,
    list_timeout,
)
df = pd.DataFrame(data=dict(zip(column_names, column_values)))

df

# Optional export (disabled):
#df.to_excel("RevSig1_7_ecfp_to_sig", index=False)

Unnamed: 0,ID,smi,Nsig,FoundSig,CPU-time enum ecfp_red to sig,CPU_time solve by partitions,Partitions timeout
0,0,CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O,1,True,8.209765,0.003,False
1,1,OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O,1,True,7.719864,0.002001,False
2,2,O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O,1,True,13.090723,0.003019,False
3,3,CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)...,1,True,21.374355,0.006002,False
4,4,COC(=O)c1sc(=S)n(-c2ccccc2)c1N,1,True,7.816876,0.001007,False


- Signature => Smiles

In [10]:
# Result columns for the "Signature => Smiles" experiment.
# Each name is bound to its own fresh empty list.
accumulators = [[] for _ in range(7)]
list_i, list_smi, list_wt, list_nmol = accumulators[:4]
list_foundmol, list_ct_mol, list_recursion_timeout = accumulators[4:]
del accumulators  # keep the namespace identical to plain `name = []` assignments

In [11]:
# Signature => molecule(s).
# For each of the first molecules of D: build its reference neighbor signature,
# call EnumerateMoleculeFromSignature on it, and record whether the reference
# SMILES is among the returned molecules.
# NOTE(review): nBits is set to 0 here while the ECFP => signature step uses 2048;
# presumably the molecule enumerator expects this setting -- confirm in src.
Alphabet.nBits = 0
max_nbr_recursion = 1e5
max_nbr_solution = 1001
repeat = 10
n_molecules = 5  # number of rows of D to process

# Reset the accumulators here so that re-running this cell does not append
# duplicate rows to the lists used to build the summary DataFrame.
(
    list_i,
    list_smi,
    list_wt,
    list_nmol,
    list_foundmol,
    list_ct_mol,
    list_recursion_timeout,
) = ([] for _ in range(7))

print('ID | smi | weigth | Nmol | FoundMol | CPU-time | Timeout')
# min() guards against a dataset with fewer rows than requested.
for i in range(min(n_molecules, D.shape[0])):
    smi = D[i, 0]
    print(i, "|", smi)
    wt = ExactMolWt(Chem.MolFromSmiles(smi))
    # SignatureFromSmiles also returns a SMILES string, which replaces `smi`
    # for the rest of the iteration (it is the one searched for in `smol`).
    sig, mol, smi = SignatureFromSmiles(smi, Alphabet, neighbor=True)
    sign = SignatureNeighbor(sig)

    # perf_counter() is monotonic and high-resolution, which makes it the
    # appropriate clock for elapsed-time measurements.
    st = time.perf_counter()
    smol, recursion_timeout = EnumerateMoleculeFromSignature(
        sign,
        Alphabet,
        smi,
        max_nbr_recursion=max_nbr_recursion,
        max_nbr_solution=max_nbr_solution,
        repeat=repeat,
        verbose=False,
    )
    ft = time.perf_counter() - st

    foundmol = smi in smol
    print(f'{i} | {smi} | {wt:.1f} | {len(smol)} | {int(foundmol)} | {ft:.4f} | {recursion_timeout}')
    if not foundmol:
        # Show what was enumerated when the reference molecule is missing.
        print(smol)
    list_i.append(i)
    list_smi.append(smi)
    list_wt.append(wt)
    list_nmol.append(len(smol))
    list_foundmol.append(int(foundmol))
    list_ct_mol.append(ft)
    list_recursion_timeout.append(recursion_timeout)

ID | smi | weigth | Nmol | FoundMol | CPU-time | Timeout
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O | 286.1 | 1 | 1 | 0.0241 | False
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O | 346.0 | 1 | 1 | 0.0310 | False
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O | 322.1 | 1 | 1 | 0.2470 | False
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C | 420.3 | 2 | 1 | 0.1340 | False
4 | COC(=O)c1sc(=S)n(-c2ccccc2)c1N
4 | COC(=O)c1sc(=S)n(-c2ccccc2)c1N | 266.0 | 1 | 1 | 0.0300 | False


In [12]:
# Summary table of the "Signature => Smiles" run: one row per molecule.
# Built from (column name, accumulator list) pairs; order defines column order.
sig_to_mol_columns = [
    ("ID", list_i),
    ("smi", list_smi),
    ("weight", list_wt),
    ("Nmol", list_nmol),
    ("FoundMol", list_foundmol),
    ("CPU-time enum sig to mol", list_ct_mol),
    ("Recursion timeout", list_recursion_timeout),
]
df = pd.DataFrame(dict(sig_to_mol_columns))

df

# Optional export (disabled):
#df.to_excel("RevSig_rad3_sig_to_mol", index=False)

Unnamed: 0,ID,smi,weight,Nmol,FoundMol,CPU-time enum sig to mol,Recursion timeout
0,0,CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O,286.116486,1,1,0.024077,False
1,1,OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O,345.969705,1,1,0.030992,False
2,2,O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O,322.131742,1,1,0.247005,False
3,3,CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)...,420.272318,2,1,0.134001,False
4,4,COC(=O)c1sc(=S)n(-c2ccccc2)c1N,266.01837,1,1,0.030002,False


- ECFP reduced => Signature => Smiles

In [13]:
# Result columns for the "ECFP reduced => Signature => Smiles" experiment.
# Eleven independent empty lists, filled by the full-pipeline loop.
(
    list_i,
    list_smi,
    list_wt,
    list_nsig,
    list_nsigtrue,
    list_foundsig,
    list_nmol,
    list_foundmol,
    list_ct_sig,
    list_ct_mol,
    list_ct_all,
) = (list() for _ in range(11))

In [14]:
# ECFP reduced => signature(s) => molecule(s).
# For each of the first molecules of D: enumerate signatures from its reduced
# Morgan fingerprint, then enumerate molecules from every returned signature, and
# record whether the reference signature / reference SMILES were recovered.
max_nbr_partition = int(1e5)
max_nbr_recursion = 1e5
max_nbr_solution = 1001
repeat = 10
n_molecules = 5  # number of rows of D to process

# Reset the accumulators here so that re-running this cell does not append
# duplicate rows to the lists used to build the summary DataFrame.
(
    list_i,
    list_smi,
    list_wt,
    list_nsig,
    list_nsigtrue,
    list_foundsig,
    list_nmol,
    list_foundmol,
    list_ct_sig,
    list_ct_mol,
    list_ct_all,
) = ([] for _ in range(11))

print('ID | smi | weigth | Nsig | NsigTrue | FoundSig | Nmol | FoundMol | CPU-time')
# min() guards against a dataset with fewer rows than requested.
for i in range(min(n_molecules, D.shape[0])):
    # preparation
    smi = D[i, 0]
    print(i, "|", smi)
    wt = ExactMolWt(Chem.MolFromSmiles(smi))
    # SignatureFromSmiles also returns a SMILES string, which replaces `smi`
    # for the rest of the iteration.
    sig, mol, smi = SignatureFromSmiles(smi, Alphabet, neighbor=True, verbose=False)
    sign = SignatureNeighbor(sig)
    morgan = ReducedFingerprint(smi, radius=Alphabet.radius, useFeatures=False)

    # ecfp_red => signature(s)
    # perf_counter() is monotonic and high-resolution, which makes it the
    # appropriate clock for elapsed-time measurements.
    st_1 = time.perf_counter()
    # NOTE(review): nBits is switched to 2048 for this step and to 0 for the next;
    # presumably each enumerator expects its own setting -- confirm in src.
    Alphabet.nBits = 2048
    Ssig, bool_timeout, ct_solve = EnumerateSignatureFromMorgan(
        morgan, Alphabet, max_nbr_partition=max_nbr_partition, verbose=False
    )
    foundsig = sign in Ssig
    ct_sig = time.perf_counter() - st_1
    print(f'...enumeratesignature: nbr signatures {len(Ssig)} CPU-time {ct_sig}')

    # signature(s) => molecule(s)
    st_2 = time.perf_counter()
    # Smol: union of molecules over all signatures;
    # Nsig: number of signatures that produced at least one molecule.
    Smol, Nsig = set(), 0
    Alphabet.nBits = 0
    for j, candidate_sig in enumerate(Ssig):
        smol, recursion_timeout = EnumerateMoleculeFromSignature(
            candidate_sig,
            Alphabet,
            smi,
            max_nbr_recursion=max_nbr_recursion,
            max_nbr_solution=max_nbr_solution,
            repeat=repeat,
            verbose=False,
        )
        if len(smol):
            Nsig += 1
            print(f'...enumeratemolecule:  signature {j} nbr molecules {len(smol)}')
        # In-place union avoids rebuilding the whole set at every signature.
        Smol.update(smol)
    ct_mol = time.perf_counter() - st_2
    foundmol = smi in Smol

    # Total covers both stages (measured from the start of the signature step).
    ct_total = time.perf_counter() - st_1
    print(f'{i} | {smi} | {wt:.1f} | {len(Ssig)} | {Nsig} | {int(foundsig)} | {len(Smol)} | {int(foundmol)} | {ct_total:.4f}')
    list_i.append(i)
    list_smi.append(smi)
    list_wt.append(wt)
    list_nsig.append(len(Ssig))
    list_nsigtrue.append(Nsig)
    list_foundsig.append(int(foundsig))
    list_nmol.append(len(Smol))
    list_foundmol.append(int(foundmol))
    list_ct_sig.append(ct_sig)
    list_ct_mol.append(ct_mol)
    list_ct_all.append(ct_total)
    if not foundmol:
        # Show what was enumerated when the reference molecule is missing.
        print(Smol)

ID | smi | weigth | Nsig | NsigTrue | FoundSig | Nmol | FoundMol | CPU-time
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O
...enumeratesignature: nbr signatures 1 CPU-time 8.391731262207031
...enumeratemolecule:  signature 0 nbr molecules 1
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O | 286.1 | 1 | 1 | 1 | 1 | 1 | 8.4147
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O
...enumeratesignature: nbr signatures 1 CPU-time 8.085188627243042
...enumeratemolecule:  signature 0 nbr molecules 1
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O | 346.0 | 1 | 1 | 1 | 1 | 1 | 8.1213
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O
...enumeratesignature: nbr signatures 1 CPU-time 14.70762038230896
...enumeratemolecule:  signature 0 nbr molecules 1
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O | 322.1 | 1 | 1 | 1 | 1 | 1 | 14.9087
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C
...enumeratesignature: nbr signatures 1 CPU-time 22.336121082305908
...enumeratemolecule:  signature 0 nbr molecules 2
3 | CCC1OC(=O)C(

In [15]:
# Summary table of the full "ECFP reduced => Signature => Smiles" run:
# one row per molecule, columns assembled one at a time in display order.
ecfp_to_mol_columns = {}
ecfp_to_mol_columns["ID"] = list_i
ecfp_to_mol_columns["smi"] = list_smi
ecfp_to_mol_columns["wt"] = list_wt
ecfp_to_mol_columns["Nsig"] = list_nsig
ecfp_to_mol_columns["NsigTrue"] = list_nsigtrue
ecfp_to_mol_columns["FoundSig"] = list_foundsig
ecfp_to_mol_columns["Nmol"] = list_nmol
ecfp_to_mol_columns["Foundmol"] = list_foundmol
ecfp_to_mol_columns["CPU-time ecfp_sig"] = list_ct_sig
ecfp_to_mol_columns["CPU_time sig_mol"] = list_ct_mol
ecfp_to_mol_columns["CPU_time ecfp_mol"] = list_ct_all
df = pd.DataFrame(ecfp_to_mol_columns)

df

# Optional export (disabled):
#df.to_excel("RevSig_rad3_ecfp_to_mol", index=False)

Unnamed: 0,ID,smi,wt,Nsig,NsigTrue,FoundSig,Nmol,Foundmol,CPU-time ecfp_sig,CPU_time sig_mol,CPU_time ecfp_mol
0,0,CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O,286.116486,1,1,1,1,1,8.391731,0.022947,8.414678
1,1,OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O,345.969705,1,1,1,1,1,8.085189,0.036087,8.121275
2,2,O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O,322.131742,1,1,1,1,1,14.70762,0.201082,14.908703
3,3,CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)...,420.272318,1,1,1,2,1,22.336121,0.139087,22.475209
4,4,COC(=O)c1sc(=S)n(-c2ccccc2)c1N,266.01837,1,1,1,1,1,7.823828,0.031001,7.854828
