# Reverse engineer molecules

#### Apr 2024 version

## Enumerate signatures and molecules from Morgan fingerprints

In [20]:
# CHANGE TO YOUR DIRECTORY PATH
# Absolute path of the local checkout. The notebook changes its working
# directory to it below, so the relative "./datasets/..." paths used by the
# following cells are resolved against this directory.
path_directory = "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature"

# packages
import numpy as np
import os

import pandas as pd
import sys
import time

# Print numpy arrays in full instead of eliding the middle with "...".
np.set_printoptions(threshold=sys.maxsize)

# Must run before the `signature` imports below -- presumably the local
# package is only importable from the checkout directory (TODO confirm).
os.chdir(path_directory)

from rdkit.Chem.Descriptors import ExactMolWt
from rdkit import Chem
from rdkit import RDLogger

# Turn off RDKit's "rdApp.*" log channels to keep the cell outputs readable.
RDLogger.DisableLog("rdApp.*")

# Reverse engineer molecules from Morgan
from signature.enumerate_signature import enumerate_molecule_from_signature, enumerate_signature_from_morgan
from signature.enumerate_utils import reduced_fingerprint, test_sol_ECFP, test_sol_ECFP_reduced, test_sol_sig
from signature.Signature import MoleculeSignature
from signature.signature_alphabet import (
    load_alphabet,
    morgan_vector_from_signature,
    SignatureAlphabet,
    signature_from_smiles,
)
from signature.utils import read_csv

In [21]:
# paths
# Input molecules. Exactly one `file_smiles` and one `file_alphabet` must be
# active; the commented lines are alternative datasets/alphabets.
# NOTE(review): the active dataset name says "radius_3" while the active
# alphabet is radius 2 -- the cells below only read the SMILES column and
# recompute signatures, but confirm the pairing is intended.
file_smiles = "./datasets/MetaNetX_weight_500_radius_3_size_1000"
#file_smiles = "./datasets/emolecule_weight_500_radius_2_size_1000"
#file_smiles = "./datasets/emolecule_weight_500"
#file_smiles = "./datasets/dataset.test.small"

#file_alphabet = "./datasets/MetaNetX_weight_500_Alphabet_radius_2_nBits_2048_smarts"
#file_alphabet = "./datasets/MetaNetX_weight_500_Alphabet_radius_3_nBits_2048" # radius 3 smiles
#file_alphabet = "./datasets/MetaNetX_weight_500_r2_smiles_boundary_Alphabet_radius_2_nBits_2048"
#file_alphabet = "./datasets/MetaNetX_weight_500_r2_smiles_full_Alphabet_radius_2_nBits_2048"
#file_alphabet = "./datasets/MetaNetX_weight_500_Alphabet_radius_3_nBits_2048" # smiles r3
#file_alphabet = "./datasets/MetaNetX_weight_500_Alphabet_radius_2_boundary_nBits_2048" # smiles r2 boundary
file_alphabet = "./datasets/MetaNetX_weight_500_Alphabet_radius_2_nBits_2048_smarts_full" # smarts r2

# Load Smiles file
# H is the header row, D the data table (first column holds the SMILES).
H, D = read_csv(file_smiles)
print(H, D.shape[0])
# Unique SMILES. The set is sorted before conversion because string hashing is
# randomised per interpreter process, so a bare set would yield a different
# element order on every kernel restart.
Smiles = np.asarray(sorted(set(D[:, 0])))
print(f"Number of smiles: {len(Smiles)}")

# Get alphabet
Alphabet = load_alphabet(file_alphabet)
Alphabet.print_out()

['SMILES', 'SIG', 'SIG-NBIT'] 1000
Number of smiles: 1000
filename: ./datasets/MetaNetX_weight_500_Alphabet_radius_2_nBits_2048_smarts_full.npz
radius: 2
nBits: 2048
splitcomponent: False
isomericSmiles: False
formalCharge: True
atomMapping: False
kekuleSmiles: False
allHsExplicit: False
maxvalence: 4
alphabet length: 176426


In [22]:
# Override two flags on the loaded alphabet object. The enumeration cells below
# forward them to MoleculeSignature and enumerate_molecule_from_signature.
Alphabet.use_smarts = True
Alphabet.boundary_bonds = False

- ECFP reduced => Signature

In [23]:
# Per-molecule accumulators for the "ECFP reduced => Signature" run.
# Re-run this cell before the loop cell to start from empty columns;
# otherwise the loop keeps appending to the existing lists.
list_i, list_smi = [], []            # dataset row index, input SMILES
list_nsig, list_foundsig = [], []    # number of signatures, reference signature found?
list_ct, list_ct_solve = [], []      # wall-clock time of the call, solver time it reports
list_timeout = []                    # timeout flag returned by the enumeration

In [24]:
# Settings for this run: nBits is set to 2048 here (the signature => molecule
# cells set it to 0) and the partition search is capped at 1e5.
Alphabet.nBits = 2048
max_nbr_partition = int(1e5)

for i in range(5):
    smi = D[i, 0]
    print(i, "|", smi)

    # Reference signature computed directly from the molecule.
    # NOTE(review): unlike the later cells, `legacy=False` is not passed here --
    # confirm the default is what is wanted.
    mol = Chem.MolFromSmiles(smi)
    ms = MoleculeSignature(
        mol,
        radius=Alphabet.radius,
        neighbor=True,
        use_smarts=Alphabet.use_smarts,
        nbits=False,
        boundary_bonds=Alphabet.boundary_bonds,
        map_root=True,
    )
    sig = ms.as_deprecated_string(morgan=False, root=False, neighbors=True)

    # Input of the enumeration: the reduced fingerprint of the same SMILES.
    morgan = reduced_fingerprint(smi, radius=Alphabet.radius, useFeatures=False)

    # Timed section: enumeration plus the membership check.
    t0 = time.time()
    Ssig, bool_timeout, ct_solve = enumerate_signature_from_morgan(
        morgan,
        Alphabet,
        max_nbr_partition=max_nbr_partition,
        method="partitions",
        verbose=False,
    )
    foundsig = sig in Ssig
    ft = time.time() - t0

    print(f"{i} | {smi} | {len(Ssig)} | {int(foundsig)} | {ft:.4f} | {ct_solve:.4f}| {bool_timeout}")

    # Record one row per molecule in the notebook-level accumulators.
    for column, value in (
        (list_i, i),
        (list_smi, smi),
        (list_nsig, len(Ssig)),
        (list_foundsig, foundsig),
        (list_ct, ft),
        (list_ct_solve, ct_solve),
        (list_timeout, bool_timeout),
    ):
        column.append(value)

0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O | 1 | 1 | 2.1100 | 0.0040| False
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O | 1 | 1 | 1.9419 | 0.0020| False
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O | 1 | 1 | 2.4230 | 0.0020| False
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C | 1 | 1 | 3.8994 | 0.0039| False
4 | COC(=O)c1sc(=S)n(-c2ccccc2)c1N
4 | COC(=O)c1sc(=S)n(-c2ccccc2)c1N | 1 | 1 | 2.2700 | 0.0020| False


In [25]:
# Percentage of processed molecules whose reference signature was among the
# enumerated signatures (raises ZeroDivisionError if the loop has not run yet).
100 * sum(list_foundsig) / len(list_foundsig)

100.0

In [26]:
# Summary table of the "ECFP reduced => Signature" run, one row per molecule.
df = pd.DataFrame(
    dict(
        [
            ("ID", list_i),
            ("smi", list_smi),
            ("Nsig", list_nsig),
            ("FoundSig", list_foundsig),
            ("CT enum ecfp_red to sig", list_ct),
            ("CT solve by partitions", list_ct_solve),
            ("Partitions timeout", list_timeout),
        ]
    )
)

df

# Optional export (to_excel needs a recognised extension such as .xlsx):
# df.to_excel("RevSig_ecfp_to_sig.xlsx", index=False)

Unnamed: 0,ID,smi,Nsig,FoundSig,CT enum ecfp_red to sig,CT solve by partitions,Partitions timeout
0,0,CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O,1,True,2.109981,0.003982,False
1,1,OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O,1,True,1.941916,0.002001,False
2,2,O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O,1,True,2.423007,0.002003,False
3,3,CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)...,1,True,3.899387,0.003902,False
4,4,COC(=O)c1sc(=S)n(-c2ccccc2)c1N,1,True,2.270003,0.002006,False


- Signature => Smiles

In [8]:
# Per-molecule accumulators for the "Signature => Smiles" run.
# Re-run this cell before the loop cell to start from empty columns.
list_i, list_smi, list_wt = [], [], []   # dataset row index, input SMILES, exact molecular weight
list_nmol, list_foundmol = [], []        # number of molecules enumerated, input SMILES recovered?
list_ct_mol = []                         # wall-clock time of the enumeration
list_recursion_timeout = []              # timeout flag returned by the enumeration

In [9]:
# Settings for the signature => molecule enumeration. nBits is set to 0 for
# this step (the ECFP => signature cells set it to 2048).
Alphabet.nBits = 0
max_nbr_recursion = 1e5
max_nbr_solution = 1001
repeat = 10

print("ID | smi | weight | Nmol | FoundMol | CPU-time | Timeout")
for i in range(5):
    smi = D[i, 0]
    print(i, "|", smi)

    # Parse the SMILES once; the same molecule gives the exact weight and the
    # reference signature. The weight is read before MoleculeSignature sees the
    # molecule, as in the original ordering.
    mol = Chem.MolFromSmiles(smi)
    wt = ExactMolWt(mol)
    ms = MoleculeSignature(mol, radius=Alphabet.radius, neighbor=True, use_smarts=Alphabet.use_smarts, nbits=False, boundary_bonds=Alphabet.boundary_bonds, map_root=True, legacy=False)
    sig = ms.as_deprecated_string(morgan=False, root=False, neighbors=True)

    # Timed section: enumerate the molecules compatible with the signature.
    st = time.time()
    smol, recursion_timeout = enumerate_molecule_from_signature(
        sig,
        Alphabet,
        smi,
        use_smarts=Alphabet.use_smarts,
        boundary_bonds=Alphabet.boundary_bonds,
        max_nbr_recursion=max_nbr_recursion,
        max_nbr_solution=max_nbr_solution,
        repeat=repeat,
        verbose=False,
    )

    ft = time.time() - st
    # Success means the input SMILES string itself is among the results.
    foundmol = smi in smol
    if not foundmol:
        # Show what was enumerated instead, to help diagnose the miss.
        print(smol)

    print(f"{i} | {smi} | {wt:.1f} | {len(smol)} | {int(foundmol)} | {ft:.4f} | {recursion_timeout}")

    # Record one row per molecule in the notebook-level accumulators.
    list_i.append(i)
    list_smi.append(smi)
    list_wt.append(wt)
    list_nmol.append(len(smol))
    list_foundmol.append(int(foundmol))
    list_ct_mol.append(ft)
    list_recursion_timeout.append(recursion_timeout)

ID | smi | weigth | Nmol | FoundMol | CPU-time | Timeout
0 | CC1OC(n2cnc3c(N)ncnc32)C(O)C1O
0 | CC1OC(n2cnc3c(N)ncnc32)C(O)C1O | 251.1 | 1 | 1 | 0.0420 | False
1 | CC(SC(=O)C1=CC=CS1=O)C(=O)NCC(=O)[O-]
[]
1 | CC(SC(=O)C1=CC=CS1=O)C(=O)NCC(=O)[O-] | 288.0 | 0 | 0 | 0.0239 | False
2 | CCCCCCCC(=O)OCC(COP(=O)([O-])[O-])OC(=O)CCCCC
[]
2 | CCCCCCCC(=O)OCC(COP(=O)([O-])[O-])OC(=O)CCCCC | 394.2 | 0 | 0 | 10.6582 | True
3 | O=CC(O)CC(O)(OC(=O)C(O)C=O)C(O)OCC1OC(O)C(O)C(O)C1O
3 | O=CC(O)CC(O)(OC(=O)C(O)C=O)C(O)OCC1OC(O)C(O)C(O)C1O | 414.1 | 1 | 1 | 0.0740 | False
4 | COc1c(O)c(CCC(=O)O)cc2c1OCO2
4 | COc1c(O)c(CCC(=O)O)cc2c1OCO2 | 240.1 | 1 | 1 | 0.0290 | False


In [10]:
# Percentage of processed molecules whose input SMILES was among the molecules
# enumerated from its signature (raises ZeroDivisionError if the loop has not run).
100 * sum(list_foundmol) / len(list_foundmol)

60.0

In [11]:
# Summary table of the "Signature => Smiles" run, one row per molecule.
df = pd.DataFrame(
    dict(
        [
            ("ID", list_i),
            ("smi", list_smi),
            ("weight", list_wt),
            ("Nmol", list_nmol),
            ("FoundMol", list_foundmol),
            ("CT enum sig to mol", list_ct_mol),
            ("Recursion timeout", list_recursion_timeout),
        ]
    )
)

df

# Optional export (to_excel needs a recognised extension such as .xlsx):
# df.to_excel("RevSig_sig_to_mol.xlsx", index=False)

Unnamed: 0,ID,smi,weight,Nmol,FoundMol,CT enum sig to mol,Recursion timeout
0,0,CC1OC(n2cnc3c(N)ncnc32)C(O)C1O,251.101839,1,1,0.041959,False
1,1,CC(SC(=O)C1=CC=CS1=O)C(=O)NCC(=O)[O-],288.000588,0,0,0.023919,False
2,2,CCCCCCCC(=O)OCC(COP(=O)([O-])[O-])OC(=O)CCCCC,394.176752,0,0,10.658238,True
3,3,O=CC(O)CC(O)(OC(=O)C(O)C=O)C(O)OCC1OC(O)C(O)C(...,414.100955,1,1,0.07399,False
4,4,COc1c(O)c(CCC(=O)O)cc2c1OCO2,240.063388,1,1,0.028996,False


- ECFP reduced => Signature => Smiles

In [66]:
# Per-molecule accumulators for the "ECFP reduced => Signature => Smiles" run.
# Re-run this cell before the loop cell to start from empty columns.
list_i, list_smi, list_wt = [], [], []                # dataset row index, input SMILES, exact weight
list_nsig, list_nsigtrue, list_foundsig = [], [], []  # signatures found, signatures yielding molecules, reference found?
list_nmol, list_foundmol = [], []                     # molecules kept after filtering, input SMILES recovered?
list_ct_sig, list_ct_mol, list_ct_all = [], [], []    # wall-clock time: step 1, step 2, both steps
list_ct_solve = []                                    # solver time reported by the signature enumeration
list_timeout_sig, list_timeout_mol = [], []           # timeout flags of step 1 and step 2

In [67]:
# Limits for the two enumeration steps.
max_nbr_partition = int(1e5)
max_nbr_recursion = 1e5
max_nbr_solution = 1001
repeat = 10

# Header matches the ten fields of the per-molecule row printed in the loop.
print("ID | smi | weight | Nsig | NsigTrue | FoundSig | Nmol | FoundMol | CPU-time | CT-solve")
for i in range(10):
    # preparation
    smi = D[i, 0]
    print(i, "|", smi)

    # Parse the SMILES once; the same molecule gives the exact weight and the
    # reference signature. The weight is read before MoleculeSignature sees the
    # molecule, as in the original ordering.
    mol = Chem.MolFromSmiles(smi)
    wt = ExactMolWt(mol)
    ms = MoleculeSignature(mol, radius=Alphabet.radius, neighbor=True, use_smarts=Alphabet.use_smarts, nbits=False, boundary_bonds=Alphabet.boundary_bonds, map_root=True, legacy=False)
    sig = ms.as_deprecated_string(morgan=False, root=False, neighbors=True)
    morgan = reduced_fingerprint(smi, radius=Alphabet.radius, useFeatures=False)

    # ecfp_red => signature(s)
    st_1 = time.time()
    Alphabet.nBits = 2048
    Ssig, partition_timeout, ct_solve = enumerate_signature_from_morgan(
        morgan, Alphabet, max_nbr_partition=max_nbr_partition, method="partitions", verbose=False
    )
    foundsig = sig in Ssig
    ct_sig = time.time() - st_1
    print(f"...enumeratesignature: nbr signatures {len(Ssig)} CPU-time {ct_sig:.4f}")

    # signature(s) => molecule(s)
    st_2 = time.time()
    Smol, Nsig = set(), 0
    Alphabet.nBits = 0
    # Timeout flags of this molecule's candidate signatures. Deliberately NOT
    # named `list_recursion_timeout`: that name is the accumulator of the
    # "Signature => Smiles" section and is read by its DataFrame cell.
    sig_timeouts = []
    for j, candidate_sig in enumerate(Ssig):
        smol, recursion_timeout = enumerate_molecule_from_signature(
            candidate_sig,
            Alphabet,
            smi,
            use_smarts=Alphabet.use_smarts,
            boundary_bonds=Alphabet.boundary_bonds,
            max_nbr_recursion=max_nbr_recursion,
            max_nbr_solution=max_nbr_solution,
            repeat=repeat,
            verbose=False,
        )
        sig_timeouts.append(recursion_timeout)
        if len(smol):
            # Nsig counts the signatures that produced at least one molecule.
            Nsig += 1
            print(f"...enumeratemolecule:  signature {j} nbr molecules {len(smol)}")
        Smol.update(smol)
    # The molecule step is flagged as timed out if any signature timed out.
    recursion_timeout = any(sig_timeouts)
    ct_mol = time.time() - st_2

    ct_total = time.time() - st_1

    # Keep only the candidates accepted by test_sol_ECFP_reduced when paired
    # with the input SMILES (nBits is restored to 2048 for that check).
    Alphabet.nBits = 2048
    Smolfinal = [smi2 for smi2 in Smol if test_sol_ECFP_reduced([smi, smi2], Alphabet=Alphabet)]
    if len(Smol) != len(Smolfinal):
        print("Smol", len(Smol), "Smolfinal", len(Smolfinal))

    foundmol = smi in Smolfinal
    if not foundmol:
        # Show what was kept instead, to help diagnose the miss.
        print(Smolfinal)

    print(
        f"{i} | {smi} | {wt:.1f} | {len(Ssig)} | {Nsig} | {int(foundsig)} | {len(Smolfinal)} | {int(foundmol)} | {ct_total:.4f} | {ct_solve:.4f}"
    )

    # Record one row per molecule in the notebook-level accumulators.
    list_i.append(i)
    list_smi.append(smi)
    list_wt.append(wt)
    list_nsig.append(len(Ssig))
    list_nsigtrue.append(Nsig)
    list_foundsig.append(int(foundsig))
    list_nmol.append(len(Smolfinal))
    list_foundmol.append(int(foundmol))
    list_ct_sig.append(ct_sig)
    list_ct_mol.append(ct_mol)
    list_ct_all.append(ct_total)
    list_ct_solve.append(ct_solve)
    list_timeout_sig.append(partition_timeout)
    list_timeout_mol.append(recursion_timeout)

ID | smi | weigth | Nsig | NsigTrue | FoundSig | Nmol | FoundMol | CPU-time
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O
...enumeratesignature: nbr signatures 1 CPU-time 2.1899
...enumeratemolecule:  signature 0 nbr molecules 1
0 | CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O | 286.1 | 1 | 1 | 1 | 1 | 1 | 2.2169 | 0.0030
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O
...enumeratesignature: nbr signatures 1 CPU-time 1.7520
...enumeratemolecule:  signature 0 nbr molecules 1
1 | OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O | 346.0 | 1 | 1 | 1 | 1 | 1 | 1.7920 | 0.0010
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O
...enumeratesignature: nbr signatures 1 CPU-time 2.1130
...enumeratemolecule:  signature 0 nbr molecules 1
2 | O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O | 322.1 | 1 | 1 | 1 | 1 | 1 | 2.2930 | 0.0020
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C
...enumeratesignature: nbr signatures 1 CPU-time 2.7270
[]
3 | CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)(O)C(O)C1C | 420.3 | 1 | 0 | 1 | 

In [68]:
# Percentage of processed molecules whose input SMILES was among the molecules
# kept at the end of the ECFP => signature => molecule pipeline
# (raises ZeroDivisionError if the loop has not run).
100 * sum(list_foundmol) / len(list_foundmol)

90.0

In [69]:
# Summary table of the "ECFP reduced => Signature => Smiles" run, one row per molecule.
df = pd.DataFrame(
    dict(
        [
            ("ID", list_i),
            ("smi", list_smi),
            ("wt", list_wt),
            ("Nsig", list_nsig),
            ("NsigTrue", list_nsigtrue),
            ("FoundSig", list_foundsig),
            ("Nmol", list_nmol),
            ("Foundmol", list_foundmol),
            ("CT ecfp_sig", list_ct_sig),
            ("CT sig_mol", list_ct_mol),
            ("CT ecfp_mol", list_ct_all),
            ("CT solve_partitions", list_ct_solve),
            ("Timeout ecfp_sig", list_timeout_sig),
            ("Timeout sig_mol", list_timeout_mol),
        ]
    )
)

df

# Optional export of the table:
# df.to_excel("RevSig_ecfp_to_mol_smiles_r2_boundary_500pe.xlsx", index=False)

Unnamed: 0,ID,smi,wt,Nsig,NsigTrue,FoundSig,Nmol,Foundmol,CT ecfp_sig,CT sig_mol,CT ecfp_mol,CT solve_partitions,Timeout ecfp_sig,Timeout sig_mol
0,0,CCN(C(C)O)C(O)C=NC=C(C)C(=O)C(=O)C(=O)O,286.116486,1,1,1,1,1,2.189906,0.027015,2.216921,0.003001,False,False
1,1,OC1C(O)C2CC1C1C3(Cl)CC(Cl)(C(Cl)C3Cl)C21O,345.969705,1,1,1,1,1,1.752001,0.039999,1.792,0.001007,False,False
2,2,O=C(O)CCc1nc2ccccc2n(CCc2ccccc2)c1=O,322.131742,1,1,1,1,1,2.113007,0.179078,2.293001,0.002006,False,True
3,3,CCC1OC(=O)C(C)C(O)C(C)C(O)C(C)(O)CC(C)C(O)C(C)...,420.272318,1,0,1,0,0,2.726989,0.030926,2.757915,0.00397,False,True
4,4,COC(=O)c1sc(=S)n(-c2ccccc2)c1N,266.01837,1,1,1,1,1,1.785909,0.044097,1.830006,0.001,False,True
5,5,CCC(O)C(O)C(CCl)C(Cl)C(=O)C(=O)O,272.021829,1,1,1,1,1,1.647995,0.020956,1.668951,0.001982,False,False
6,6,CC1C(=O)c2ccc3c(c2C(c2ccc4c(c2)OCO4)C1C)OCO3,338.115424,1,1,1,2,1,3.303149,0.470059,3.773208,0.003023,False,True
7,7,COc1ccc(O)cc1-c1oc2c(OC)c(OC)cc(O)c2c(=O)c1OC,374.100168,1,1,1,1,1,3.856939,0.093039,3.950993,0.002958,False,True
8,8,CCCCC=CCC(O)C(O)C(O)C(O)CCCC(=O)O,304.188589,1,1,1,1,1,2.351981,0.05399,2.405972,0.001954,False,False
9,9,Cc1c(CCC(=O)N2CCCC2)c(=O)n2c3ccccc3n(C)c2c1C#N,362.174276,1,1,1,1,1,3.134946,0.095054,3.23,0.003035,False,True
