# Reverse engineer molecules

## Enumerate signatures and molecules from Morgan fingerprints

In [None]:
# Notebook setup: working directory, third-party imports and global helpers.
#
# CHANGE TO YOUR DIRECTORY PATH: absolute path of the local project checkout.
# It is machine-specific and must be edited before running the notebook.
path_directory = "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature"

# Standard-library and scientific packages.
import numpy as np
import os
import copy

import pandas as pd
import sys
import time

import importlib

# Print numpy arrays in full instead of summarizing long ones with "...".
np.set_printoptions(threshold=sys.maxsize)

# Make the checkout the working directory so that the relative "./datasets/..."
# paths used later in the notebook resolve.
# NOTE(review): presumably this is also what lets `import signature....` find
# the project package — confirm against how the package is installed.
os.chdir(path_directory)

from rdkit.Chem.Descriptors import ExactMolWt
from rdkit import Chem
from rdkit import RDLogger

# Turn off RDKit log output for the "rdApp.*" loggers.
RDLogger.DisableLog("rdApp.*")

from rdkit.Chem import AllChem
# Shared Morgan fingerprint generator (radius 2, fingerprint size 2048) used by
# the benchmark cells to compute count fingerprints.
fpgen = AllChem.GetMorganGenerator(radius=2, fpSize=2048)

In [None]:
# Import the project modules, force a reload, then bind the names used by the
# rest of the notebook.
import importlib

# Imported as module objects so that importlib.reload() can be applied to them.
import signature.enumerate_signature as enumerate_signature
import signature.enumerate_utils as enumerate_utils
import signature.Signature as Signature
import signature.signature_alphabet as signature_alphabet
import signature.utils as utils

# Re-execute each project module so that source edits made since the first
# import are picked up without restarting the kernel.
# NOTE(review): reload() only re-executes the module it is given. If these
# modules import names from one another, the reload order may matter — confirm.
importlib.reload(enumerate_signature)
importlib.reload(enumerate_utils)
importlib.reload(Signature)
importlib.reload(signature_alphabet)
importlib.reload(utils)
# Bind the functions and classes from the modules reloaded above.
from signature.enumerate_signature import enumerate_molecule_from_signature, enumerate_signature_from_morgan
from signature.enumerate_utils import reduced_fingerprint, test_sol_ECFP, test_sol_ECFP_reduced, test_sol_sig
from signature.signature_alphabet import (
    load_alphabet,
    morgan_vector_from_signature,
    sanitize_molecule,
    SignatureAlphabet,
    signature_from_smiles,
)
from signature.utils import mol_from_smiles, read_csv, read_tsv
from signature.Signature import MoleculeSignature

In [None]:
# Input files, relative to the working directory.
file_smiles = "./datasets/metanetx_sample_MAXMW_500_SIZE_1k"  # test dataset
file_alphabet = "./datasets/MetaNetX_weight_500_new_Alphabet_radius_2_nBits_2048_smarts_full_newv_full"  # alphabet

# Read the dataset with the tab-separated reader; read_csv(file_smiles) is the
# alternative reader that was used previously.
H, D = read_tsv(file_smiles)
print(H, D.shape[0])

# Distinct values of column 0.
unique_first_column = set(D[:, 0])
Smiles = np.asarray(list(unique_first_column))
print(f"Number of smiles: {len(Smiles)}")

# Sanitize the SMILES stored in column 8 of the first len(Smiles) rows.
# NOTE(review): the row count comes from the number of *unique* column-0
# values while the rows are taken in file order, so trailing rows are skipped
# whenever column 0 contains duplicates — confirm this is intended.
list_smiles = []
for smi in D[: len(Smiles), 8]:
    mol = mol_from_smiles(smi, keep_stereo=True)
    _, smi_san = sanitize_molecule(mol, formalCharge=True)
    list_smiles.append(smi_san)

# Load the signature alphabet and print its description.
Alphabet = load_alphabet(file_alphabet)
Alphabet.print_out()

- To do: remove signatures that have no neighbors

- ECFP => Signature

In [None]:
# Result accumulators for the ECFP -> signature benchmark. Each name is bound
# to its own fresh, empty list; the benchmark loop appends one entry per
# tested molecule.
(
    list_i,  # molecule index
    list_smi,  # SMILES of the tested molecule
    list_nsig,  # number of enumerated signatures
    list_foundsig,  # whether the reference signature was enumerated
    list_ct,  # wall-clock time of the enumeration step
    list_ct_solve,  # solve time reported by the enumeration
    list_timeout,  # timeout flag reported by the enumeration
) = ([] for _ in range(7))

In [None]:
# Benchmark: Morgan count fingerprint (ECFP) -> candidate signatures.
#
# For each tested molecule the reference signature is computed directly from
# the molecule, the candidate signatures are enumerated from its Morgan count
# vector, and the benchmark records whether the reference is among them.
#
# NOTE(review): this cell sets Alphabet.nBits = 2048 before calling
# enumerate_signature_from_morgan, whereas the full-pipeline cell sets it to 0
# before the same call — confirm which value is intended.
Alphabet.nBits = 2048
max_nbr_partition = int(1e4)
n_molecules = 5  # number of dataset molecules to test (fewer if the dataset is shorter)

# Header row, matching the per-molecule summary line printed below.
print("ID | smi | Nsig | FoundSig | CPU-time | CT-solve | Timeout")
for i, smi in enumerate(list_smiles[:n_molecules]):
    print(i, "|", smi)

    # Reference signature and Morgan count vector of the molecule.
    mol = Chem.MolFromSmiles(smi)
    ms = MoleculeSignature(
        mol,
        radius=Alphabet.radius,
        use_smarts=Alphabet.use_smarts,
        nbits=0,
        boundary_bonds=Alphabet.boundary_bonds,
        map_root=True,
    )
    ms.post_compute_neighbors()
    sig = sorted(atom.to_string(neighbors=True) for atom in ms.atoms)
    morgan = fpgen.GetCountFingerprint(mol).ToList()

    # Timed section: enumeration plus the membership test of the reference.
    st = time.time()
    Ssig, bool_timeout, ct_solve = enumerate_signature_from_morgan(
        morgan, Alphabet, max_nbr_partition=max_nbr_partition, verbose=False
    )
    foundsig = sig in Ssig
    ft = time.time() - st

    print(f"{i} | {smi} | {len(Ssig)} | {int(foundsig)} | {ft:.4f} | {ct_solve:.4f} | {bool_timeout}")

    list_i.append(i)
    list_smi.append(smi)
    list_nsig.append(len(Ssig))
    list_foundsig.append(foundsig)
    list_ct.append(ft)
    list_ct_solve.append(ct_solve)
    list_timeout.append(bool_timeout)

In [None]:
# Percentage of tested molecules whose reference signature was among the
# enumerated candidates. Evaluates to NaN (instead of raising
# ZeroDivisionError) when no molecule has been tested yet.
n_tested = len(list_foundsig)
100 * sum(list_foundsig) / n_tested if n_tested else float("nan")

In [None]:
# Summary table of the ECFP -> signature benchmark, built from the result
# lists (one entry per tested molecule). Column order follows the list below.
columns_ecfp_to_sig = [
    ("ID", list_i),
    ("smi", list_smi),
    ("Nsig", list_nsig),
    ("FoundSig", list_foundsig),
    ("CT enum ecfp_red to sig", list_ct),
    ("CT solve by partitions", list_ct_solve),
    ("Partitions timeout", list_timeout),
]
df = pd.DataFrame(dict(columns_ecfp_to_sig))

# Last expression of the cell: the notebook displays the table.
df

# Optional export to Excel:
# df.to_excel("RevSig_ecfp_to_sig.xlsx", index=False)

- Signature => Smiles

In [None]:
# Result accumulators for the signature -> molecule benchmark. Each name is
# bound to its own fresh, empty list; the benchmark loop appends one entry per
# tested molecule.
(
    list_i,  # molecule index
    list_smi,  # SMILES of the tested molecule
    list_nmol,  # number of enumerated molecules
    list_foundmol,  # 1 if the input SMILES was enumerated, else 0
    list_ct_mol,  # wall-clock time of the enumeration
    list_recursion_timeout,  # timeout flag reported by the enumeration
) = ([] for _ in range(6))

In [None]:
# Benchmark: signature -> molecule(s).
#
# For each tested molecule the reference signature is computed from the
# molecule, the molecules are enumerated from that signature, and the
# benchmark records whether the input SMILES is among the results.
#
# NOTE(review): this cell sets Alphabet.nBits = 0 before calling
# enumerate_molecule_from_signature, whereas the full-pipeline cell sets it to
# 2048 before the same call — confirm which value is intended.
Alphabet.nBits = 0
max_nbr_recursion = 1e4
max_nbr_solution = 1001
repeat = 10
n_molecules = 5  # number of dataset molecules to test (fewer if the dataset is shorter)

# Header row; its columns match the per-molecule summary line printed below
# (no molecular weight is computed or printed in this benchmark).
print("ID | smi | Nmol | FoundMol | CPU-time | Timeout")
for i, smi in enumerate(list_smiles[:n_molecules]):
    print(i, "|", smi)

    # Reference signature of the molecule.
    mol = Chem.MolFromSmiles(smi)
    ms = MoleculeSignature(
        mol,
        radius=Alphabet.radius,
        use_smarts=Alphabet.use_smarts,
        nbits=0,
        boundary_bonds=Alphabet.boundary_bonds,
        map_root=True,
    )
    ms.post_compute_neighbors()
    sig = sorted(atom.to_string(neighbors=True) for atom in ms.atoms)

    # Timed section: molecule enumeration only.
    st = time.time()
    smol, recursion_timeout = enumerate_molecule_from_signature(
        sig,
        Alphabet,
        smi,
        max_nbr_recursion=max_nbr_recursion,
        max_nbr_solution=max_nbr_solution,
        repeat=repeat,
        verbose=False,
    )
    ft = time.time() - st

    foundmol = smi in smol
    if not foundmol:
        # Show what was enumerated to help diagnose the miss.
        print(smol)

    print(f"{i} | {smi} | {len(smol)} | {int(foundmol)} | {ft:.4f} | {recursion_timeout}")

    list_i.append(i)
    list_smi.append(smi)
    list_nmol.append(len(smol))
    list_foundmol.append(int(foundmol))
    list_ct_mol.append(ft)
    list_recursion_timeout.append(recursion_timeout)

In [None]:
# Percentage of tested molecules whose input SMILES was among the enumerated
# molecules. Evaluates to NaN (instead of raising ZeroDivisionError) when no
# molecule has been tested yet.
n_tested = len(list_foundmol)
100 * sum(list_foundmol) / n_tested if n_tested else float("nan")

In [None]:
# Summary table of the signature -> molecule benchmark, built from the result
# lists (one entry per tested molecule). Column order follows the list below.
columns_sig_to_mol = [
    ("ID", list_i),
    ("smi", list_smi),
    ("Nmol", list_nmol),
    ("FoundMol", list_foundmol),
    ("CT enum sig to mol", list_ct_mol),
    ("Recursion timeout", list_recursion_timeout),
]
df = pd.DataFrame(dict(columns_sig_to_mol))

# Last expression of the cell: the notebook displays the table.
df

# Optional export to Excel (the file name needs an extension such as .xlsx so
# that pandas can select a writer engine):
# df.to_excel("RevSig_sig_to_mol.xlsx", index=False)

- ECFP => Signature => Smiles

In [None]:
# Result accumulators for the full ECFP -> signature -> molecule benchmark.
# Each name is bound to its own fresh, empty list; the benchmark loop appends
# one entry per tested molecule.
(
    list_i,  # molecule index
    list_smi,  # SMILES of the tested molecule
    list_wt,  # exact molecular weight
    list_nsig,  # number of candidate signatures kept
    list_nsigtrue,  # number of signatures that yielded at least one molecule
    list_foundsig,  # 1 if the reference signature was enumerated, else 0
    list_nmol,  # number of molecules kept after the ECFP check
    list_foundmol,  # 1 if the input SMILES was recovered, else 0
    list_ct_sig,  # time of the ECFP -> signature step
    list_ct_mol,  # time of the signature -> molecule step
    list_ct_all,  # total time of both steps
    list_ct_solve,  # solve time reported by the signature enumeration
    list_timeout_sig,  # timeout flag of the ECFP -> signature step
    list_timeout_mol,  # timeout flag of the signature -> molecule step
) = ([] for _ in range(14))

In [None]:
# Benchmark: full pipeline ECFP -> signature(s) -> molecule(s).
#
# For each tested molecule:
#   1. enumerate candidate signatures from its Morgan count vector,
#   2. enumerate molecules from each retained candidate signature,
#   3. keep only the molecules accepted by test_sol_ECFP against the input,
# then record whether the reference signature and the input SMILES were
# recovered, together with the timings and timeout flags.
max_nbr_partition = int(1e4)
max_nbr_recursion = 1e4
max_nbr_solution = 1001
repeat = 10
n_molecules = 5  # number of dataset molecules to test (fewer if the dataset is shorter)
max_nbr_signature = 20  # only the first candidate signatures are expanded into molecules

# Header row; the last column is the solve time reported by the signature
# enumeration, which the summary line prints after the total CPU time.
print("ID | smi | weight | Nsig | NsigTrue | FoundSig | Nmol | FoundMol | CPU-time | CT-solve")
for i, smi in enumerate(list_smiles[:n_molecules]):
    # preparation
    print(i, "|", smi)
    mol = Chem.MolFromSmiles(smi)
    wt = ExactMolWt(mol)

    # Reference signature and Morgan count vector of the molecule.
    ms = MoleculeSignature(
        mol,
        radius=Alphabet.radius,
        use_smarts=Alphabet.use_smarts,
        nbits=0,
        boundary_bonds=Alphabet.boundary_bonds,
        map_root=True,
    )
    ms.post_compute_neighbors()
    sig = sorted(atom.to_string(neighbors=True) for atom in ms.atoms)
    morgan = fpgen.GetCountFingerprint(mol).ToList()

    # ecfp => signature(s)
    # NOTE(review): the stand-alone ECFP -> signature cell uses nBits = 2048
    # for this call — confirm which value is intended.
    st_1 = time.time()
    Alphabet.nBits = 0
    Ssig, partition_timeout, ct_solve = enumerate_signature_from_morgan(
        morgan, Alphabet, max_nbr_partition=max_nbr_partition, verbose=False
    )
    # Truncating the candidate list is reported as a timeout of this step.
    if len(Ssig) > max_nbr_signature:
        partition_timeout = True
    Ssig = Ssig[:max_nbr_signature]

    foundsig = sig in Ssig
    ct_sig = time.time() - st_1
    print(f"      ...enumeratesignature: nbr signature(s) {len(Ssig)} CPU-time {ct_sig:.4f}")

    # signature(s) => molecule(s)
    # NOTE(review): the stand-alone signature -> molecule cell uses nBits = 0
    # for this call — confirm which value is intended.
    st_2 = time.time()
    Smol, Nsig = set(), 0
    Alphabet.nBits = 2048
    # Per-signature timeout flags. Kept under a name of its own so that the
    # `list_recursion_timeout` results of the signature -> molecule benchmark
    # are not overwritten by this cell.
    sig_recursion_timeouts = []
    for j, candidate_sig in enumerate(Ssig):
        smol, recursion_timeout = enumerate_molecule_from_signature(
            candidate_sig,
            Alphabet,
            smi,
            max_nbr_recursion=max_nbr_recursion,
            max_nbr_solution=max_nbr_solution,
            repeat=repeat,
            verbose=False,
        )
        sig_recursion_timeouts.append(recursion_timeout)
        if len(smol):
            Nsig += 1
            print(f"      ...enumeratemolecule:  signature {j} nbr molecule(s) {len(smol)}")
        Smol |= set(smol)
    # The step counts as timed out if any candidate signature timed out.
    recursion_timeout = True in sig_recursion_timeouts
    ct_mol = time.time() - st_2

    ct_total = time.time() - st_1

    # Keep only the molecules accepted by test_sol_ECFP against the input.
    Alphabet.nBits = 2048
    Smolfinal = [smi2 for smi2 in Smol if test_sol_ECFP([smi, smi2], Alphabet=Alphabet)]

    if len(Smol) != len(Smolfinal):
        print("Smol", len(Smol), "Smolfinal", len(Smolfinal))

    foundmol = smi in Smolfinal
    if not foundmol:
        # Show what was kept to help diagnose the miss.
        print(Smolfinal)

    print(
        f"{i} | {smi} | {wt:.1f} | {len(Ssig)} | {Nsig} | {int(foundsig)} | {len(Smolfinal)} | {int(foundmol)} | {ct_total:.4f} | {ct_solve:.4f}"
    )

    list_i.append(i)
    list_smi.append(smi)
    list_wt.append(wt)
    list_nsig.append(len(Ssig))
    list_nsigtrue.append(Nsig)
    list_foundsig.append(int(foundsig))
    list_nmol.append(len(Smolfinal))
    list_foundmol.append(int(foundmol))
    list_ct_sig.append(ct_sig)
    list_ct_mol.append(ct_mol)
    list_ct_all.append(ct_total)
    list_ct_solve.append(ct_solve)
    list_timeout_sig.append(partition_timeout)
    list_timeout_mol.append(recursion_timeout)

In [None]:
# Percentage of tested molecules whose input SMILES was recovered by the full
# ECFP -> signature -> molecule pipeline. Evaluates to NaN (instead of raising
# ZeroDivisionError) when no molecule has been tested yet.
n_tested = len(list_foundmol)
100 * sum(list_foundmol) / n_tested if n_tested else float("nan")

In [None]:
# Summary table of the full ECFP -> signature -> molecule benchmark, built
# from the result lists (one entry per tested molecule). Column order follows
# the list below.
columns_ecfp_to_mol = [
    ("ID", list_i),
    ("smi", list_smi),
    ("wt", list_wt),
    ("Nsig", list_nsig),
    ("NsigTrue", list_nsigtrue),
    ("FoundSig", list_foundsig),
    ("Nmol", list_nmol),
    ("Foundmol", list_foundmol),
    ("CT ecfp_sig", list_ct_sig),
    ("CT sig_mol", list_ct_mol),
    ("CT ecfp_mol", list_ct_all),
    ("CT solve_partitions", list_ct_solve),
    ("Timeout ecfp_sig", list_timeout_sig),
    ("Timeout sig_mol", list_timeout_mol),
]
df = pd.DataFrame(dict(columns_ecfp_to_mol))

# Last expression of the cell: the notebook displays the table.
df

# Optional export to Excel:
# df.to_excel("RevSig_ecfp_to_mol.xlsx", index=False)