# **2. Enumerate molecules from ECFPs**

In this notebook we show how to perform the enumeration on the MetaNetX, eMolecules, DrugBank and MolForge datasets. The alphabets and datasets are available at https://doi.org/10.5281/zenodo.15682264.

In [12]:
import os

import pandas as pd
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem
from rdkit.Chem.Descriptors import ExactMolWt

from molsig.enumerate_signature import enumerate_molecule_from_morgan
from molsig.Signature import MoleculeSignature
from molsig.SignatureAlphabet import load_alphabet, merge_alphabets

# Disable RDKit logging for every log channel matching "rdApp.*".
RDLogger.DisableLog("rdApp.*")

# Datasets path

In [None]:
# Root folder holding the downloaded alphabets and datasets.
# It can be overridden without editing the notebook by setting the
# MOLSIG_DATASETS environment variable; the original local path stays the default.
path_datasets = os.environ.get("MOLSIG_DATASETS", "C:/Users/meyerp/Documents/INRAE/Datasets/")

# The following cells build file names by plain string concatenation
# (path_datasets + "alphabets/..."), so the root must end with a separator.
if not path_datasets.endswith(("/", "\\")):
    path_datasets += "/"

### Load of the Alphabet

We first load the alphabet used to perform the enumeration (MetaNetX, eMolecules, or the merged MetaNetX-eMolecules-DrugBank-ChEMBL alphabet).

In [None]:
# Alphabet files shipped with the datasets, relative to path_datasets.
alphabet_files = {
    "metanetx": "alphabets/metanetx_alphabet.npz",
    "emolecules": "alphabets/emolecules_alphabet.npz",
    # Merged alphabet; this is the entry that was labelled "DrugBank" in the
    # original version of this cell.
    "merged": "alphabets/metanetx_emolecules_drugbank_chembl_merged_alphabet.npz",
}

# Select the alphabet here: one of the keys of alphabet_files.
alphabet_name = "metanetx"

file_alphabet = path_datasets + alphabet_files[alphabet_name]
Alphabet = load_alphabet(file_alphabet, verbose=True)

### Load of the SMILES dataset

We load the SMILES dataset (MetaNetX, eMolecules, DrugBank or MolForge).

In [None]:
# Supported SMILES datasets. Each entry gives:
#   (file relative to path_datasets, SMILES column, columns to read, max molecules)
# "columns to read" = None reads every column; "max molecules" = None keeps
# every row. DrugBank stores its SMILES in "SMILES_STEREO" and is used in full,
# the other datasets are truncated to their first 10000 molecules.
smiles_datasets = {
    "metanetx": ("metanetx/test.tsv", "SMILES", ["SMILES"], 10000),
    "emolecules": ("emolecules/test.tsv", "SMILES", ["SMILES"], 10000),
    "drugbank": ("drugbank/drugbank_500_no_duplicates.tsv", "SMILES_STEREO", None, None),
    "molforge": ("molforge/molforge.tsv", "SMILES", ["SMILES"], 10000),
}

# Select the dataset here: one of the keys of smiles_datasets.
dataset_name = "metanetx"

smiles_file, smiles_column, smiles_usecols, max_molecules = smiles_datasets[dataset_name]
path_smiles = path_datasets + smiles_file

df = pd.read_csv(path_smiles, sep="\t", usecols=smiles_usecols)
# Slicing with [:None] keeps the whole list, so one expression serves all datasets.
list_smiles = list(df[smiles_column])[:max_molecules]

### ECFP representation function

We use the ECFP (Morgan fingerprint) generator from AllChem, configured with the same parameters as those used when computing the alphabet.

In [None]:
# Morgan fingerprint generator whose radius, size and chirality flag are all
# taken from the loaded alphabet.
fpgen = AllChem.GetMorganGenerator(
    radius=Alphabet.radius,
    fpSize=Alphabet.nBits,
    includeChirality=Alphabet.use_stereo,
)

### Enumeration on the ECFPs of the input molecules

We set the partition and the recursion thresholds to $2 \times 10^5$.

In [None]:
# Both limits passed to the enumeration share the same value, 2 x 10^5.
max_nbr_partition = max_nbr_recursion = 200_000

We now perform the enumeration.

In [None]:
# One list per output column, filled in lockstep (one entry per processed
# molecule). The names are kept because the following cells read them directly.
list_i = []
list_smi = []
list_wt = []
list_nsig = []
list_nsigtrue = []
list_foundsig = []
list_nmol = []
list_foundmol = []
list_ct_sig = []
list_ct_mol = []
list_ct_all = []
list_ct_dioph = []
list_partition_threshold_reached = []
list_recursion_threshold_reached = []
list_new_smiles = []

# Header for the per-molecule summary line printed at the end of each iteration
# (11 fields, including the two threshold flags).
print("ID | smi | weight | Nsig | NsigTrue | FoundSig | Nmol | FoundMol | CPU-time | ThresholdPart | ThresholdRec")
for i, smi in enumerate(list_smiles):
    # RDKit returns None when a SMILES cannot be parsed, and a missing cell read
    # by pandas is not a string at all. Skip such entries instead of crashing in
    # ExactMolWt and losing the results accumulated so far.
    mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
    if mol is None:
        print(i, "|", smi, "| invalid SMILES, skipped")
        continue

    wt = ExactMolWt(mol)
    # Progress line printed before the (potentially long) enumeration starts.
    print(i, "|", smi)
    # Compute ECFP
    morgan = fpgen.GetCountFingerprint(mol).ToList()
    # Compute molecular signature
    # NOTE(review): use_stereo is hard-coded to False here, whereas the
    # fingerprint generator follows Alphabet.use_stereo — confirm this is
    # intended for alphabets built with stereo information.
    ms = MoleculeSignature(mol, radius=Alphabet.radius, nbits=0, map_root=True, use_stereo=False)
    ms.post_compute_neighbors()
    sig = sorted([atom.to_string(neighbors=True) for atom in ms.atoms])
    # Enumeration ECFP => molecule(s)
    Ssig, Smol, Nsig, thresholds_reached, computational_times = enumerate_molecule_from_morgan(
        morgan,
        Alphabet,
        max_nbr_partition=max_nbr_partition,
        max_nbr_recursion=max_nbr_recursion,
        verbose=True,
    )
    # Was the input molecule recovered, as a signature and as a SMILES string?
    foundsig = sig in Ssig
    foundmol = smi in Smol
    print(
        f"{i} | {smi} | {wt:.1f} | {len(Ssig)} | {Nsig} | {int(foundsig)} | {len(Smol)} | {int(foundmol)} | {computational_times[2]:.4f} | {thresholds_reached[0]} | {thresholds_reached[1]}"
    )

    list_i.append(i)
    list_smi.append(smi)
    list_wt.append(wt)
    list_nsig.append(len(Ssig))
    list_nsigtrue.append(Nsig)
    list_foundsig.append(int(foundsig))
    list_nmol.append(len(Smol))
    list_foundmol.append(int(foundmol))
    list_ct_sig.append(computational_times[0])
    list_ct_mol.append(computational_times[1])
    list_ct_all.append(computational_times[2])
    list_ct_dioph.append(computational_times[3])
    list_partition_threshold_reached.append(thresholds_reached[0])
    list_recursion_threshold_reached.append(thresholds_reached[1])
    # Nmol (recorded above) counts the input molecule when it was found;
    # the list of new SMILES stored below excludes it.
    if foundmol:
        Smol.remove(smi)
    list_new_smiles.append(Smol)

In [None]:
# Percentage of processed molecules whose input SMILES was found among the
# enumerated molecules; NaN (instead of a ZeroDivisionError) when nothing was processed.
100 * sum(list_foundmol) / len(list_foundmol) if list_foundmol else float("nan")

## We collect the results of the enumeration in a DataFrame (the Excel export on the last line is commented out).

In [None]:
# Column name -> per-molecule values gathered by the enumeration loop,
# listed in the order in which the columns appear in the table.
result_columns = [
    ("ID", list_i),
    ("smi", list_smi),
    ("wt", list_wt),
    ("Nsig", list_nsig),
    ("NsigTrue", list_nsigtrue),
    ("FoundSig", list_foundsig),
    ("Nmol", list_nmol),
    ("Foundmol", list_foundmol),
    ("CT ecfp_sig", list_ct_sig),
    ("CT sig_mol", list_ct_mol),
    ("CT ecfp_mol", list_ct_all),
    ("CT solve_partitions", list_ct_dioph),
    ("ThresholdPart", list_partition_threshold_reached),
    ("ThresholdRec", list_recursion_threshold_reached),
    ("NewSmiles", list_new_smiles),
]
# A dict keeps insertion order, so the DataFrame columns follow the list above.
df = pd.DataFrame(dict(result_columns))

#df.to_excel("RevSig_results.xlsx")