# Build Training sets and Signature Alphabets

In [1]:
# CHANGE TO YOUR DIRECTORY PATH
# path_directory: becomes the working directory (see os.chdir below); the
#   `src.signature` imports further down are presumably resolved relative to
#   it -- TODO confirm.
# dataset_directory: prefix used to build every dataset file path in this notebook.
path_directory = "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature"
dataset_directory = "C:/Users/meyerp/Documents/INRAE/Datasets"

import os
import time

os.chdir(path_directory)
# Set before the src.signature modules are imported; presumably read at import
# time to switch on SMARTS canonicalization -- TODO confirm against the package.
os.environ["RD_CANON"] = "True"

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")

from src.signature.Signature import MoleculeSignature
from src.signature.signature_old import sanitize_molecule
from src.signature.signature_alphabet import load_alphabet, SignatureAlphabet, signature_from_smiles
from src.signature.utils import read_csv, read_tsv, read_txt, write_csv

Canonicalization of SMARTS is enabled.


In [2]:
# Notebook-wide switch: forwarded as `use_smarts=` to MoleculeSignature and
# SignatureAlphabet, and, when True, adds "_smarts" to the output file-name suffixes.
use_smarts = False

## Build molecule datasets: sanitize molecules

In [None]:
# Load the raw file and sanitize its molecules.
# The final file has two columns: ID and canonical SMILES.
# Warning: this cell is slow.

def sanitize(data, MaxMolecularWeight, size):
    """Keep unique, single-fragment molecules not heavier than MaxMolecularWeight.

    Parameters
    ----------
    data : 2-D array
        Column 0 is the identifier, column 1 the SMILES string. The array is
        modified in place: for every kept row, column 1 is overwritten with
        the SMILES string returned by ``sanitize_molecule``.
    MaxMolecularWeight : number
        Upper bound on the exact molecular weight. Also used for a cheap
        pre-filter: SMILES longer than ``int(MaxMolecularWeight / 5)``
        characters are skipped without being parsed.
    size : number
        Stop as soon as this many rows have been kept (``float("inf")`` for
        no limit).

    Returns
    -------
    numpy.ndarray
        The kept rows, in input order (an empty array if nothing was kept).
    """
    # Loop-invariant length threshold for the cheap pre-filter.
    max_smiles_length = int(MaxMolecularWeight / 5)
    kept_rows, seen_smiles = [], set()
    for i in range(data.shape[0]):
        smi = str(data[i, 1])
        if smi == "nan":
            continue  # missing SMILES
        if i % 100000 == 0:
            print(f"-------{i} {data[i,0]} {data[i,1]} {len(smi)}")
        if "." in smi:
            continue  # not in one piece
        if smi in seen_smiles:
            continue  # already there
        if len(smi) > max_smiles_length:  # Cheap skip
            continue
        parsed = Chem.MolFromSmiles(smi)
        if parsed is None:
            continue  # RDKit could not parse the SMILES
        mol, smi = sanitize_molecule(parsed)
        if mol is None:
            continue
        mw = Descriptors.ExactMolWt(mol)
        if mw > MaxMolecularWeight:
            continue
        if smi in seen_smiles:
            continue  # canonical smi already there
        seen_smiles.add(smi)
        data[i, 1] = smi
        kept_rows.append(data[i])
        if len(kept_rows) >= size:
            break
    return np.asarray(kept_rows)


# --- MetaNetX: load, shuffle, sanitize ---------------------------------------
MaxMolecularWeight = 500
size = float("inf")  # no cap on the number of molecules kept
seed = 10
filename = dataset_directory + "/MetaNetX"
H, D = read_tsv(filename)
H = ["ID", "SMILES"]
D = D[:, [0, 8]]  # keep columns 0 and 8, relabelled ID / SMILES above
# Seed before shuffling: sanitize() keeps the first occurrence of each SMILES,
# so the row order decides which ID is retained and the order of the output.
np.random.seed(seed=seed)
np.random.shuffle(D)
print(f"size={D.shape[0]}")
D = sanitize(D, MaxMolecularWeight, size)
filename = f"{filename}_weight_{MaxMolecularWeight}"
print(f"File={filename} Header={H} D={D.shape}")
#write_csv(filename, H, D)

# --- eMolecules variant (disabled) -------------------------------------------
# Kept as comments instead of a bare triple-quoted string, which Jupyter would
# echo as the output of this cell.
#
# MaxMolecularWeight = 500
# size = 1e6
# filename = dataset_directory + "/emolecule"
# H = ['ID', 'SMILES']
# T = read_txt(filename+'.txt')
# print(f'Size={len(T)}')
# D = {}
# for i in range(len(T)-1):
#     line = list(T[i+1].split(' '))
#     D[i] = [line[1]+'_'+line[2], line[0]]
# D = np.asarray(list(D.values()))
# np.random.shuffle(D)
# print(f'File={filename} Header={H} D={D.shape}')
# D = sanitize(D, MaxMolecularWeight, size)
# filename = f'{filename}_weight_{str(MaxMolecularWeight)}'
# print(f'File={filename} Header={H} D={D.shape}')
# write_csv(filename, H, D)

## Build training sets: filter molecules for generative models and deterministic enumeration

In [None]:
# Compute signatures in various formats
def filter(smi, radius, verbose=False):
    """Build the two signature strings of a SMILES, or reject it.

    Parameters
    ----------
    smi : str
        SMILES string to process.
    radius : int
        Radius forwarded to ``MoleculeSignature``.
    verbose : bool
        Accepted for compatibility; not used.

    Returns
    -------
    tuple
        ``(sig1, sig2, mol, smi)`` where ``sig1`` is built with ``nbits=0`` and
        ``sig2`` with ``nbits=2048``. A rejected input yields
        ``("", "", None, "")``.

    Notes
    -----
    Reads the notebook-level ``use_smarts`` flag.
    """
    rejected = ("", "", None, "")

    # Multi-fragment SMILES (".") and wildcard atoms ("*") are not handled.
    if "." in smi or "*" in smi:
        return rejected
    # Bracket atoms are only accepted when the SMILES also contains "@"
    # (original note: cannot process [*] without kekulization).
    if "[" in smi and "@" not in smi:
        return rejected

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return rejected

    # Same molecule, two signature flavours: nbits=0 first, then nbits=2048.
    signatures = []
    for nbits in (0, 2048):
        ms = MoleculeSignature(
            mol, radius=radius, neighbor=True, use_smarts=use_smarts, nbits=nbits
        )
        signatures.append(ms.as_deprecated_string(morgan=False, neighbors=True))
    sig1, sig2 = signatures

    if "" in (sig1, sig2):
        return rejected
    return sig1, sig2, mol, smi


# parameters
radius = 2
size = 1000
ext = f"_radius_{radius}_size_{size}"
if use_smarts:
    ext = ext + "_smarts"
# filename = './datasets/emolecule_weight_500'
filename = dataset_directory + "/MetaNetX_weight_500"
seed = 10
np.random.seed(seed=seed)

# Load Smiles file
H, D = read_csv(filename)
print(H, D.shape[0])
# Sort the de-duplicated SMILES before shuffling: the iteration order of a set
# of strings changes from one interpreter session to the next (hash
# randomization), so seeding alone would not make the shuffle reproducible.
Smiles = np.asarray(sorted(set(D[:, 1])))
print(f"Number of smiles: {len(Smiles)}")
np.random.shuffle(Smiles)

# Get to business: collect `size` molecules for which both signatures exist.
H = [
    "SMILES",
    "SIG",
    "SIG-NBIT",
]
rows = []
for candidate in Smiles:
    sig1, sig2, mol, smi = filter(candidate, radius)
    if sig1 == "" or sig2 == "":
        print(candidate)  # rejected SMILES
        continue
    rows.append([smi, sig1, sig2])
    if len(rows) == size:
        break
else:
    # Reached only when the SMILES list ran out before `size` rows were
    # collected (an unbounded index loop would raise IndexError here).
    print(f"Warning: only {len(rows)} of the {size} requested molecules passed the filter")
D = np.asarray(rows)
#write_csv(filename + ext, H, D)

## Build Signature Alphabet

In [5]:
# Build a signature alphabet from the sanitized SMILES file

# Parameters
radius = 2
nBits = 2048
allHsExplicit = False
seed = 10
file_smiles = dataset_directory + "/MetaNetX_weight_500"

# File-name suffix encoding the alphabet settings
ext = "_Alphabet" + ("_hydrogen" if allHsExplicit else "") + f"_radius_{radius}_nBits_{nBits}"
ext = ext + "_smarts" if use_smarts else ext
file_alphabet = file_smiles + ext
np.random.seed(seed=seed)

# Load Smiles file and keep only the first 50 entries
H, D = read_csv(file_smiles)
print(f"Header={H}\nD={D.shape}")
Smiles = np.asarray(list(D[:, 1]))[:50]
print(f"Number of smiles: {len(Smiles)}")

# Create Alphabet
print("nBits", nBits)
Alphabet = SignatureAlphabet(radius=radius, nBits=nBits, use_smarts=use_smarts)

# Fill the Alphabet and time it (saving / reloading is currently disabled)
start_time = time.time()
Alphabet.fill(Smiles, verbose=False)
#Alphabet.save(file_alphabet)
#Alphabet = load_alphabet(file_alphabet)
ft = time.time() - start_time
print(f"CPU time compute Alphabet: {ft:.2f}")
Alphabet.print_out()

Header=['ID', 'SMILES']
D=(188547, 2)
Number of smiles: 50
nBits 2048
... processing alphabet iteration: 0 size: 0 time: 0.000000
CPU time compute Alphabet: 0.52
filename: 
radius: 3
nBits: 2048
splitcomponent: False
isomericSmiles: False
formalCharge: True
atomMapping: False
kekuleSmiles: False
allHsExplicit: False
maxvalence: 4
alphabet length: 541
