# Build Training sets and Signature Alphabets

## Build molecule datasets: sanitize molecules

In [4]:
import os
import time

# Root of the local checkout of the repository.
# CHANGE TO YOUR DIRECTORY PATH, or set the SIGNATURE_REPO_DIR environment
# variable so the notebook runs on another machine without editing this cell.
path_directory = os.environ.get(
    "SIGNATURE_REPO_DIR",
    "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature",
)

# The `src.signature...` imports and the "./datasets/..." paths used in this
# notebook are written relative to the working directory set here.
os.chdir(path_directory)

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")

from src.signature.signature import sanitize_molecule
from src.signature.signature_alphabet import load_alphabet, SignatureAlphabet, signature_from_smiles
from src.signature.utils import read_csv, read_tsv, read_txt, write_csv

In [None]:
# Load file and sanitize molecule
# Final file format is two columns ID + canonical SMILES
# Warning: this cell is slow


def sanitize(data, MaxMolecularWeight, size):
    """Filter a table of (ID, SMILES) rows down to unique, sanitized molecules.

    Rows are visited in order. A row is kept only when its SMILES string:
      * is not the literal "nan";
      * contains no "." (i.e. the molecule is in one piece);
      * is not longer than ``int(MaxMolecularWeight / 5)`` characters
        (cheap pre-filter applied before any parsing);
      * can be parsed by ``Chem.MolFromSmiles`` and is accepted by
        ``sanitize_molecule`` (which returns ``(mol, smiles)``);
      * has ``ExactMolWt`` not greater than ``MaxMolecularWeight``;
      * has not been seen before, either as given or in the form returned
        by ``sanitize_molecule``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; column 0 is the molecule ID and column 1 the SMILES.
        The array is left unmodified.
    MaxMolecularWeight : int or float
        Upper bound on the exact molecular weight of kept molecules.
    size : int or float
        Stop as soon as this many rows have been kept
        (``float("inf")`` means no limit).

    Returns
    -------
    numpy.ndarray
        The kept rows, with column 1 replaced by the SMILES returned by
        ``sanitize_molecule``. Empty array when no row qualifies.
    """
    kept_rows, seen_smiles = [], set()
    max_smiles_length = int(MaxMolecularWeight / 5)  # cheap-skip threshold
    for i in range(data.shape[0]):
        smi = str(data[i, 1])
        if smi == "nan":
            continue
        if i % 100000 == 0:
            # Progress trace for long runs
            print(f"-------{i} {data[i,0]} {data[i,1]} {len(smi)}")
        if "." in smi:
            continue  # not in one piece
        if smi in seen_smiles:
            continue  # already there
        if len(smi) > max_smiles_length:
            continue  # cheap skip: too long to be under the weight limit
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue  # RDKit could not parse the SMILES
        mol, smi = sanitize_molecule(mol)
        if mol is None:
            continue  # sanitization failed
        if Chem.Descriptors.ExactMolWt(mol) > MaxMolecularWeight:
            continue
        if smi in seen_smiles:
            continue  # sanitized SMILES already there
        seen_smiles.add(smi)
        # Work on a copy of the row so the caller's array is not altered.
        row = data[i].copy()
        row[1] = smi
        kept_rows.append(row)
        if len(kept_rows) >= size:
            break
    return np.asarray(kept_rows)


# Build the sanitized MetaNetX dataset: read the raw table, keep the ID and
# SMILES columns, shuffle, sanitize, and write the result next to the input.
MaxMolecularWeight = 500
size = float("inf")  # no cap on the number of molecules kept
seed = 10  # fixed seed so the shuffle, and hence the written file, is reproducible
filename = "./datasets/MetaNetX"
H, D = read_tsv(filename)
H = ["ID", "SMILES"]  # header of the output file (replaces the one read above)
D = D[:, [0, 8]]  # columns 0 and 8 are used as ID and SMILES
np.random.seed(seed=seed)
np.random.shuffle(D)
print(f"size={D.shape[0]}")
D = sanitize(D, MaxMolecularWeight, size)
# From here on `filename` is the output name, e.g. ./datasets/MetaNetX_weight_500
filename = f"{filename}_weight_{str(MaxMolecularWeight)}"
print(f"File={filename} Header={H} D={D.shape}")
write_csv(filename, H, D)
# Disabled variant of the dataset build, for the './datasets/emolecule' text
# file instead of MetaNetX. It is kept as a bare string literal, so none of
# the code inside it is executed.
# NOTE(review): as the last expression of the cell, this string is presumably
# echoed as the cell's output; consider moving it to a separate, clearly
# disabled cell.
"""
MaxMolecularWeight = 500
size = 1e6
filename = './datasets/emolecule'
H = ['ID', 'SMILES']
T = read_txt(filename+'.txt')
print(f'Size={len(T)}')
D = {}
for i in range(len(T)-1):
    line = list(T[i+1].split(' '))
    D[i] = [line[1]+'_'+line[2], line[0]]
D = np.asarray(list(D.values()))
np.random.shuffle(D)
print(f'File={filename} Header={H} D={D.shape}')
D = sanitize(D, MaxMolecularWeight, size)
filename = f'{filename}_weight_{str(MaxMolecularWeight)}'
print(f'File={filename} Header={H} D={D.shape}')
write_csv(filename, H, D)
"""

## Build training sets: filter molecules for generative models and deterministic enumeration

In [2]:
# Compute each molecule's signature in two formats (nBits=0 and nBits=2048)


def filter(smi, radius, verbose=False):
    """Compute the two signatures of a SMILES, or reject the molecule.

    A SMILES is rejected without any computation when it contains "." or
    "*", or when it contains "[" but no "@" (bracket atoms cannot be
    processed without kekulization). Otherwise its signature is computed
    twice with ``signature_from_smiles``: first with an alphabet built with
    ``nBits=0``, then with ``nBits=2048``. The second computation uses the
    SMILES returned by the first one.

    NOTE: the name shadows the built-in ``filter``; it is kept because the
    rest of the notebook calls it under this name.

    Parameters
    ----------
    smi : str
        SMILES string of the molecule.
    radius : int
        Radius passed to ``SignatureAlphabet``.
    verbose : bool, optional
        Forwarded to ``signature_from_smiles`` (default False).

    Returns
    -------
    tuple
        ``(sig1, sig2, mol, smi)`` on success, where ``mol`` and ``smi`` come
        from the second computation; ``("", "", None, "")`` when the SMILES
        is rejected or either signature is empty.
    """
    rejected = ("", "", None, "")
    if "." in smi or "*" in smi:
        return rejected
    if "[" in smi and "@" not in smi:
        # cannot process [*] without kekulization
        return rejected

    alphabet = SignatureAlphabet(radius=radius, nBits=0)
    sig1, mol, smi = signature_from_smiles(smi, alphabet, neighbor=True, verbose=verbose)
    if sig1 == "":
        # No point computing the second signature from a failed first pass.
        return rejected

    alphabet = SignatureAlphabet(radius=radius, nBits=2048)
    sig2, mol, smi = signature_from_smiles(smi, alphabet, neighbor=True, verbose=verbose)
    if sig2 == "":
        return rejected
    return sig1, sig2, mol, smi


# parameters
radius = 2
size = 1000  # number of molecules wanted in the training set
ext = f"_radius_{radius}_size_{size}"
# filename = './datasets/emolecule_weight_500'
filename = "./datasets/MetaNetX_weight_500"
seed = 10
np.random.seed(seed=seed)

# Load Smiles file
H, D = read_csv(filename)
print(H, D.shape[0])
# Sort the unique SMILES before shuffling: the iteration order of a set of
# strings changes from one Python process to the next, which would make the
# seeded shuffle below non-reproducible.
Smiles = np.asarray(sorted(set(D[:, 1])))
print(f"Number of smiles: {len(Smiles)}")
np.random.shuffle(Smiles)

# Get to business: keep the first `size` molecules that pass `filter`
H = [
    "SMILES",
    "SIG",
    "SIG-NBIT",
]
rows = []
for candidate in Smiles:
    if len(rows) >= size:
        break
    sig1, sig2, mol, smi = filter(candidate, radius)
    if sig1 == "" or sig2 == "":
        print(candidate)  # trace rejected molecules
        continue
    rows.append([smi, sig1, sig2])
if len(rows) < size:
    # The list of candidates ran out before `size` molecules were accepted.
    print(f"Warning: only {len(rows)} of the {size} requested molecules passed the filter")
D = np.asarray(rows)
write_csv(filename + ext, H, D)

['ID', 'SMILES'] 188547
Number of smiles: 188547
CC(C)C(O)(C=O)N(Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1)C(=O)C(O)CC(O)CO
CC1(C)SC2C([NH3])C(=O)N2C1C(O)=O
COc1ccc(C[NH](C)CC(C)(CO)CO)cc1
c1ccc2c3c([nH]c2c1)C1CC2CCOCC2CN1CC3
*c1oc2c(O)c3oc(=O)c(*)c(*)c3c(*)c2c1*
CCCCC(=O)OCC(COP(O)(=O)OCCN(C)(C)C)OC(=O)CCCC
CCCCCCCC(=O)OC(CO)COP(O)(=O)OCC[NH3]
CCC(C)C(NC(=O)C(CCC(=O)O)NC(=O)C(N)Cc1cnc[nH]1)C(=O)O
CCCCC(=O)N(Cc1ccc(C=C(CC(O)=O)c2nn[nH]n2)cc1)C(O)(C=O)C(C)C
O=P1(O)O[U](=O)(=O)O1
ON(=O)c1ccc2c(Cl)c(C(=O)NCCCN3CCOCC3)sc2c1
CCCCCC=CC(=O)OC(CCC(O)=O)N(C)(C)C
C#NC1C2c3c[nH]c4cccc(c34)C(C)(C)C2CC(Cl)C1(C)C=C
CCC(C)C(NC(=O)C(Cc1c[nH]c2ccccc12)NC(=O)CN)C(=O)O
Nc1ncnc(NC2OC(CO)C(O)C2O)c1N(O)=O
ON(=O)O[Pb]ON(O)=O
Cc1nn(C)c(C)c1Nc1n[nH]c(N)n1
CCc1nnc2n1N=C(c1ccc(N(O)=O)cc1)CS2
NC(Cc1ccccc1)C(=O)NC(Cc1cnc[nH]1)C(=O)O
Cc1cnc(CS(=O)c2nc3ccc(O)cc3[nH]2)c(C)c1O


KeyboardInterrupt: 

## Build Signature Alphabet

In [5]:
# Build the signature alphabet from the sanitized SMILES file, save it,
# reload it from disk and print it.

# Parameters
radius, nBits = 2, 2048
allHsExplicit = False  # only used here to tag the output file name
seed = 10
file_smiles = "./datasets/MetaNetX_weight_500"
ext = "".join(
    [
        "_Alphabet",
        "_hydrogen" if allHsExplicit else "",
        f"_radius_{radius}_nBits_{nBits}",
    ]
)
file_alphabet = file_smiles + ext
np.random.seed(seed=seed)

# Load Smiles file (column 1 holds the SMILES strings)
H, D = read_csv(file_smiles)
print(f"Header={H}\nD={D.shape}")
Smiles = np.asarray([entry for entry in D[:, 1]])
print(f"Number of smiles: {len(Smiles)}")

# Create Alphabet
Alphabet = SignatureAlphabet(radius=radius, nBits=nBits)

# Fill, save and reload the Alphabet; the timer covers all three steps
start_time = time.time()
Alphabet.fill(Smiles, verbose=False)
Alphabet.save(file_alphabet)
Alphabet = load_alphabet(file_alphabet)
print(f"CPU time compute Alphabet: {time.time() - start_time:.2f}")
Alphabet.printout()

Header=['ID', 'SMILES']
D=(188547, 2)
Number of smiles: 188547
... processing alphabet iteration: 0 size: 0 time: 0.0
... processing alphabet iteration: 1000 size: 6971 time: 15.204696893692017


KeyboardInterrupt: 