# Build Training sets and Signature Alphabets

In [11]:
# CHANGE TO YOUR DIRECTORY PATH
path_directory = "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature"
dataset_directory = "C:/Users/meyerp/Documents/INRAE/Datasets"

import os
import time

os.chdir(path_directory)

import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit import RDLogger

RDLogger.DisableLog("rdApp.*")

from signature.Signature import MoleculeSignature
from signature.signature_alphabet import compatible_alphabets, load_alphabet, merge_alphabets, sanitize_molecule, SignatureAlphabet, signature_from_smiles
from signature.utils import read_csv, read_tsv, read_txt, write_csv

## Build molecule datasets: sanitize molecules

In [None]:
# Load file and sanitize molecule
# Final file format is two columns ID + canonical SMILES
# Warning: this cell is slow

def sanitize(data, MaxMolecularWeight, size):
    """Filter a table of (ID, SMILES) rows and canonicalize the kept SMILES.

    A row is dropped when its SMILES is missing ("nan"), contains a "."
    (more than one piece), is longer than MaxMolecularWeight / 5 characters
    (cheap pre-filter before any parsing), cannot be parsed or sanitized,
    has an exact molecular weight above MaxMolecularWeight, or duplicates a
    SMILES that was already kept.

    Args:
        data: 2-D array indexed as data[i, 0] (ID) and data[i, 1] (SMILES).
            Kept rows are modified in place: column 1 is overwritten with
            the SMILES returned by sanitize_molecule.
        MaxMolecularWeight: upper bound on Descriptors.ExactMolWt.
        size: stop as soon as this many rows have been kept
            (float("inf") disables the cap).

    Returns:
        np.ndarray built from the kept rows (empty when nothing is kept).
    """
    kept_rows, seen_smiles = [], set()
    # Loop-invariant length cutoff used as a cheap skip.
    max_smiles_length = int(MaxMolecularWeight / 5)
    for i in range(data.shape[0]):
        smi = str(data[i, 1])
        if smi == "nan":
            continue
        if i % 100000 == 0:
            # Progress trace every 100000 input rows.
            print(f"-------{i} {data[i,0]} {data[i,1]} {len(smi)}")
        if "." in smi:
            continue  # not in one piece
        if smi in seen_smiles:
            continue  # already there
        if len(smi) > max_smiles_length:  # Cheap skip
            continue
        parsed = Chem.MolFromSmiles(smi)
        if parsed is None:
            continue  # RDKit could not parse this SMILES
        mol, smi = sanitize_molecule(parsed, formalCharge=True)
        if mol is None:
            continue
        mw = Chem.Descriptors.ExactMolWt(mol)
        if mw > MaxMolecularWeight:
            continue
        if smi in seen_smiles:
            continue  # canonical smi already there
        seen_smiles.add(smi)
        data[i, 1] = smi
        kept_rows.append(data[i])
        if len(kept_rows) >= size:
            break
    return np.asarray(kept_rows)

In [None]:
# The triple-quoted string below is a bare string literal: the MetaNetX
# variant of this cell, kept for reference and never executed.
"""
MaxMolecularWeight = 500
size = float("inf")
filename = dataset_directory + "/MetaNetX"
H, D = read_tsv(filename)
H = ["ID", "SMILES"]
D = D[:, [0, 8]]
np.random.shuffle(D)
print(f"size={D.shape[0]}")
D = sanitize(D, MaxMolecularWeight, size)
filename = f"{filename}_weight_{str(MaxMolecularWeight)}"
print(f"File={filename} Header={H} D={D.shape}")
write_csv(filename, H, D)
"""

# Build the sanitized emolecule dataset (ID + SMILES) and write it to CSV.
MaxMolecularWeight = 500
filename = dataset_directory + "/emolecule"
H = ['ID', 'SMILES']
T = read_txt(filename+'.txt')
# Cap passed to sanitize(): the number of lines read, so sanitize() is
# never cut short by the size limit.
size = len(T)
print(f'Size={len(T)}')

# Index 0 of T is skipped (presumably a header line -- confirm against the
# input file). Each remaining line is split on single spaces: field 0 goes
# to the SMILES column, fields 1 and 2 are joined with "_" to form the ID.
rows = []
for k in range(1, len(T)):
    fields = T[k].split(' ')
    if len(fields) < 3:
        continue  # blank or malformed line: nothing to build a row from
    rows.append([fields[1]+'_'+fields[2], fields[0]])
D = np.asarray(rows)
np.random.shuffle(D)
print(f'File={filename} Header={H} D={D.shape}')
D = sanitize(D, MaxMolecularWeight, size)
filename = f'{filename}_weight_{str(MaxMolecularWeight)}'
print(f'File={filename} Header={H} D={D.shape}')
write_csv(filename, H, D)

## Old: Build training sets: filter molecules for generative models and deterministic enumeration

In [None]:
# Compute signatures in various formats
def filter(smi, radius, verbose=False):
    """Return two signature strings for a SMILES, or a rejection tuple.

    The SMILES is rejected, without being parsed, when it contains "." or
    "*", or when it contains "[" but no "@". It is also rejected when RDKit
    cannot parse it or when either signature string comes back empty.

    Two MoleculeSignature objects are built from the same molecule, the
    first with nbits=0 and the second with nbits=2048. Both read the
    module-level ``use_smarts`` flag.

    Args:
        smi: SMILES string to process.
        radius: radius forwarded to MoleculeSignature.
        verbose: accepted but not used by this function.

    Returns:
        (sig_nbits_0, sig_nbits_2048, mol, smi) on success, otherwise
        ("", "", None, "").
    """
    rejected = ("", "", None, "")

    has_dot_or_wildcard = "." in smi or "*" in smi
    # Original note: cannot process [*] without kekulization.
    bracket_without_at = "[" in smi and "@" not in smi
    if has_dot_or_wildcard or bracket_without_at:
        return rejected

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return rejected

    # Same construction order as before: nbits=0 first, then nbits=2048.
    signatures = []
    for nbits in (0, 2048):
        ms = MoleculeSignature(mol, radius=radius, neighbor=True, use_smarts=use_smarts, nbits=nbits)
        signatures.append(ms.as_deprecated_string(morgan=False, neighbors=True))
    sig1, sig2 = signatures

    if "" in (sig1, sig2):
        return rejected
    return sig1, sig2, mol, smi


# parameters
# NOTE: `use_smarts` is not assigned in this cell; it must already exist in
# the session (the "Build Signature Alphabet" cell assigns it), otherwise
# the next lines raise NameError.
radius = 2
size = 1000
ext = f"_radius_{radius}_size_{size}"
if use_smarts:
    ext = ext + "_smarts"
# filename = './datasets/emolecule_weight_500'
filename = dataset_directory + "/MetaNetX_weight_500"
seed = 10
np.random.seed(seed=seed)

# Load Smiles file
H, D = read_csv(filename)
print(H, D.shape[0])
# Sort the unique SMILES before shuffling: the iteration order of a set of
# strings changes from one Python process to the next (hash randomization),
# so without sorting the fixed seed would not give a reproducible selection.
Smiles = np.asarray(sorted(set(map(str, D[:, 1]))))
print(f"Number of smiles: {len(Smiles)}")
np.random.shuffle(Smiles)

# Get to business
H = [
    "SMILES",
    "SIG",
    "SIG-NBIT",
]
# Walk the shuffled SMILES and keep the first `size` that pass filter().
# Rejected SMILES are printed. The loop is bounded by the number of
# available SMILES instead of indexing past the end of the array.
rows = []
for candidate in Smiles:
    sig1, sig2, mol, smi = filter(candidate, radius)
    if sig1 == "" or sig2 == "":
        print(candidate)
        continue
    rows.append([smi, sig1, sig2])
    if len(rows) >= size:
        break
else:
    print(f"Warning: only {len(rows)} of the requested {size} molecules passed the filter")
D = np.asarray(rows)
#write_csv(filename + ext, H, D)

## Build Signature Alphabet

In [7]:
# Build a signature alphabet from a SMILES file and save it.

# Parameters
# radius, nBits, use_smarts and boundary_bonds are forwarded to
# SignatureAlphabet below. allHsExplicit is only used here to tag the
# output file name; it is not passed to SignatureAlphabet in this cell.
radius = 2
nBits = 2048
allHsExplicit = False
use_smarts = True
boundary_bonds = False
ext = "_Alphabet"
ext += "_hydrogen" if allHsExplicit else ""
ext += f"_radius_{radius}_nBits_{nBits}"
file_smiles = dataset_directory + "/MetaNetX_weight_500"
file_alphabet = file_smiles + ext
seed = 10
np.random.seed(seed=seed)

# Load Smiles file
H, D = read_csv(file_smiles)
print(f"Header={H}\nD={D.shape}")
# SMILES are taken from the first column of the file.
# NOTE(review): the per-file loop in the merge section reads column 1
# instead -- confirm which layout each input file actually has.
Smiles = np.asarray(list(D[:, 0]))
print(f"Number of smiles: {len(Smiles)}")

# Create Alphabet
print("nBits", nBits)
Alphabet = SignatureAlphabet(radius=radius, nBits=nBits, use_smarts=use_smarts, boundary_bonds=boundary_bonds, map_root=True, legacy=False)

# Fill the alphabet from the SMILES, report the elapsed time, then save it
# time.time() gives wall-clock elapsed seconds, despite the "CPU time" label.
start_time = time.time()
Alphabet.fill(Smiles, verbose=False)
ft = time.time() - start_time
print(f"CPU time compute Alphabet: {ft:.2f}")
Alphabet.print_out()
Alphabet.save(file_alphabet)

Header=['SMILES']
D=(208763, 1)
Number of smiles: 208763
nBits 2048
... processing alphabet iteration: 0 size: 0 time: 0.000000
... processing alphabet iteration: 1000 size: 8483 time: 31.616388
... processing alphabet iteration: 2000 size: 13737 time: 33.360161
... processing alphabet iteration: 3000 size: 18122 time: 32.209820
... processing alphabet iteration: 4000 size: 21736 time: 29.097859
... processing alphabet iteration: 5000 size: 25206 time: 27.581646
... processing alphabet iteration: 6000 size: 28176 time: 30.087819
... processing alphabet iteration: 7000 size: 31027 time: 29.410967
... processing alphabet iteration: 8000 size: 33661 time: 29.536975
... processing alphabet iteration: 9000 size: 36239 time: 29.671223
... processing alphabet iteration: 10000 size: 38585 time: 28.913601
... processing alphabet iteration: 11000 size: 40906 time: 30.375118
... processing alphabet iteration: 12000 size: 43105 time: 32.094247
... processing alphabet iteration: 13000 size: 45150 t

### To merge alphabets

- generate several alphabets

In [None]:
# Parameters shared by the per-file alphabet loop in the next cell.

# Parameters
radius = 2
nBits = 2048
allHsExplicit = False  # only used to tag the output file name
ext = "_Alphabet"
ext += "_hydrogen" if allHsExplicit else ""
ext += f"_radius_{radius}_nBits_{nBits}"
# NOTE(review): file_smiles is the bare dataset directory, so the next cell
# reads dataset_directory + "1", dataset_directory + "2", ... with no "/" or
# file base name in between. This looks like a placeholder to be edited --
# confirm the intended path prefix.
file_smiles = dataset_directory
file_alphabet = file_smiles + ext

In [None]:
# Build and save one alphabet per numbered SMILES file (suffixes 1 to 17).
# NOTE: use_smarts and boundary_bonds are not assigned in this section; they
# must already exist in the session (the "Build Signature Alphabet" cell
# assigns them).
for i in range(1, 18):
    # Load Smiles file
    H, D = read_csv(file_smiles + str(i))
    print(f"Header={H}\nD={D.shape}")

    # SMILES are read from the second column of each file.
    Smiles = np.asarray(list(D[:, 1]))
    print(f"Number of smiles: {len(Smiles)}")

    # Create Alphabet
    print("nBits", nBits)
    Alphabet = SignatureAlphabet(radius=radius, nBits=nBits, use_smarts=use_smarts, boundary_bonds=boundary_bonds, map_root=True, legacy=False)

    # Time only the fill step, as in the single-file cell, so the reported
    # value does not include the time spent saving.
    start_time = time.time()
    Alphabet.fill(Smiles, verbose=False)
    ft = time.time() - start_time
    Alphabet.save(file_alphabet + str(i))
    print(f"CPU time compute Alphabet: {ft:.2f}")
    Alphabet.print_out()
    print("\n\n")

- load two alphabets

In [None]:
# Load the first alphabet (MetaNetX) from disk and call its print_out().
file_alphabet_1 = dataset_directory + "/MetaNetX_weight_500_fcharge_Alphabet_radius_2_nBits_2048"
Alphabet_1 = load_alphabet(file_alphabet_1)
Alphabet_1.print_out()

In [None]:
# Load the second alphabet (emolecule) from disk and call its print_out().
# NOTE(review): this file name has a double underscore ("fcharge__Alphabet")
# where the MetaNetX one has a single underscore -- confirm it matches the
# file actually on disk.
file_alphabet_2 = dataset_directory + "/emolecule_weight_500_fcharge__Alphabet_radius_2_nBits_2048"
Alphabet_2 = load_alphabet(file_alphabet_2)
Alphabet_2.print_out()

- merge

In [None]:
# Presumably checks that the two alphabets can be merged (verify against
# signature_alphabet). As the last expression of the cell, its return value
# is what the notebook displays.
compatible_alphabets(Alphabet_1, Alphabet_2)

In [None]:
# Merge the two loaded alphabets into a third one and call its print_out().
# Saving the merged alphabet is left commented out on purpose: uncomment the
# last line to write it to disk.
Alphabet_3 = merge_alphabets(Alphabet_1, Alphabet_2)
Alphabet_3.print_out()
# Alphabet_3.save(dataset_directory + "/MetaNetX_emolecule_weight_500_fcharge_Alphabet_radius_2_nBits_2048")