# Build Training sets and Signature Alphabets

## Build molecule datasets: sanitize molecules

In [1]:
# Load file and sanitize molecule
# Final file format is two columns ID + canonical SMILES
# Warning: this cell is slow

import os
path_directory = "C:/Users/meyerp/Documents/INRAE/Diophantine/Enumération/github/signature" # CHANGE TO YOUR DIRECTORY PATH
os.chdir(path_directory)

from src.imports import *
from src.signature import SanitizeMolecule
from src.utils import read_tsv, read_txt, write_csv

def sanitize(data, MaxMolecularWeight, size):
    # Remove molecules with weight > MaxMolecularWeight
    # and with more than one piece. Make sure all molecules
    # are unique.    
    D, SMI = [], set()
    for i in range(data.shape[0]):
        ID, smi = data[i,0], str(data[i,1])
        if smi == 'nan':
            continue
        if i % 100000 == 0:
            print(f'-------{i} {data[i,0]} {data[i,1]} {len(smi)}')
        if smi.find('.') != -1:
            continue # not in one piece
        if smi in SMI:
            continue # aready there
        if len(smi) > int(MaxMolecularWeight/5): # Cheap skip
            continue
        mol, smi = SanitizeMolecule(Chem.MolFromSmiles(smi))
        if mol == None:
            continue
        mw = Chem.Descriptors.ExactMolWt(mol)
        if mw > MaxMolecularWeight:
            continue
        if smi in SMI:
            continue  # canonical smi aready there      
        SMI.add(smi)
        data[i,1] = smi
        D.append(data[i])
        if len(D) >= size:
            break
    return np.asarray(D)
      
MaxMolecularWeight = 500
size = float('inf')
filename = './datasets/MetaNetX'
H, D = read_tsv(filename)
H = ['ID', 'SMILES']
D = D[:,[0,8]]
np.random.shuffle(D)
print(f'size={D.shape[0]}')
D = sanitize(D, MaxMolecularWeight, size)
filename = f'{filename}_weight_{str(MaxMolecularWeight)}'
print(f'File={filename} Header={H} D={D.shape}')
write_csv(filename, H, D)
"""
MaxMolecularWeight = 500
size = 1e6
filename = './datasets/emolecule'
H = ['ID', 'SMILES']
T = read_txt(filename+'.txt')
print(f'Size={len(T)}')
D = {}
for i in range(len(T)-1):
    line = list(T[i+1].split(' '))
    D[i] = [line[1]+'_'+line[2], line[0]]
D = np.asarray(list(D.values()))
np.random.shuffle(D)
print(f'File={filename} Header={H} D={D.shape}')
D = sanitize(D, MaxMolecularWeight, size)
filename = f'{filename}_weight_{str(MaxMolecularWeight)}'
print(f'File={filename} Header={H} D={D.shape}')
write_csv(filename, H, D)
"""  

size=1292154
-------0 MNXM1217910 CCCCC=CCCCCCCCCCCC(=O)OC(COC(=O)CCCCCCCC(O)C(O)C(O)C=CCCCC)COC(=O)CCCCCCCC1OC1C(O)C(O)C(O)CCCC 94


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\meyerp\.conda\envs\diophant\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\meyerp\AppData\Local\Temp\ipykernel_2288\3546549132.py", line 49, in <module>
    D = sanitize(D, MaxMolecularWeight, size)
  File "C:\Users\meyerp\AppData\Local\Temp\ipykernel_2288\3546549132.py", line 26, in sanitize
    mol, smi = SanitizeMolecule(Chem.MolFromSmiles(smi))
  File "C:\Users\meyerp\Documents\INRAE\Diophantine\Enumération\RevSig1.7rad3\library\signature.py", line 203, in SanitizeMolecule
    [a.SetAtomMapNum(0) for a in mol.GetAtoms()]
  File "C:\Users\meyerp\Documents\INRAE\Diophantine\Enumération\RevSig1.7rad3\library\signature.py", line 203, in <listcomp>
    [a.SetAtomMapNum(0) for a in mol.GetAtoms()]
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\mey

TypeError: object of type 'NoneType' has no len()

## Build training sets: filter molecules for generative models and deterministic enumeration

In [2]:
# Compute signature in various format
from src.imports import *
from src.utils import read_tsv, read_txt, read_csv, write_csv
from src.signature_alphabet import SignatureAlphabet, SignatureFromSmiles

def filter(smi, radius, verbose=False):
    if '.' in smi: # 
        return '', '', None, ''
    if '*' in smi: # 
        return '', '',  None, ''
    if '[' in smi: # cannot process [*] without kekularization
        if '@' not in smi:
            return '', '',  None, ''
    Alphabet = SignatureAlphabet(radius=radius, nBits=0)
    sig1, mol, smi = SignatureFromSmiles(smi, Alphabet, neighbor=True, verbose=False)
    Alphabet = SignatureAlphabet(radius=radius, nBits=2048)
    sig2, mol, smi = SignatureFromSmiles(smi, Alphabet, neighbor=True, verbose=False)
    if sig1 == '' or sig2 == '' :
        return '', '', None, ''  
    return sig1, sig2, mol, smi

# parameters
radius = 2
size = 1000
ext = f'_radius_{radius}_size_{size}'
#filename = './datasets/emolecule_weight_500'
filename = './datasets/MetaNetX_weight_500'
seed = 10
np.random.seed(seed=seed)

# Load Smiles file
H, D = read_csv(filename)
print(H, D.shape[0])
Smiles = np.asarray(list(set(D[:,1])))
print(f'Number of smiles: {len(Smiles)}')
np.random.shuffle(Smiles)

# Get to business
H = ['SMILES', 'SIG', 'SIG-NBIT', ]
D, i, I = {}, 0, 0
while True:
    sig1, sig2, mol, smi = filter(Smiles[i], radius)
    if sig1 == '' or sig2 == '':
        print(Smiles[i])
        i += 1
        continue
    D[I] = [smi, sig1, sig2]
    i, I = i+1, I+1
    if I == size:
        break
D = np.asarray(list(D.values()))
write_csv(filename+ext, H, D)


['ID', 'SMILES'] 188547
Number of smiles: 188547
CC1C[NH]=CN1
NC(=O)CCC([NH3])C(N)=O
*OC1CC(n2cc(C=O)c(N)nc2=O)OC1COP(*)(O)=O
CC(C1CSC(C2CSC(c3ccccc3O)=N2)[NH2]1)C(C)(C)C1=NC(C)(C(O)=O)CS1
CN1CC(C(=O)O)C=C2c3cccc4[nH]cc(c34)CC21
ON(=O)c1ccc(C(=O)OCC(=O)NC(=O)NCc2ccccc2)s1
OCl(O)O
CCO[PH](=O)O
*NC(CC(C)(C)O)C(*)=O
*C1OCC(O)C(O)C1O
COc1cc2c(cc1O)C1Cc3ccc(OC)c(O)c3CN1(C)CC2
[NH3]CCCC(NC(=O)CC[NH3])C(O)=O
CC(C)CC(N)C(=O)NC(Cc1c[nH]c2ccccc12)C(=O)NC(Cc1ccccc1)C(=O)O
*OC1OC(COS(O)(=O)=O)C(O)C(O*)C1NC(C)=O
O=C(CCCn1ccc(-c2ccc(Cl)cc2)cc1)c1ccc(F)cc1
CC(C)C(C(O)=O)N(Cc1ccc(-c2ccccc2-c2nn[nH]n2)cc1)C(=O)C(O)C(O)C(C)O
COc1c(O)c(O)cc2cc(-c3cc4c(cc3CC=O)OCO4)[nH]cc12
Cc1cccc(C(C)c2c[nH]cn2)c1C
[CH2]
COC(Cc1c(C(O)=O)[nH]c(O)c1CC=O)C(O)=O
NC(O)(CC(=O)N(CC=O)Cc1nnc(C(F)(F)F)[nH]1)Cc1cc(F)c(F)cc1F
CC(=O)NC(CCCC[NH3])C(O)=O
NC(CO)C(=O)NCC(=O)NC(Cc1c[nH]c2ccccc12)C(=O)O
O=c1c2cccnc2oc2c(Cl)cc(-c3nnn[nH]3)cc12
ON(=O)c1c(C(O)c2cc(Cl)cc(Cl)c2O)[nH]c(Cl)c1Cl
[NH3]CCCC(=O)C(O)=O
*C1CCC2C3CCC4CC(OC5OC(CO)C(O)C

## Build Signature Alphabet

In [3]:
# Compute signature in various format
from src.imports import *
from src.utils import read_tsv, read_txt, read_csv, write_csv
from src.signature_alphabet import LoadAlphabet, SignatureAlphabet, SignatureFromSmiles

# Parameters
radius = 2
nBits = 2048
allHsExplicit = False
ext = '_Alphabet'
ext += '_hydrogen' if allHsExplicit else ''
ext += f'_radius_{radius}_nBits_{nBits}'
file_smiles = './datasets/MetaNetX_weight_500'
file_alphabet = file_smiles+ext
seed = 10
np.random.seed(seed=seed)

# Load Smiles file
H, D = read_csv(file_smiles)
print(f'Header={H}\nD={D.shape}')
Smiles = np.asarray(list(D[:,1]))
print(f'Number of smiles: {len(Smiles)}')

# Create Alphabet
Alphabet = SignatureAlphabet(radius=radius, nBits=nBits)

# Get save and load Alphabet
start_time = time.time()
Alphabet.fill(Smiles, verbose=False)
Alphabet.save(file_alphabet)
Alphabet = LoadAlphabet(file_alphabet)
print(f'CPU time compute Alphabet: {time.time() - start_time:.2f}')
Alphabet.printout()

Header=['ID', 'SMILES']
D=(188547, 2)
Number of smiles: 188547
... processing alphabet iteration: 0 size: 0 time: 0.0
... processing alphabet iteration: 1000 size: 6971 time: 14.480656623840332
... processing alphabet iteration: 2000 size: 11488 time: 14.227513074874878
... processing alphabet iteration: 3000 size: 15358 time: 14.457261800765991
... processing alphabet iteration: 4000 size: 18799 time: 14.592150211334229
... processing alphabet iteration: 5000 size: 21877 time: 14.126351118087769
... processing alphabet iteration: 6000 size: 24730 time: 14.046633243560791
... processing alphabet iteration: 7000 size: 27393 time: 14.13110613822937
... processing alphabet iteration: 8000 size: 29831 time: 14.23503589630127
... processing alphabet iteration: 9000 size: 32089 time: 14.090947151184082
... processing alphabet iteration: 10000 size: 34172 time: 14.189142227172852
... processing alphabet iteration: 11000 size: 36286 time: 14.23899245262146
... processing alphabet iteration: 12

... processing alphabet iteration: 105000 size: 126396 time: 14.299549579620361
... processing alphabet iteration: 106000 size: 126887 time: 14.472020387649536
... processing alphabet iteration: 107000 size: 127437 time: 14.776380062103271
... processing alphabet iteration: 108000 size: 128110 time: 14.877512216567993
... processing alphabet iteration: 109000 size: 128681 time: 14.616986751556396
... processing alphabet iteration: 110000 size: 129232 time: 14.892893075942993
... processing alphabet iteration: 111000 size: 129801 time: 14.53908896446228
... processing alphabet iteration: 112000 size: 130405 time: 14.243186473846436
... processing alphabet iteration: 113000 size: 130936 time: 14.486886262893677
... processing alphabet iteration: 114000 size: 131483 time: 14.547106981277466
... processing alphabet iteration: 115000 size: 132057 time: 14.539445400238037
... processing alphabet iteration: 116000 size: 132635 time: 14.75304388999939
... processing alphabet iteration: 117000 