In [None]:
import re

# Compiled once when the cell runs instead of on every call. The pattern is a
# raw string so regex escapes such as \[ \( \. \% are not interpreted (and
# warned about) as invalid Python string escapes; the resulting regex is the
# same one the non-raw literal produced.
_SMI_TOKEN_REGEX = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
)


def smi_tokenizer(smi):
    """
    Tokenize a SMILES molecule or reaction.

    Args:
        smi: SMILES string to split into tokens (bracket atoms, Br/Cl,
            single-letter atoms, bonds, ring-closure digits, ``%NN``, ``>`` ...).

    Returns:
        The tokens joined by single spaces; an empty string for empty input.

    Raises:
        AssertionError: if the tokens do not concatenate back to ``smi``,
            i.e. the input contains characters the pattern does not cover.
    """
    tokens = _SMI_TOKEN_REGEX.findall(smi)
    # Round-trip check: any character skipped by the regex makes this fail.
    assert smi == ''.join(tokens), f"Smiles: {smi}, Tokens: {''.join(tokens)}"
    return ' '.join(tokens)

In [None]:

# Tokenize every map-free split file and save the space-separated tokens
# (one reaction per line) under the split/ directory.
for file in ('test_src', 'test_tgt', 'train_src', 'train_tgt', 'valid_src', 'valid_tgt'):
    with open(f'/baldig/chemistry/2023_rp/mirana_data/no_maps/{file}.txt', 'r') as f:
        reactions = list(map(smi_tokenizer, f.read().splitlines()))

    tokenized_text = '\n'.join(reactions)
    with open(f'/baldig/chemistry/2023_rp/mirana_data/split/{file}.txt', 'w+') as f:
        f.write(tokenized_text)

In [1]:
from rdkit import Chem
from tqdm import tqdm
import numpy as np
import re

# File format (one reaction per line):
# [src]>>[tgt] [electron pushing arrows]
with open('../data/new_mayr_data_09-19-23/mayr_nomaps.csv') as f:
    # Drop line terminators and skip blank lines so a trailing empty line does
    # not produce a ragged row that breaks the transpose below.
    reactions = [re.split(' |>>', line) for line in f.read().splitlines() if line.strip()]

# Every row must split into exactly (src, tgt, arrows); fail with a clear
# message instead of an opaque unpacking error.
bad_rows = [i for i, fields in enumerate(reactions) if len(fields) != 3]
assert reactions, "no reactions were read from the input file"
assert not bad_rows, f"{len(bad_rows)} malformed row(s), first at non-blank row {bad_rows[0]}"

# The electron-pushing arrows column is not used here.
src, tgt, _ = np.transpose(reactions)

num_reactions = len(src)
assert len(src) == len(tgt)

train_pct, val_pct, test_pct = 0.8, 0.1, 0.1
# Tolerant comparison: exact float equality fails for valid fractions such as
# 0.7 + 0.2 + 0.1.
assert np.isclose(train_pct + val_pct + test_pct, 1.0)

# Boundaries of the train / val / test slices in the shuffled arrays.
split1 = int(train_pct * num_reactions)
split2 = int((train_pct + val_pct) * num_reactions)

# Seeded generator so the shuffle (and therefore the split membership) is
# reproducible across kernel restarts.
SEED = 42
rng = np.random.default_rng(SEED)
indices = rng.permutation(num_reactions)
# The same permutation is applied to both arrays to keep src/tgt pairs aligned.
src = src[indices]
tgt = tgt[indices]

datasets = {
    'src-train': src[:split1],
    'tgt-train': tgt[:split1],
    'src-val': src[split1:split2],
    'tgt-val': tgt[split1:split2],
    'src-test': src[split2:],
    'tgt-test': tgt[split2:],
}

print(split1, split2)

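# Write split datasets (currently disabled: the code below is commented out).
# NOTE: the `augm` line calls Chem.MolFromSmiles() without a SMILES argument; fix it before re-enabling.
#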
# for title, reactions in datasets.items():
#     augm = Chem.MolToSmiles(Chem.MolFromSmiles())
#     reactions = np.append(reactions, [Chem.MolToSmiles(Chem.MolFromSmiles(smile), doRandom=True) for smile in tqdm(reactions)])
#     print(reactions)
#
#     with open(f'../data/new_mayr_data_09-19-23/preprocessed/{title}.csv', 'w+') as f:
#         f.write('\n'.join(smi_tokenizer(smile) for smile in reactions))

76710 86299


In [None]:
from rdkit import Chem
from rdkit.Chem import Draw, AllChem

# Show the source/target pair stored at a fixed index of the shuffled arrays:
# print both SMILES, then draw the two molecules side by side.
n = 1000
print(src[n], tgt[n], sep='\n')

src_m, tgt_m = (Chem.MolFromSmiles(smiles) for smiles in (src[n], tgt[n]))
Draw.MolsToGridImage((src_m, tgt_m))


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem

def remove_maps(smi):
    """
    Remove atom-map numbers from a SMILES string.

    Spaces (e.g. from tokenized input) and surrounding whitespace are dropped,
    the string is parsed with ``sanitize=False``, every atom's map number is
    set to 0, and the molecule is written back with ``Chem.MolToSmiles``.

    Args:
        smi: SMILES string, optionally space-separated and/or newline-terminated.

    Returns:
        The SMILES produced by ``Chem.MolToSmiles`` after the map numbers are reset.

    Raises:
        ValueError: if RDKit cannot parse the SMILES (``MolFromSmiles`` returns None).
    """
    # strip() also removes a trailing newline left over from readlines().
    smi = smi.replace(' ', '').strip()
    mol = Chem.MolFromSmiles(smi, sanitize=False)

    if mol is None:
        # Fail fast with the offending SMILES instead of continuing and hitting
        # an AttributeError on None in the loop below.
        raise ValueError(f'Smile Error:\n{smi}')

    # Plain loop: the calls are made for their side effect only.
    for atom in mol.GetAtoms():
        atom.SetAtomMapNum(0)
    return Chem.MolToSmiles(mol)


In [None]:
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import AtomValenceException

def process_file(input_file, output_file):
    """
    Strip atom maps from every SMILES line of ``input_file`` and write the
    results to ``output_file``.

    Args:
        input_file: path of a text file with one SMILES per line.
        output_file: path to write; receives one processed SMILES per line,
            joined with newlines (no trailing newline is written).
    """
    with open(input_file, 'r') as infile:
        # Drop line terminators so remove_maps gets the bare SMILES text rather
        # than a newline-terminated string.
        lines = [line.rstrip('\r\n') for line in infile]

    # remove_maps already removes spaces, so no extra clean-up is needed here.
    std_smi = [remove_maps(line) for line in lines]

    # Write-only access is sufficient; the file is never read back.
    with open(output_file, 'w') as outfile:
        outfile.write('\n'.join(std_smi))


# Run process_file over every train/valid/test source and target file,
# writing the results into the no_maps/ directory.
split_files = ('test_src', 'test_tgt', 'train_src', 'train_tgt', 'valid_src', 'valid_tgt')
for file in split_files:
    mapped_path = f'/baldig/chemistry/2023_rp/mirana_data/{file}.txt'
    unmapped_path = f'/baldig/chemistry/2023_rp/mirana_data/no_maps/{file}.txt'
    process_file(mapped_path, unmapped_path)

In [None]:
# Count the lines of the model output file and print the count together with
# its remainder modulo 5.
with open('/baldig/chemistry/2023_rp/OpenNMT-py/models/mayr_20000_5.txt', 'rb') as f:
    num_lines = sum(1 for _ in f)
print(num_lines, num_lines % 5)