In [35]:
from SELFIES.augm import Augmentation
import selfies as sf

import pandas as pd
import numpy as np

aliphatic_organic = ["B", "C", "N", "O", "S", "P", "F", "Cl", "Br", "I"]
aromatic_organic = ["b", "c", "n", "o", "s", "p"]
bracket = ["[", "]"]  # includes isotope, symbol, chiral, hcount, charge, class
bond = ["-", "=", "#", "$", "/", "\\", "."]
lrb = ["%"]  # long ring bonds '%TWODIGITS'
terminator = [" "]  # SPACE - start/end of selfies
wildcard = ["*"]
oov = ["oov"]  # out-of-vocabulary tokens




def get_tokens(smiles_array, split_l=1):
    tokenized_smiles_list = list()
    for ismiles in smiles_array.tolist():
        tokenized_smiles_tmp = smiles_tokenizer(ismiles)
        tokenized_smiles_list.append(
            [
                "".join(tokenized_smiles_tmp[i : i + split_l])
                for i in range(0, len(tokenized_smiles_tmp) - split_l + 1, 1)
            ]
        )
    return tokenized_smiles_list

def smiles_to_selfies(smiles):
    selfies = sf.encoder(smiles).split(sep="][")
    selfies[0] = selfies[0][1:]
    selfies[-1] = selfies[-1][:-1]
    selfies
    return selfies

In [41]:
def selfies_tokenizer(selfies):
    selfies = selfies.split(sep="][")
    selfies[0] = selfies[0][1:]
    selfies[-1] = selfies[-1][:-1]
    selfies.insert(0, " ")
    selfies.extend(" ")
    selfies = [f"[{s}]" for s in selfies]
    return selfies

def get_tokens(selfies_array):
    tokenized_selfies_list = list()
    for i_selfies in selfies_array:
        tokenized_selfies_tmp = selfies_tokenizer(i_selfies)
        tokenized_selfies_list.append(tokenized_selfies_tmp)
    return tokenized_selfies_list

def extract_vocab(enum_tokens):
    return set([itoken for i_selfies in enum_tokens for itoken in i_selfies])


def add_extra_tokens(tokens, vocab_size):
    tokens.insert(0, "[unk]")
    tokens.insert(0, "[pad]")
    vocab_size = vocab_size + 2
    return tokens, vocab_size

def get_tokentoint(tokens):
    return dict((c, i) for i, c in enumerate(tokens))

In [43]:

filename = "data/FreeSolv_SAMPL.csv"
prop = "expt"
data = pd.read_csv(filename)
data = data[["smiles", prop]]
smiles = data["smiles"].values

y = data[prop].values

canonical = False
rotation = True

enum_selfies, enum_card, enum_prop = Augmentation(
    smiles, y, canon=canonical, rotate=rotation
)

enum_tokens = get_tokens(selfies_array=enum_selfies)

tokens = list(extract_vocab(enum_tokens))
vocab_size = len(tokens)

tokens, vocab_size = add_extra_tokens(tokens, vocab_size)
max_length = np.max([len(i_selfies) for i_selfies in enum_tokens]) + 1
vocab_int_dict = get_tokentoint(tokens)
print(vocab_int_dict)
int_selfies_array = np.zeros((len(enum_tokens), max_length), dtype=np.int32)

for idx, i_selfies in enumerate(enum_tokens):
    i_selfies_tmp = list()
    if len(i_selfies) <= max_length:
        i_selfies_tmp = ["[pad]"] * (max_length - len(i_selfies)) + i_selfies  # Force output vectors to have same length
    else:
        i_selfies_tmp = i_selfies[-max_length:]  # longer vectors are truncated (to be changed...)
    integer_encoded = [vocab_int_dict[itoken] if (itoken in tokens) else vocab_int_dict["[unk]"] for itoken in i_selfies_tmp]
    int_selfies_array[idx] = integer_encoded
print(int_selfies_array)

{'[pad]': 0, '[unk]': 1, '[O-1]': 2, '[C@@H1]': 3, '[ ]': 4, '[\\Cl]': 5, '[\\O]': 6, '[\\C]': 7, '[/Cl]': 8, '[#C]': 9, '[=Branch1]': 10, '[=S]': 11, '[#Branch1]': 12, '[/C]': 13, '[I]': 14, '[#N]': 15, '[O]': 16, '[=O]': 17, '[S+2]': 18, '[P]': 19, '[=Branch2]': 20, '[=Ring1]': 21, '[Cl]': 22, '[Ring1]': 23, '[F]': 24, '[=Ring2]': 25, '[=N]': 26, '[Br]': 27, '[N]': 28, '[=N+1]': 29, '[#Branch2]': 30, '[S]': 31, '[/N]': 32, '[NH1]': 33, '[C@@]': 34, '[N+1]': 35, '[C]': 36, '[Ring2]': 37, '[Branch1]': 38, '[C@H1]': 39, '[Branch2]': 40, '[=P]': 41, '[=C]': 42, '[C@]': 43}
[[ 0  0  0 ... 23 16  4]
 [ 0  0  0 ... 16 36  4]
 [ 0  0  0 ... 23 26  4]
 ...
 [ 0  0  0 ... 23 10  4]
 [ 0  0  0 ... 23 10  4]
 [ 0  0  0 ... 23 10  4]]


In [36]:
import numpy as np

enum_tokens = token.get_tokens(smiles_array=enum)
print(len(enum_tokens))
print(enum_tokens[0])

print("集合にして語彙リストを作成")
tokens = token.extract_vocab(enum_tokens)
vocab_size = len(tokens)
print(tokens)
print("tokensの数:", vocab_size)


token.save_vocab(tokens, "test.txt")
tokens = token.get_vocab("test.txt")
print("リスト化")
print(tokens)
print(len(tokens))

print("paddingとunkを追加")
tokens, vocab_size = token.add_extra_tokens(tokens, vocab_size)
print(tokens)
print(vocab_size)

print("トークン化されたSMILESの最大長")
# +1は[unk]を含めるため
max_length = np.max([len(ismiles) for ismiles in enum_tokens]) + 1
print(max_length)

# int_vec_encodeの中身
print("辞書化")
vocab_int_dict = token.get_tokentoint(tokens)
print(vocab_int_dict)

int_smiles_array = np.zeros((len(enum_tokens), max_length), dtype=np.int32)

for csmiles, ismiles in enumerate(enum_tokens):
    ismiles_tmp = list()
    if len(ismiles) <= max_length:
        ismiles_tmp = ["pad"] * (
            max_length - len(ismiles)
        ) + ismiles  # Force output vectors to have same length
    else:
        ismiles_tmp = ismiles[
            -max_length:
        ]  # longer vectors are truncated (to be changed...)
    integer_encoded = [
        vocab_int_dict[itoken] if (itoken in tokens) else vocab_int_dict["unk"]
        for itoken in ismiles_tmp
    ]
    int_smiles_array[csmiles] = integer_encoded
print(int_smiles_array)

5355
[' ', 'C', 'O', 'c', '1', 'c', 'c', 'c', '(', 'C', '(', '=', 'O', ')', 'N', '(', 'C', ')', 'C', ')', 'c', 'c', '1', ' ']
集合にして語彙リストを作成
{'5', ')', '[C@@H]', 'O', 'Br', 'c', '2', 'S', '[S+2]', 'N', '[C@H]', 'F', '-', '#', 'I', '(', 's', '4', 'P', '\\', 'Cl', '/', '[N+]', 'C', '=', '3', ' ', 'n', '[C@@]', '[nH]', '[C@]', '1', '[O-]'}
tokensの数: 33
リスト化
['5', ')', '[C@@H]', 'O', 'Br', 'c', '2', 'S', '[S+2]', 'N', '[C@H]', 'F', '-', '#', 'I', '(', 's', '4', 'P', '\\', 'Cl', '/', '[N+]', 'C', '=', '3', ' ', 'n', '[C@@]', '[nH]', '[C@]', '1', '[O-]']
33
paddingとunkを追加
['pad', 'unk', '5', ')', '[C@@H]', 'O', 'Br', 'c', '2', 'S', '[S+2]', 'N', '[C@H]', 'F', '-', '#', 'I', '(', 's', '4', 'P', '\\', 'Cl', '/', '[N+]', 'C', '=', '3', ' ', 'n', '[C@@]', '[nH]', '[C@]', '1', '[O-]']
35
トークン化されたSMILESの最大長
51
辞書化
{'pad': 0, 'unk': 1, '5': 2, ')': 3, '[C@@H]': 4, 'O': 5, 'Br': 6, 'c': 7, '2': 8, 'S': 9, '[S+2]': 10, 'N': 11, '[C@H]': 12, 'F': 13, '-': 14, '#': 15, 'I': 16, '(': 17, 's': 18, '4': 19

In [33]:
print(len(int_smiles_array))

5355
