In [66]:
from datasets import load_dataset
import selfies as sf
import numpy as np

dataset_name = 'liupf/ChEBI-20-MM'

dataset = load_dataset(dataset_name)
df_train = dataset['train'].to_pandas()
df_valid = dataset['validation'].to_pandas()
df_test = dataset['test'].to_pandas()

all_dataset = np.concatenate((df_train['SELFIES'].values, df_valid['SELFIES'].values, df_test['SELFIES'].values))
train_dataset = df_train['SELFIES'].values
valid_dataset = df_valid['SELFIES'].values
train_dataset[0:3]

array(['[C][C][C][C][C][C@@H1][O][C@@H1][Ring1][Ring1][/C][=C][/C][Branch1][C][O][C][/C][=C][\\C][/C][=C][\\C][C][C][C][=Branch1][C][=O][O-1]',
       '[125Te]',
       '[C][C][=Branch1][C][=O][O][C@@H1][C][C@H1][C][Branch1][C][C][Branch1][C][C][C][=Branch1][C][=O][C][=C][C@][Ring1][=Branch2][Branch1][C][C][C@H1][C][C][C@][Branch1][C][C][C][=Branch1][=N][=C][C][C@H1][Ring1][=Branch1][C][C][=C][O][C][=Ring1][Branch1][C@@][Ring1][#C][Ring2][Ring1][O][C]'],
      dtype=object)

In [58]:
alphabet = sf.get_alphabet_from_selfies(all_dataset)
alphabet.add("[nop]")  # [nop] is a special padding symbol
alphabet = list(sorted(alphabet))  
print(len(alphabet))

322


In [107]:
def multiple_selfies_to_hot(selfies_list, largest_molecule_len, alphabet):
    """Convert a list of selfies strings to a one-hot encoding
    """
    hot_list = []
    for s in selfies_list:
        _, onehot_encoded = selfies_to_hot(s, largest_molecule_len, alphabet)
        hot_list.append(onehot_encoded)
    return np.array(hot_list)

def selfies_to_hot(selfie, largest_selfie_len, alphabet):
    """Go from a single selfies string to a one-hot encoding.
    """
    symbol_to_int = dict((c, i) for i, c in enumerate(alphabet))

    # pad with [nop]
    selfie += '[nop]' * (largest_selfie_len - sf.len_selfies(selfie))

    # integer encode
    symbol_list = sf.split_selfies(selfie)
    try:
        integer_encoded = [symbol_to_int[symbol] for symbol in symbol_list]
    except (KeyError):
        integer_encoded = np.zeros(largest_selfie_len, dtype=int)

    # one hot-encode the integer encoded selfie
    onehot_encoded = list()
    for index in integer_encoded:
        letter = [0] * len(alphabet)
        letter[index] = 1
        onehot_encoded.append(letter)

    return integer_encoded, np.array(onehot_encoded)

In [108]:
train_dataset[16:20]

array(['[C][=C][Branch1][C][C][C][C][C][C@H1][Branch1][C][C][C][C][O][C][=Branch1][C][=O][C][C][=C][C][=C][C][=C][Ring1][=Branch1]',
       '[O].[O].[O].[O].[O].[O].[Cl-1].[Cl-1].[Ni+2]',
       '[C][C][C][C][C][C][C][C][C][O]',
       '[C][C][O][C][O][C][=C][C][=C][Branch1][#Branch2][O][S][Branch1][C][C][=Branch1][C][=O][=O][C][=C][Ring1][O][C][Ring1][=C][Branch1][C][C][C]'],
      dtype=object)

In [109]:
df_train.iloc[17]

CID                                                        24645
SMILES                            O.O.O.O.O.O.[Cl-].[Cl-].[Ni+2]
description    The molecule is a hydrate of nickel chloride c...
polararea                                                    6.0
xlogp                                                        NaN
inchi          InChI=1S/2ClH.Ni.6H2O/h2*1H;;6*1H2/q;;+2;;;;;;...
iupacname                      nickel(2+);dichloride;hexahydrate
SELFIES             [O].[O].[O].[O].[O].[O].[Cl-1].[Cl-1].[Ni+2]
Name: 17, dtype: object

In [None]:
largest_selfies_len = max(sf.len_selfies(s) for s in all_dataset)
selfies = multiple_selfies_to_hot(train_dataset, largest_selfies_len, alphabet)

In [111]:
selfies_reshaped = np.reshape(selfies, (len(selfies),-1))
len(selfies_reshaped[0])

325220

In [86]:
sf.encoder('O.O.O.O.O.O.[Cl-].[Cl-].[Ni+2]')

'[O].[O].[O].[O].[O].[O].[Cl-1].[Cl-1].[Ni+2]'

In [88]:
sf.decoder('[O].[O].[O].[O].[O].[O].[Cl-1].[Cl-1].[Ni+2]')

'O.O.O.O.O.O.[Cl-1].[Cl-1].[Ni+2]'

In [91]:
sf.de ('[O].[O].[O].[O].[O].[O].[Cl-1].[Cl-1].[Ni+2]')

'[OH0].[OH0].[OH0].[OH0].[OH0].[OH0].[Cl-1].[Cl-1].[Ni+2]'