In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import json
import re

class SmilesTokenizer:
    def __init__(self, lookup_filename, max_length=73):
        self.lookup_table = self._load_lookup_table(lookup_filename)
        self.max_length = max_length

    def _load_lookup_table(self, filename):
        with open(filename, 'r') as f:
            return json.load(f)

    def _pad_tokens_to_max_length(self, tokens):
        padding_id = self.lookup_table['<PAD>']
        sos_id = self.lookup_table['<SOS>']
        eos_id = self.lookup_table['<EOS>']
        return [sos_id] + tokens + [eos_id] + [padding_id] * (self.max_length - len(tokens) - 2)

    def _extract_atoms_from_smiles(self, smi):
        pattern =  "(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\\\|\/|:|~|@|\?|>|\*|\$|\%[0-9]{2}|[0-9])"
        regex = re.compile(pattern)
        return [token for token in regex.findall(smi)]

    def tokenize(self, smiles_string):
        tokens = [self.lookup_table[atom] for atom in self._extract_atoms_from_smiles(smiles_string)]
        padded_tokens = self._pad_tokens_to_max_length(tokens)
        return np.array(padded_tokens)

tokenizer = SmilesTokenizer('./lookup_table.json')

trained_encoder_model = tf.keras.models.load_model('./models/encoder_model.h5')

2025-03-15 09:59:15.007628: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-15 09:59:15.817926: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-03-15 09:59:15.818025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
2025-03-15 09:59:16.679180: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libc



In [18]:
# test
cddd_rep = trained_encoder_model.predict(tokenizer.tokenize('CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1').reshape(1,-1))

print('CDDD representation:')
print(cddd_rep.shape)
cddd_rep

CDDD representation:
(1, 512)


array([[ 1.        ,  1.        , -0.99998516, -1.        , -1.        ,
         1.        ,  1.        ,  1.        , -1.        , -1.        ,
         1.        , -1.        , -1.        ,  1.        ,  1.        ,
        -1.        , -1.        ,  1.        , -1.        ,  0.999995  ,
         1.        ,  1.        ,  1.        ,  1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        , -1.        ,
         0.9999974 , -1.        , -1.        ,  1.        ,  1.        ,
        -1.        , -1.        , -1.        ,  1.        ,  1.        ,
         1.        ,  0.9999982 , -1.        ,  0.9998153 ,  1.        ,
        -1.        , -1.        ,  1.        , -1.        , -1.        ,
        -1.        , -1.        , -1.        , -1.        ,  1.        ,
        -1.        ,  1.        , -1.        , -1.        ,  1.        ,
         1.        , -1.        , -1.        , -1.        ,  1.        ,
        -1.        ,  1.        ,  1.        , -1. 

In [3]:
df = pd.read_csv('./data/all_chembl.smi.zst', header=None, names=['smiles'])
print(df.info())
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698595 entries, 0 to 698594
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   smiles  698595 non-null  object
dtypes: object(1)
memory usage: 5.3+ MB
None


Unnamed: 0,smiles
0,CC1(C)CCC(C)(C)c2cc(-c3cccc(-c4ccc(C(=O)O)cc4)n3)ccc21
1,Cc1cc2c(cc1C1=NOC(c3ccc(C(=O)O)cc3)C1)C(C)(C)CCC2(C)C
2,Cc1ccc(-c2ccc(C(=O)O)cc2)cc1-c1ccc2c(c1)C(C)(C)CCC2(C)C
3,Cc1cc2c(cc1-c1ncc(-c3ccc(C(=O)O)cc3)s1)C(C)(C)CCC2(C)C
4,Cc1ccc(-c2ccc(C(=O)O)cc2)cc1-c1cc2c(cc1C)C(C)(C)CCC2(C)C
5,Cc1cc2c(cc1-c1cccc(-c3ccc(C(=O)O)cc3)n1)C(C)(C)CCC2(C)C
6,CC1(C)CCC(C)(C)c2cc(-c3cccc(-c4ccc(C(=O)O)cc4)c3)ccc21
7,CC1(C)CCC(C)(C)c2cc(-c3ccc4cc(C(=O)O)ccc4c3)ccc21
8,CC(=Cc1ccc(C(=O)O)cc1)c1ccc2c(c1)C(C)(C)CCC2(C)C
9,CC(C)=C(c1ccc(C(=O)O)cc1)c1ccc2c(c1)C(C)(C)CCC2(C)C


In [4]:
from tqdm import tqdm

for smi in tqdm(df.smiles[:10000]):
    try:
        tokenizer.tokenize(smi)
    except Exception as e:
        print(e)
        print(smi)

  0%|          | 0/10000 [00:00<?, ?it/s]

'[As]'
Nc1cccc2c([As](=O)(O)O)cccc12
'[As]'
Nc1cccc2ccc([As](=O)(O)O)cc12
'[s+]'
CN(C)c1ccc2nc3ccc(N(C)C)cc3[s+]c2c1
'B'
CCOC(=O)C(=Cc1ccc(B(O)O)cc1)C(=O)c1sc(Nc2ccc(Cl)cc2)nc1N(C)C
'[As]'
Nc1nc(N)nc(Nc2ccc([As]3SCC(CO)S3)cc2)n1
'8'
C=CC[N+]12CCC34c5ccccc5N5C=C6C7CC8C9(CC[N+]8(CC=C)CC7=CCO)c7ccccc7N(C=C(C(CC31)C(=CCO)C2)C54)C69


100%|██████████| 10000/10000 [00:00<00:00, 62885.00it/s]

'8'
COc1ccc(C[N+]23CCC45c6ccccc6N6C7OCC=C8C[N+]9(Cc%10ccc(OC)c(OC)c%10)CCC%10%11c%12ccccc%12N(C%12OCC=C(C2)C(CC43)C%12C65)C%10C7C8CC%119)cc1OC
'8'
C#CC[N+]12CCC34c5ccccc5N5C6OCC=C7C[N+]8(CC#C)CCC9%10c%11ccccc%11N(C%11OCC=C(C1)C(CC32)C%11C54)C9C6C7CC%108
'8'
O=C1c2ccccc2C(=O)N1CCC[N+]12CCC34c5ccccc5N5C6OCC=C7C[N+]8(CCCN9C(=O)c%10ccccc%10C9=O)CCC9%10c%11ccccc%11N(C%11OCC=C(C1)C(CC32)C%11C54)C9C6C7CC%108
'8'
C1=C2CN3CCC45c6ccccc6N6C7OCC=C8CN9CCC%10%11c%12ccccc%12N(C(OC1)C(C2CC34)C65)C%10C7C8CC9%11
'8'
Brc1ccc(C[N+]23CCC45c6ccccc6N6C7OCC=C8C[N+]9(Cc%10ccc(Br)cc%10)CCC%10%11c%12ccccc%12N(C%12OCC=C(C2)C(CC43)C%12C65)C%10C7C8CC%119)cc1
'8'
CCC[N+]12CCC34c5ccccc5N5C6OCC=C7C[N+]8(CCC)CCC9%10c%11ccccc%11N(C%11OCC=C(C1)C(CC32)C%11C54)C9C6C7CC%108
'8'
C1=C2C[N+]3(Cc4ccccc4)CCC45c6ccccc6N6C7OCC=C8C[N+]9(Cc%10ccccc%10)CCC%10%11c%12ccccc%12N(C(OC1)C(C2CC43)C65)C%10C7C8CC%119
'8'
FC(F)(F)c1ccc(C[N+]23CCC45c6ccccc6N6C7OCC=C8C[N+]9(Cc%10ccc(C(F)(F)F)cc%10)CCC%10%11c%12ccccc%12N(C%12OCC=C(C2)C(CC43)C%12C




In [5]:
def tokenize(smi):
    try:
        token = tokenizer.tokenize(smi).reshape(1, -1)
        if token.shape[1] > 73:
            return None
        return token
    except:
        return None

In [6]:
df['tokens'] = df.smiles.apply(tokenize)

In [7]:
df.dropna(subset='tokens', inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 630201 entries, 0 to 698593
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   smiles  630201 non-null  object
 1   tokens  630201 non-null  object
dtypes: object(2)
memory usage: 14.4+ MB


In [8]:
if hasattr(tqdm, '_instances'):
    tqdm._instances.clear()

In [10]:
res = trained_encoder_model.predict(np.vstack(df.tokens[:100]), verbose=0)
res.shape

(100, 512)

In [15]:
def chunked(series, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(series), n):
        yield series[i:i + n]

results = []
for chunk in tqdm(chunked(df.tokens, 100), total=len(df) // 100):
    results.append(trained_encoder_model.predict(np.vstack(chunk), verbose=0))

results = np.vstack(results)
results.shape

  0%|          | 10/63020 [00:03<6:29:07,  2.70it/s]


(100, 512)

In [16]:
results

array([[ 1.        ,  1.        , -0.9999833 , ..., -1.        ,
         1.        , -1.        ],
       [ 1.        ,  1.        , -0.9999853 , ..., -1.        ,
         1.        , -1.        ],
       [ 1.        ,  1.        , -0.9999854 , ..., -1.        ,
         1.        , -1.        ],
       ...,
       [ 1.        ,  1.        , -0.99998516, ..., -1.        ,
         1.        , -1.        ],
       [ 1.        ,  1.        , -0.9999853 , ..., -1.        ,
         1.        , -1.        ],
       [ 1.        ,  1.        , -0.9999853 , ..., -1.        ,
         1.        , -1.        ]], dtype=float32)

In [182]:
df.loc['cddd'] = None
df.loc[:9, 'cddd'] = results

In [183]:
df.loc[:9, 'cddd'].apply(np.squeeze)

0      [1.0, 1.0, -0.99998325, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 0.99999505, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.9999973, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 0.99999803, -1.0, 0.9998113, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 0.9999995, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, ...]
1       [1.0, 1.0, -0.9999854, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 0.99999493, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.9999975, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 0.99999833, -1.0, 0.9998176, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0,

In [180]:
df['cddd']

0         [[1.0, 1.0, -0.99998325, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 0.99999505, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.9999973, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 0.99999803, -1.0, 0.9998113, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 0.9999995, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, ...]]
1          [[1.0, 1.0, -0.9999854, -1.0, -1.0, 1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, 1.0, -1.0, 0.99999493, 1.0, 1.0, 1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.9999975, -1.0, -1.0, 1.0, 1.0, -1.0, -1.0, -1.0, 1.0, 1.0, 1.0, 0.99999833, -1.0, 0.9998176, 1.0, -1.0, -1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 1.0, -1.0, 1.0, -1.0, -1.0, 1.0, 1.0, -1