In [1]:
import deepchem as dc
import numpy as np
from deepchem.feat import RobertaFeaturizer
# Small training set: ten molecules written as SMILES strings.
train_smiles = [
    'Cc1cccc(N2CCN(C(=O)C34CC5CC(CC(C5)C3)C4)CC2)c1C',
    'Cn1ccnc1SCC(=O)Nc1ccc(Oc2ccccc2)cc1',
    'COc1cc2c(cc1NC(=O)CN1C(=O)NC3(CCc4ccccc43)C1=O)oc1ccccc12',
    'O=C1/C(=C/NC2CCS(=O)(=O)C2)c2ccccc2C(=O)N1c1ccccc1',
    'NC(=O)NC(Cc1ccccc1)C(=O)O',
    'CCn1c(CSc2nccn2C)nc2cc(C(=O)O)ccc21',
    'CCc1cccc2c1NC(=O)C21C2C(=O)N(Cc3ccccc3)C(=O)C2C2CCCN21',
    'COc1ccc(C2C(C(=O)NCc3ccccc3)=C(C)N=C3N=CNN32)cc1OC',
    'CCCc1cc(=O)nc(SCC(=O)N(CC(C)C)C2CCS(=O)(=O)C2)[nH]1',
    'CCn1cnc2c1c(=O)n(CC(=O)Nc1cc(C)on1)c(=O)n2Cc1ccccc1',
]

# Vocabulary: every distinct character that occurs in any training string,
# in sorted order.
tokens = sorted(set().union(*train_smiles))

# Longest training string plus one extra position
# (presumably room for an end-of-sequence marker -- TODO confirm).
max_length = 1 + len(max(train_smiles, key=len))

# Build the autoencoder over that vocabulary and maximum length.
s = dc.models.seqtoseq.AspuruGuzikAutoEncoder(tokens, max_length)

def generate_sequences(smiles, epochs):
  """Yield (input, target) pairs in which each sequence is its own target.

  Parameters
  ----------
  smiles: iterable
    The sequences to emit. It is iterated once per epoch.
  epochs: int
    How many complete passes over `smiles` to make.

  Yields
  ------
  tuple
    `(seq, seq)` for every element of `smiles`, repeated `epochs` times.
  """
  for _ in range(epochs):
    yield from ((seq, seq) for seq in smiles)

# Train the autoencoder: 100 passes over the SMILES list, every string being
# used as both the input and the target.
s.fit_sequences(generate_sequences(train_smiles, 100))

# Test it out.
# Decode straight from the input sequences with beam widths 1 and 4 ...
pred1, pred4 = [
    s.predict_from_sequences(train_smiles, beam_width=width)
    for width in (1, 4)
]
# ... then compute the embeddings explicitly and decode from those instead.
embeddings = s.predict_embeddings(train_smiles)
pred1e, pred4e = [
    s.predict_from_embeddings(embeddings, beam_width=width)
    for width in (1, 4)
]

# Both decoding routes must agree for every training molecule.
for i in range(len(train_smiles)):
  for from_sequences, from_embeddings in ((pred1, pred1e), (pred4, pred4e)):
    assert from_sequences[i] == from_embeddings[i]

# NOTE: this test was originally decorated with `@pytest.mark.tensorflow`.
# pytest is not imported in this notebook, so evaluating the decorator would
# raise NameError; the marker is therefore omitted here.
def test_variational(self):
  """Test using a SeqToSeq model as a variational autoencoder.

  Builds a small variational SeqToSeq model over the integer tokens 0-9,
  trains it briefly on random sequences, and checks that decoding from an
  input sequence gives the same result as decoding from that sequence's
  embedding.

  Parameters
  ----------
  self: object
    Unused. Kept so the signature is unchanged for callers that pass an
    instance.

  Raises
  ------
  AssertionError
    If the two decoding routes disagree for any sampled sequence.
  """

  sequence_length = 10
  tokens = list(range(10))

  def _random_sequences(max_len, count):
    """Yield `count` (seq, seq) pairs of random tokens, 1..max_len long."""
    for _ in range(count):
      length = np.random.randint(1, max_len + 1)
      seq = [np.random.randint(len(tokens)) for _ in range(length)]
      yield (seq, seq)

  s = dc.models.SeqToSeq(
      tokens,
      tokens,
      sequence_length,
      encoder_layers=2,
      decoder_layers=2,
      embedding_dimension=128,
      learning_rate=0.01,
      variational=True)

  # Actually training a VAE takes far too long for a unit test.  Just run a
  # few steps of training to make sure nothing crashes, then check that the
  # results are at least internally consistent.

  # The module-level generate_sequences(smiles, epochs) iterates its first
  # argument, so it cannot be called with an integer length; use the local
  # random-sequence generator instead.
  s.fit_sequences(_random_sequences(sequence_length, 1000))
  for sequence, _ in _random_sequences(sequence_length, 10):
    pred1 = s.predict_from_sequences([sequence], beam_width=1)
    embedding = s.predict_embeddings([sequence])
    assert pred1 == s.predict_from_embeddings(embedding, beam_width=1)

  from .autonotebook import tqdm as notebook_tqdm
2022-09-25 21:58:58.095936: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-25 21:58:58.227695: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-25 21:58:58.746393: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-25 21:58:58.746494: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7

array([[list([12, 16, 25, 20, 15, 17, 22, 19, 18, 15, 21, 15, 17, 25, 15, 25, 21, 16, 18, 25, 17, 16, 18, 15, 20, 22, 19, 13]),
        list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])],
       [list([12, 16, 16, 17, 22, 19, 18, 23, 20, 16, 23, 17, 16, 17, 16, 18, 22, 19, 18, 16, 17, 19, 18, 16, 20, 19, 13]),
        list([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]],
      dtype=object)

In [None]:
# See deepchem/deepchem/feat/roberta_tokenizer.py