In [75]:

import os
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
from tqdm.auto import tqdm
from time import sleep


In [3]:
model = AutoModel.from_pretrained("DeepChem/ChemBERTa-77M-MLM")
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MLM")

Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MLM were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be 

Downloading:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420 [00:00<?, ?B/s]

In [4]:
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(600, 384, padding_idx=1)
    (position_embeddings): Embedding(515, 384, padding_idx=1)
    (token_type_embeddings): Embedding(1, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.144, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.109, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dr

In [66]:
smiles = [
    "[NH]C(CC(C)C([N@@](C(C)(C)C)C(N)(C)N)(C)C)c1c(c(c[nH+][o+]1)C)[O-]",
    "[NH]C(CC(C)C((C(C)(C)C)CN)C)c1c(c(c[nH+][o+]1)C)[O-]"
]

In [69]:
tokenized_instances = tokenizer(smiles, add_special_tokens=True, return_tensors='pt', padding=True)
tokenized_instances

{'input_ids': tensor([[12, 23, 16, 17, 16, 16, 17, 16, 18, 16, 17, 23, 17, 16, 17, 16, 18, 17,
         16, 18, 16, 18, 16, 17, 23, 18, 17, 16, 18, 23, 18, 17, 16, 18, 16, 18,
         15, 20, 15, 17, 15, 17, 15, 25, 44, 20, 18, 16, 18, 19, 31, 13],
        [12, 23, 16, 17, 16, 16, 17, 16, 18, 16, 17, 17, 16, 17, 16, 18, 17, 16,
         18, 16, 18, 16, 23, 18, 16, 18, 15, 20, 15, 17, 15, 17, 15, 25, 44, 20,
         18, 16, 18, 19, 31, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]])}

In [70]:
hidden = model(**tokenized_instances)

In [71]:
hidden

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.4319, -0.1712, -0.0734,  ...,  0.0032,  0.1178, -0.5129],
         [ 0.0346,  0.3587, -0.0228,  ...,  0.0444, -0.1280,  0.0934],
         [ 0.2966, -0.1893,  0.0102,  ...,  0.1391, -0.1832, -0.3893],
         ...,
         [ 0.1698,  0.0009,  0.2729,  ..., -0.1011,  0.1077, -0.6349],
         [ 0.2468,  0.2819,  0.3447,  ..., -0.4136, -0.3032, -0.1212],
         [ 0.5281,  0.0718,  0.3820,  ..., -0.2419, -0.4423, -0.6131]],

        [[ 0.5213, -0.2079, -0.3280,  ..., -0.2234,  0.0519, -0.3463],
         [ 0.1170,  0.1991, -0.1065,  ...,  0.0228, -0.2716,  0.2028],
         [ 0.2878, -0.1959, -0.1298,  ..., -0.0672, -0.2435, -0.2075],
         ...,
         [ 0.3466,  0.0130,  0.0283,  ..., -0.4068, -0.7194, -0.3947],
         [-0.0550,  0.1545,  0.0393,  ..., -0.3279, -0.4374, -0.3352],
         [ 0.0027, -0.0154, -0.2576,  ..., -0.3913, -0.2815, -0.6187]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_ou

In [72]:
hidden.pooler_output.shape

torch.Size([2, 384])

In [73]:
hidden.last_hidden_state.shape

torch.Size([2, 52, 384])