In [1]:
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub identifier of the checkpoint used throughout this notebook.
# Model card: https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
model_type = "emilyalsentzer/Bio_ClinicalBERT"

In [3]:
# Load the pretrained checkpoint named by `model_type`; AutoModel resolves it
# to the matching architecture (a BertModel, as the repr in the next cell shows).
model = AutoModel.from_pretrained(model_type)

In [4]:
# Display the module tree of the loaded model (embeddings, encoder layers, ...).
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [5]:
# Freeze the encoder: switch off gradient tracking on every weight tensor so
# no gradients are computed for the pretrained parameters.
for weight in model.parameters():
    weight.requires_grad_(False)

In [6]:
# Sanity check that every parameter is frozen. A single assertion is used
# instead of printing one flag per parameter tensor: the cell stays silent on
# success and, on failure, names exactly which parameters are still trainable.
still_trainable = [
    name for name, param in model.named_parameters() if param.requires_grad
]
assert not still_trainable, f"parameters still require grad: {still_trainable}"

In [7]:
# Show the configuration attached to the loaded model (sizes, dropout, vocab).
model.config

BertConfig {
  "_name_or_path": "emilyalsentzer/Bio_ClinicalBERT",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.37.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [8]:
# Width of each hidden-state vector; this is the last dimension of the
# model's output tensors.
model.config.hidden_size

768

In [9]:
# Load the tokenizer published under the same checkpoint name as the model,
# so token ids line up with the model's embedding table.
tokenizer = AutoTokenizer.from_pretrained(model_type)

In [10]:
# Inspect the tokenizer settings (vocabulary size, special tokens, padding side).
# NOTE(review): the repr reports an effectively unbounded model_max_length,
# while the model config lists max_position_embeddings=512 — long inputs
# probably need explicit truncation (e.g. max_length=512); confirm.
tokenizer

BertTokenizerFast(name_or_path='emilyalsentzer/Bio_ClinicalBERT', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [11]:
# Number of entries in the tokenizer vocabulary; compare with the model
# config's vocab_size.
len(tokenizer.get_vocab())

28996

In [12]:
# Mapping from special-token role (unk, sep, pad, cls, mask) to its literal string.
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [13]:
# Short sample sentence for the tokenization demo.
test = "Hello world!"

In [14]:
# Split the sample sentence into token strings only (no ids, no special tokens).
tokens = tokenizer.tokenize(test)

In [15]:
# Inspect the tokens. Note that "Hello" comes back lower-cased.
# NOTE(review): confirm whether lower-casing by the tokenizer is intended for
# this checkpoint (see the tokenizer's do_lower_case setting).
tokens

['hello', 'world', '!']

In [16]:
# Encode raw text into model inputs (input ids, token type ids, attention mask)

In [17]:
# Calling the tokenizer directly performs the full encoding; without
# return_tensors the fields come back as plain Python lists.
inputs = tokenizer("Hello world!")

In [18]:
# Inspect the encoding: the ids are wrapped in [CLS] (101) and [SEP] (102).
inputs

{'input_ids': [101, 19082, 1362, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

In [19]:
# Encode the same sentence again, this time as PyTorch tensors with a leading
# batch dimension. This rebinds `inputs`, replacing the list-based encoding.
inputs = tokenizer("Hello world!", return_tensors="pt")

In [20]:
# Inspect the tensor encoding; each field now has shape (1, 5).
inputs

{'input_ids': tensor([[  101, 19082,  1362,   106,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [21]:
# Forward pass: unpack the encoding so input_ids, token_type_ids and
# attention_mask are passed to the model as keyword arguments.
outputs = model(**inputs)

In [22]:
# Show the raw output object; it carries `last_hidden_state` and `pooler_output`.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1532,  0.6183, -0.2199,  ...,  0.1997,  0.6364, -0.4686],
         [-0.0503,  0.1174, -0.0838,  ...,  0.1973,  0.3145, -0.3396],
         [ 0.1379,  0.1863, -0.1400,  ...,  0.2266,  0.0506, -0.4840],
         [-0.1522,  0.0899, -0.1756,  ...,  0.3008,  0.0847, -0.2710],
         [ 0.6556,  1.1622,  0.3298,  ..., -0.0702,  1.3476, -0.7676]]]), pooler_output=tensor([[ 4.7645e-02,  1.1744e-02,  9.9956e-01, -1.0000e+00,  9.9996e-01,
          7.5687e-01, -1.7265e-01,  7.4451e-01, -1.4755e-01,  7.3705e-02,
          9.9993e-01,  1.0000e+00, -9.3152e-01, -9.9122e-01, -2.3026e-01,
         -9.1678e-01,  1.0000e+00, -7.6820e-03, -9.9997e-01,  1.0957e-01,
         -1.2149e-01, -9.9830e-01,  1.0776e-01,  9.9897e-01,  7.1062e-02,
          7.9029e-02,  1.0000e+00,  9.9997e-01, -3.5269e-02, -3.7762e-02,
          6.3934e-02, -9.9997e-01,  9.9904e-01, -9.9999e-01, -1.9132e-02,
          2.3128e-02, -2.4202e-01, -1.5514e-02,

In [23]:
# Shape of the final-layer hidden states.
#
# last_hidden_state: the hidden states of the model's last layer, i.e. one
# contextualized embedding per input token, with shape
# (batch_size, sequence_length, hidden_size). For this example that is a
# single sequence of 5 tokens ([CLS], three tokens, [SEP]) with a
# hidden size of 768.
#
# pooler_output: one pooled vector per sequence, typically used as a
# representation of the whole input. Its shape is (batch_size, hidden_size),
# so here it is a single vector of size 768.
outputs.last_hidden_state.shape

torch.Size([1, 5, 768])