In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch

In [48]:
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")


Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:
model.save_pretrained("./my-bert-model")
tokenizer.save_pretrained("./my-bert-model")

('./my-bert-model/tokenizer_config.json',
 './my-bert-model/special_tokens_map.json',
 './my-bert-model/vocab.txt',
 './my-bert-model/added_tokens.json',
 './my-bert-model/tokenizer.json')

In [50]:
! ls -lth ./my-bert-model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 876288
-rw-r--r--@ 1 muhsinmohammed  1028238298   695K Jun 27 12:51 tokenizer.json
-rw-r--r--@ 1 muhsinmohammed  1028238298   226K Jun 27 12:51 vocab.txt
-rw-r--r--@ 1 muhsinmohammed  1028238298   125B Jun 27 12:51 special_tokens_map.json
-rw-r--r--@ 1 muhsinmohammed  1028238298   1.2K Jun 27 12:51 tokenizer_config.json
-rw-r--r--@ 1 muhsinmohammed  1028238298   418M Jun 27 12:51 model.safetensors
-rw-r--r--@ 1 muhsinmohammed  1028238298    90B Jun 27 12:51 generation_config.json
-rw-r--r--@ 1 muhsinmohammed  1028238298   676B Jun 27 12:51 config.json


In [51]:
local_model = AutoModelForMaskedLM.from_pretrained("./my-bert-model")
local_tokenizer = AutoTokenizer.from_pretrained("./my-bert-model")

In [6]:

print(f"Model size: {model.num_parameters():,} parameters")
print(f"Hidden size: {model.config.hidden_size}")
print(f"Number of layers: {model.config.num_hidden_layers}")
print(f"Number of attention heads: {model.config.num_attention_heads}")
print(f"Vocabulary size: {model.config.vocab_size}")


print(model.config)

Model size: 109,514,298 parameters
Hidden size: 768
Number of layers: 12
Number of attention heads: 12
Vocabulary size: 30522
BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [7]:
text = "Airbus A330 is a [MASK] aircraft."
inputs = local_tokenizer(text, return_tensors="pt")
# print(inputs)
print('Input shape:', inputs['input_ids'].shape)

with torch.no_grad():
    outputs = local_model(**inputs)
    logits = outputs.logits

# logits.shape

mask_token_index = (inputs.input_ids == local_tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]

print("Mask token index:", mask_token_index)

mask_logits = logits[0, mask_token_index, :]

print("Mask logits:", mask_logits)

top_token = torch.topk(mask_logits, k=1, dim=1).indices[0].tolist()
print(top_token)
word = local_tokenizer.decode(top_token)
print(f"Predicted word: {word}")


Input shape: torch.Size([1, 11])
Mask token index: tensor([7])
Mask logits: tensor([[-2.6876, -3.2343, -3.1478,  ..., -2.5129, -3.2148, -5.6780]])
[4628]
Predicted word: passenger


In [None]:
type(local_model) ## transformers.models.bert.modeling_bert.BertForMaskedLM

transformers.models.bert.modeling_bert.BertForMaskedLM

In [14]:
base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [23]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained("google-bert/bert-base-uncased")
config_dict = config.to_dict()
config_dict

{'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'chunk_size_feed_forward': 0,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'encoder_no_repeat_ngram_size': 0,
 'bad_words_ids': None,
 'num_return_sequences': 1,
 'output_scores': False,
 'return_dict_in_generate': False,
 'forced_bos_token_id': None,
 'forced_eos_token_id': None,
 'remove_invalid_values': False,
 'exponential_decay_length_penalty': None,
 'su

In [62]:
base_model.encoder

BertEncoder(
  (layer): ModuleList(
    (0-11): 12 x BertLayer(
      (attention): BertAttention(
        (self): BertSdpaSelfAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (output): BertSelfOutput(
          (dense): Linear(in_features=768, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (intermediate): BertIntermediate(
        (dense): Linear(in_features=768, out_features=3072, bias=True)
        (intermediate_act_fn): GELUActivation()
      )
      (output): BertOutput(
        (dense): Linear(in_features=3072, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [73]:
base_model.encoder.layer[0].attention

BertAttention(
  (self): BertSdpaSelfAttention(
    (query): Linear(in_features=768, out_features=768, bias=True)
    (key): Linear(in_features=768, out_features=768, bias=True)
    (value): Linear(in_features=768, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (output): BertSelfOutput(
    (dense): Linear(in_features=768, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)