In [1]:
from transformers import BertModel
model_name = 'bert-base-uncased'

model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

#### Summary

1. Embeddings: `BertEmbeddings` (class)
    - token embedding
    - position embedding
    - token type embedding (same as token type)
2. Encoder: `BertEncoder` (class) containts 12 layers of `BertLayer`
    - self-attention (kqv)
    - feed-forward
3. Pooler: `BertPooler` (class) only a dense layer


**What if a BertForSequenceClassification model?**

In [3]:
from transformers import BertForSequenceClassification

cls_model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
cls_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

- Adding a classification header (binary classification)


**Knowledge**

- Bert: encoder of transformer
    - Transformer: encoder-decoder model
    
![Transformer](transformers.png)

#### How many parameters?

why it can be called large language model?

In [10]:
# .numel(): num of elements, [2, 3] => 2 x 3 = 6
total_params = 0
total_learnable_params = 0
for name, param in model.named_parameters():
    print(name, '->', param.shape, '->', param.numel())
    if param.requires_grad:
        total_learnable_params += param.numel()
    total_params += param.numel()

embeddings.word_embeddings.weight -> torch.Size([30522, 768]) -> 23440896
embeddings.position_embeddings.weight -> torch.Size([512, 768]) -> 393216
embeddings.token_type_embeddings.weight -> torch.Size([2, 768]) -> 1536
embeddings.LayerNorm.weight -> torch.Size([768]) -> 768
embeddings.LayerNorm.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.self.query.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.self.query.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.self.key.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.self.key.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.self.value.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.self.value.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.output.dense.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.output.dense.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.output.LayerNorm.weight -> torch.Size([768])

In [11]:
print(total_params)
print(total_learnable_params)

109482240
109482240


In [14]:
total_params = 0
total_learnable_params = 0
total_embedding_params = 0
total_encoder_params = 0
total_pooler_params = 0
for name, param in model.named_parameters():
    # print(name, '->', param.shape, '->', param.numel())
    if 'embedding' in name:
        total_embedding_params += param.numel()
    if 'encoder' in name:
        total_encoder_params += param.numel()
    if 'pooler' in name:
        total_pooler += param.numel()
    if param.requires_grad:
        total_learnable_params += param.numel()
    total_params += param.numel()
    
print('embedding:', total_embedding_params)
print('encoder:', total_encoder_params)
print('pooler: ', total_pooler_params)

embedding 23837184
encoder 85054464
pooler 0
