In [1]:
# imports

from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# vanilla Bert-base model
# Loads the pretrained 'bert-base-uncased' checkpoint into a BertModel. On a
# fresh environment this fetches config.json and model.safetensors from the
# Hugging Face Hub; the HF_TOKEN warning it may print is informational, since
# authentication is optional for public models.

model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# Collect every (name, tensor) pair the model exposes into a list so it can be
# counted and sliced by position

named_params = list(model.named_parameters())

print(f'The BERT model has {len(named_params)} different named parameters.\n')

# (heading, entries) pairs: the first 5 entries are reported as the embedding
# layer, the following 16 as the first encoder layer, and the final 2 as the
# output layer
sections = (
    ('------Embedding Layer--------', named_params[0:5]),
    ('------First Encoder Layer--------', named_params[5:21]),
    ('------Output Layer--------', named_params[-2:]),
)

for heading, entries in sections:
    print(heading)
    for name, tensor in entries:
        print(f'{name} has shape {tensor.shape}')

The BERT model has 199 different named parameters.

------Embedding Layer--------
embeddings.word_embeddings.weight has shape torch.Size([30522, 768])
embeddings.position_embeddings.weight has shape torch.Size([512, 768])
embeddings.token_type_embeddings.weight has shape torch.Size([2, 768])
embeddings.LayerNorm.weight has shape torch.Size([768])
embeddings.LayerNorm.bias has shape torch.Size([768])
------First Encoder Layer--------
encoder.layer.0.attention.self.query.weight has shape torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias has shape torch.Size([768])
encoder.layer.0.attention.self.key.weight has shape torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias has shape torch.Size([768])
encoder.layer.0.attention.self.value.weight has shape torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias has shape torch.Size([768])
encoder.layer.0.attention.output.dense.weight has shape torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias has

In [4]:
# load the bert-base uncased tokenizer
# Uses the same checkpoint name as the model above. On a fresh environment this
# fetches tokenizer_config.json, vocab.txt and tokenizer.json from the Hub.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Encode a sample sentence into a list of token ids. The five words come back
# as ten ids, starting with 101 and ending with 102 -- presumably the special
# start/end markers, with the unfamiliar name split into several sub-word
# pieces (TODO confirm with tokenizer.convert_ids_to_tokens).
tokenizer.encode('Ayantika loves Berlin a lot')

[101, 1037, 7054, 3775, 2912, 7459, 4068, 1037, 2843, 102]

In [6]:
# run tokens through the model
# 1 encode the sentence into token ids and wrap them in a tensor of shape (10,)
# 2 unsqueeze the first dimension to simulate a batch, resulting in shape (1, 10)
# 3 run the forward pass under torch.no_grad(): the notebook only inspects the
#   outputs (no backward pass), so there is no reason to build and keep an
#   autograd graph for them

token_ids = torch.tensor(tokenizer.encode('Ayantika loves Berlin a lot')).unsqueeze(0)

with torch.no_grad():
    response = model(token_ids)

In [7]:
# Display the per-token hidden states from the model's last layer (a 3-D
# tensor: one row of features per token of the single input sentence)
response.last_hidden_state

tensor([[[-0.6461, -0.0264, -0.1880,  ..., -0.5233,  0.5010,  0.1433],
         [-0.0855, -0.5855, -0.2855,  ..., -0.2858,  0.9792,  0.4176],
         [ 0.6347,  0.0429, -0.4084,  ..., -0.4250,  1.1661, -0.3016],
         ...,
         [-0.9196, -0.5915, -0.1817,  ..., -1.1843,  0.1419, -0.2133],
         [-0.2791, -0.4643, -0.2025,  ..., -0.6225,  0.2504, -0.1677],
         [ 0.6998,  0.0729, -0.1787,  ...,  0.0844, -0.4267, -0.5825]]],
       grad_fn=<NativeLayerNormBackward0>)

In [8]:
# Shape of the pooled output: a single 768-wide vector for the one input sentence
response.pooler_output.shape

torch.Size([1, 768])

In [9]:
# Show the sub-modules that make up the model's pooler
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [10]:
# grab the final encoder's representation of the CLS token (position 0)
# Slicing with 0:1 instead of a bare 0 keeps the sequence axis, so the result is
# (batch, 1, hidden) for any batch size. Indexing with 0 and then calling
# unsqueeze(0) would place the new axis in front of the batch axis, which only
# gives the same tensor when the batch holds a single sequence.

CLS_embedding = response.last_hidden_state[:, 0:1, :]

In [11]:
# Confirm the extracted CLS embedding is 3-D: (batch, 1 token, hidden size)
CLS_embedding.shape

torch.Size([1, 1, 768])

In [12]:
# Feeding the CLS embedding through the pooler by hand reproduces the model's
# own 'pooler_output': compare element-wise, then reduce to a single boolean tensor

torch.eq(model.pooler(CLS_embedding), response.pooler_output).all()

tensor(True)

In [13]:
# Total parameter count: add up the number of elements in every parameter tensor
total_params = sum(param.numel() for param in model.parameters())

print(f'The BERT model has {total_params} parameters.')

The BERT model has 109482240 parameters.
