In [2]:
# !pip install transformers -q

In [69]:
from transformers import BertModel, BertTokenizer
import torch
import pandas as pd

In [7]:
model = BertModel.from_pretrained('bert-base-uncased')

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


In [21]:
corpus = "Abhiraj is a MachineLearning Engineer and works in California"

In [48]:
tokens = tokenizer.tokenize(corpus)

In [49]:
tokens

['ab',
 '##hir',
 '##aj',
 'is',
 'a',
 'machine',
 '##lea',
 '##rn',
 '##ing',
 'engineer',
 'and',
 'works',
 'in',
 'california']

Special Tokens - [CLS]: Classification and [SEP]: Separate (after sentence)

In [50]:
tokens = ['[CLS]'] + tokens + ['[SEP]']

In [51]:
print(tokens)
len(tokens)

['[CLS]', 'ab', '##hir', '##aj', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]']


16

Adding [PAD] to match the token length and is not part of the actual token
- Model will feed [PAD] as null

In [52]:
tokens = tokens + ['[PAD]'] + ['[PAD]'] + ['[PAD]'] + ['[PAD]']

In [53]:
print(tokens)
len(tokens)

['[CLS]', 'ab', '##hir', '##aj', 'is', 'a', 'machine', '##lea', '##rn', '##ing', 'engineer', 'and', 'works', 'in', 'california', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


20

In [54]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]
print(attention_mask)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]


Token IDs

In [55]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [57]:
print(token_ids)

[101, 11113, 11961, 13006, 2003, 1037, 3698, 19738, 6826, 2075, 3992, 1998, 2573, 1999, 2662, 102, 0, 0, 0, 0]


Printing the tokens with their corresponding token_ids

In [80]:
df = pd.DataFrame(list(zip(tokens, token_ids)), columns=['Token', 'ID'])
print(df)

         Token     ID
0        [CLS]    101
1           ab  11113
2        ##hir  11961
3         ##aj  13006
4           is   2003
5            a   1037
6      machine   3698
7        ##lea  19738
8         ##rn   6826
9        ##ing   2075
10    engineer   3992
11         and   1998
12       works   2573
13          in   1999
14  california   2662
15       [SEP]    102
16       [PAD]      0
17       [PAD]      0
18       [PAD]      0
19       [PAD]      0


In [81]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)

In [82]:
output = model(token_ids, attention_mask = attention_mask)

In [83]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3586,  0.1181, -0.1842,  ..., -0.2354,  0.4259, -0.0326],
         [ 0.2419, -0.2279, -0.5840,  ...,  0.2596,  0.6063, -1.0958],
         [ 0.3713, -0.2513, -0.3179,  ...,  0.3428,  0.3097, -1.6136],
         ...,
         [-0.2008, -0.1491, -0.1179,  ...,  0.2886,  0.5797, -0.3534],
         [-0.1799, -0.1063, -0.1871,  ...,  0.2207,  0.5607, -0.3443],
         [-0.1128, -0.2178, -0.1757,  ...,  0.4001,  0.5953, -0.3352]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-8.8568e-01, -5.0053e-01, -8.7941e-01,  7.9120e-01,  6.8780e-01,
         -1.7641e-01,  8.6926e-01,  4.3366e-01, -6.0805e-01, -9.9999e-01,
         -5.2290e-01,  8.3938e-01,  9.7426e-01,  5.3178e-01,  8.6414e-01,
         -7.2071e-01, -2.7403e-01, -6.0806e-01,  3.4425e-01, -5.3194e-01,
          7.0190e-01,  9.9997e-01,  1.5199e-01,  3.4045e-01,  5.2027e-01,
          9.5641e-01, -7.8259e-01,  8.7885e-01,  9.3856e-01,  7.664

In [87]:
output[0].shape

torch.Size([1, 20, 768])