# Models


## 1. tokenizer

In [17]:
import transformers

# Pipeline step 1: load the fast BERT tokenizer for the "bert-base-cased" checkpoint.
tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-cased")

# Two sentences of different length, so the shorter one has to be padded.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# Encode the whole batch as PyTorch tensors; padding is enabled, truncation is not.
input_numeric_ids = tokenizer(
    raw_inputs,
    truncation=False,
    padding=True,
    return_tensors="pt",
)
print(f'MODEL INPUT FORMAT: {input_numeric_ids}')

MODEL INPUT FORMAT: {'input_ids': tensor([[  101,   146,   112,  1396,  1151,  2613,  1111,   170, 20164, 10932,
          2271,  7954,  1736,  1139,  2006,  1297,   119,   102],
        [  101,   146,  4819,  1142,  1177,  1277,   106,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


## 2. model

In [18]:
# Variant A: build the architecture from a default BertConfig (no checkpoint is loaded here).
# NOTE(review): BertConfig() defaults are not necessarily those of "bert-base-cased"
# (e.g. vocab_size) — confirm before pairing model_untrained with the tokenizer above.
config = transformers.BertConfig()
model_untrained = transformers.BertModel(config)

# Variant B: same model class, initialised from the "bert-base-cased" checkpoint.
model_pretrained = transformers.BertModel.from_pretrained("bert-base-cased")

# To write the loaded model to a local directory, uncomment:
# model_pretrained.save_pretrained("MODEL_CHECKPOINT")

### model architecture

In [19]:
from torchinfo import summary

In [20]:
# Bare last-line expression: displays the model's repr, i.e. its nested module tree.
model_pretrained

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [21]:
# torchinfo summary of the pretrained model; only the model is passed (no input data),
# and the resulting table lists a parameter count per layer.
summary(model_pretrained)

Layer (type:depth-idx)                             Param #
BertModel                                          --
├─BertEmbeddings: 1-1                              --
│    └─Embedding: 2-1                              22,268,928
│    └─Embedding: 2-2                              393,216
│    └─Embedding: 2-3                              1,536
│    └─LayerNorm: 2-4                              1,536
│    └─Dropout: 2-5                                --
├─BertEncoder: 1-2                                 --
│    └─ModuleList: 2-6                             --
│    │    └─BertLayer: 3-1                         7,087,872
│    │    └─BertLayer: 3-2                         7,087,872
│    │    └─BertLayer: 3-3                         7,087,872
│    │    └─BertLayer: 3-4                         7,087,872
│    │    └─BertLayer: 3-5                         7,087,872
│    │    └─BertLayer: 3-6                         7,087,872
│    │    └─BertLayer: 3-7                         7,087,872
│    │   

In [22]:
# Display the configuration attached to the loaded model
# (hidden_size, num_hidden_layers, num_attention_heads, vocab_size, ...).
model_pretrained.config

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.32.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [23]:
# This bare string is used as a note; being the cell's last expression, its repr is echoed as output.
# NOTE(review): n_heads, n_layers, hidden_dim and dim do not appear in the BertConfig printed for
# this model, which shows num_attention_heads, num_hidden_layers, intermediate_size and hidden_size
# instead — presumably the list follows another architecture's config naming; confirm.
"""

Important **model** configuration terms
- vocab_size
- n_heads
- n_layers
- label2id
- id2label
- hidden_dim
- dim 

"""

'\n\nImportant **model** configuration terms\n- vocab_size\n- n_heads\n- n_layers\n- label2id\n- id2label\n- hidden_dim\n- dim \n\n'

## 3. model output

In [24]:
# Forward pass: the tokenizer's fields are unpacked into keyword arguments of the model call.
outputs = model_pretrained(**input_numeric_ids)

# Inspect the returned object through its attribute dict: first the full contents
# (tensors included), then only the attribute names.
output_fields = vars(outputs)
print(f'MODEL OUTPUT FORMAT: {output_fields} ')
print(f'MODEL_AUTO OUTPUT FORMAT: {output_fields.keys()} ')

MODEL OUTPUT FORMAT: {'last_hidden_state': tensor([[[ 0.4222,  0.4443, -0.0659,  ..., -0.1958,  0.3611,  0.1284],
         [ 0.5728, -0.1593,  0.6014,  ..., -0.1134,  0.1791,  0.1787],
         [ 0.4699,  0.4214,  0.1695,  ...,  0.2386,  0.9851, -0.1236],
         ...,
         [ 0.5847,  0.2552,  0.0266,  ...,  0.7203,  0.0650,  0.4277],
         [ 0.5573,  0.4506,  0.0353,  ..., -0.0607,  0.4209, -0.2525],
         [ 0.7136,  1.2932, -0.2937,  ...,  0.2917,  0.4270, -0.3874]],

        [[ 0.4829,  0.4291,  0.0264,  ..., -0.1489,  0.2953, -0.3113],
         [ 0.2735,  0.4520,  0.2760,  ...,  0.2572,  0.2059,  0.4097],
         [ 0.1391,  0.4234, -0.3385,  ...,  0.5858, -0.0834,  0.4344],
         ...,
         [ 0.0709,  0.4650, -0.1060,  ...,  0.2954,  0.1990,  0.1774],
         [ 0.1649,  0.4855, -0.0801,  ...,  0.3485,  0.1970,  0.1701],
         [ 0.2887,  0.4945,  0.0196,  ...,  0.2895,  0.2056,  0.0399]]],
       grad_fn=<NativeLayerNormBackward0>), 'pooler_output': tensor([[-0.

In [25]:
# Per-position features: one vector for every position of every sequence in the batch.
all_extracted_features_map = outputs.last_hidden_state
# Pooled features: a single vector per sequence.
compressed_features = outputs.pooler_output

print(all_extracted_features_map.shape, compressed_features.shape)

torch.Size([2, 18, 768]) torch.Size([2, 768])


In [26]:
# Unpack the three axes of the encoder output: (batch, sequence position, feature).
# The variable names are kept unchanged in case later cells rely on them, but note that
# the second axis counts token ids, not words: it is the padded length produced by the
# tokenizer, so it includes the ids that bracket each sequence and the padding positions.
BATCH_SIZE, SENTENCE_LENGTH, WORD_EMBEDDING_SIZE = outputs['last_hidden_state'].shape
print(outputs['last_hidden_state'].shape)
print(
    f'Number of Sentences = {BATCH_SIZE}, '
    f'Num of Tokens in Each Padded Sentence = {SENTENCE_LENGTH}, '
    f'Every Token Embedding Length = {WORD_EMBEDDING_SIZE}'
)

# The last axis (embedding size) is the same number as `hidden_size` in the model config.

torch.Size([2, 18, 768])
Number of Senteces = 2, Num of Words in Each Sentece = 18, Every Word Embedding Length = 768


In [27]:
# The config is displayed a second time — presumably so hidden_size can be compared with the
# last axis of last_hidden_state printed just above; confirm, or drop the repeated cell.
model_pretrained.config

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.32.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}