In [1]:
from datasets import load_dataset, load_from_disk
from transformers import BertModel, AutoTokenizer, DataCollatorWithPadding, get_scheduler, AdamW, BertForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import time, datetime
from datasets import load_metric

In [2]:
# Load the IMDB dataset from a local on-disk copy (path is relative to the notebook).
imdb = load_from_disk("../data/imdb")
# Bare expression: shows the available splits and their sizes.
imdb

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 24000
    })
})

In [3]:
# Name of the pretrained checkpoint; reused when loading the models further down.
checkpoint = "bert-base-cased"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# Calling the tokenizer on a list treats each string as its own sequence;
# no padding is requested, so the two encodings have different lengths.
tokenizer(["hell i am", "where the hell I am"])

{'input_ids': [[101, 2630, 178, 1821, 102], [101, 1187, 1103, 2630, 146, 1821, 102]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

In [5]:
# encode() does NOT batch a two-element list: as the output shows, both strings
# end up in ONE id sequence (a single leading 101 and two 102 separators),
# i.e. they appear to be handled as a sentence pair.
tokenizer.encode(["hell i am", "where the hell I am"])

[101, 2630, 178, 1821, 102, 1187, 1103, 2630, 146, 1821, 102]

In [6]:
# Same input through encode_plus(): again a single sequence, and the
# token_type_ids switch from 0 to 1 at the second string, so the two
# strings are encoded as segment A / segment B of one pair.
tokenizer.encode_plus(["hell i am", "where the hell I am"])

{'input_ids': [101, 2630, 178, 1821, 102, 1187, 1103, 2630, 146, 1821, 102], 'token_type_ids': [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
# tokenize() on a list returns one flat list of word pieces (no per-string
# grouping); misspelled words are split into several '##' sub-word pieces.
print(tokenizer.tokenize(["hlel i am", "today i am going to the conflupnce"]))

['h', '##le', '##l', 'i', 'am', 'today', 'i', 'am', 'going', 'to', 'the', 'con', '##f', '##lu', '##p', '##nce']


In [8]:
# Encoder plus sequence-classification head, loaded from the same checkpoint.
# The load warning printed below reports that some weights of this model are
# not initialized from the checkpoint.
bert_classifier = BertForSequenceClassification.from_pretrained(checkpoint)
# Bare encoder, used below to compare against the classifier's internals.
bert = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
def tokenize_function(example):
    """Tokenize the ``text`` field of ``example`` to a fixed length.

    Sequences are truncated and padded to ``max_sequence_length``.
    Both ``tokenizer`` and ``max_sequence_length`` are read from the
    notebook's global namespace at call time, so they must be defined
    before this function is invoked.

    Args:
        example: mapping with a ``"text"`` entry to tokenize.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    text = example["text"]
    return tokenizer(
        text,
        max_length=max_sequence_length,
        padding="max_length",
        truncation=True,
    )

In [10]:
# Small two-sentence batch used for all the experiments below.
# padding="longest" pads the shorter sentence to the longer one (see the
# trailing 0 in input_ids / attention_mask) and return_tensors="pt" gives
# PyTorch tensors.
inputs = tokenizer(["I am a boy", "Who are you"], padding="longest", return_tensors="pt")
inputs

{'input_ids': tensor([[ 101,  146, 1821,  170, 2298,  102],
        [ 101, 2627, 1132, 1128,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0]])}

In [12]:
# Forward pass through the bare encoder, keeping every layer's hidden states.
output = bert(output_hidden_states=True, **inputs)
# Show the first (index 0) and the index-12 entries of the hidden-state tuple;
# per the transformers docs index 0 is the embedding output.
print(output.hidden_states[0], output.hidden_states[12], sep="\n")

tensor([[[ 0.4496,  0.0977, -0.2074,  ...,  0.0578,  0.0406, -0.0951],
         [-1.0412,  0.2393,  0.6613,  ...,  1.1994, -0.8032,  0.5556],
         [-1.0983,  1.1475, -0.7432,  ..., -0.8759,  0.5111,  1.0665],
         [-0.6861,  0.7461, -0.4607,  ..., -0.0735,  0.8834,  0.3352],
         [ 0.0532, -0.4325,  0.7288,  ..., -1.2935,  0.1437,  0.0132],
         [-0.3320,  0.3105,  0.1280,  ...,  0.4768, -0.8622,  0.0551]],

        [[ 0.4496,  0.0977, -0.2074,  ...,  0.0578,  0.0406, -0.0951],
         [ 0.2038, -0.4456,  0.7812,  ..., -0.9835, -0.4263, -0.1336],
         [-1.3004,  0.3671, -0.6442,  ...,  0.5829,  0.6138,  1.0224],
         [ 0.3026, -0.3614, -0.2114,  ...,  0.6541,  0.4079,  0.0530],
         [-0.4621, -0.0644,  0.1431,  ...,  0.2833, -0.8244,  0.2640],
         [-0.0348, -0.2707,  0.2793,  ...,  0.2995, -0.4477, -0.0079]]],
       grad_fn=<NativeLayerNormBackward0>)
tensor([[[ 0.4466,  0.1924,  0.3259,  ..., -0.2029,  0.2098, -0.2934],
         [ 0.3257, -0.1681,  0

In [13]:
# Fields returned by the bare BertModel forward pass.
output.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [15]:
# Same batch through the classification model, again keeping all hidden states
# so they can be compared with the bare encoder's values.
output_classifier = bert_classifier(output_hidden_states=True, **inputs)
print(
    output_classifier.hidden_states[0],
    output_classifier.hidden_states[12],
    sep="\n",
)

tensor([[[ 0.4496,  0.0977, -0.2074,  ...,  0.0578,  0.0406, -0.0951],
         [-1.0412,  0.2393,  0.6613,  ...,  1.1994, -0.8032,  0.5556],
         [-1.0983,  1.1475, -0.7432,  ..., -0.8759,  0.5111,  1.0665],
         [-0.6861,  0.7461, -0.4607,  ..., -0.0735,  0.8834,  0.3352],
         [ 0.0532, -0.4325,  0.7288,  ..., -1.2935,  0.1437,  0.0132],
         [-0.3320,  0.3105,  0.1280,  ...,  0.4768, -0.8622,  0.0551]],

        [[ 0.4496,  0.0977, -0.2074,  ...,  0.0578,  0.0406, -0.0951],
         [ 0.2038, -0.4456,  0.7812,  ..., -0.9835, -0.4263, -0.1336],
         [-1.3004,  0.3671, -0.6442,  ...,  0.5829,  0.6138,  1.0224],
         [ 0.3026, -0.3614, -0.2114,  ...,  0.6541,  0.4079,  0.0530],
         [-0.4621, -0.0644,  0.1431,  ...,  0.2833, -0.8244,  0.2640],
         [-0.0348, -0.2707,  0.2793,  ...,  0.2995, -0.4477, -0.0079]]],
       grad_fn=<NativeLayerNormBackward0>)
tensor([[[ 0.4466,  0.1924,  0.3259,  ..., -0.2029,  0.2098, -0.2934],
         [ 0.3257, -0.1681,  0

In [16]:
# Fields returned by the classification model (logits instead of pooled output).
output_classifier.keys()

odict_keys(['logits', 'hidden_states'])

In [None]:
# Run only the embedding sub-module of the encoder on the token ids.
bert_embed_output = bert.base_model.embeddings(input_ids=inputs["input_ids"])
print(bert_embed_output)

In [None]:
# Row vector [[0, 1, ..., max_position_embeddings - 1]] of all position ids
# the model's config allows.
position_ids_all = torch.arange(bert.config.max_position_embeddings).expand(1, -1)

In [None]:
# Build the position ids for the current batch by hand.
# The table size is taken from the model config rather than a hard-coded 512,
# so this stays correct for checkpoints with a different maximum length.
position_ids_all = torch.arange(bert.config.max_position_embeddings).expand((1, -1))
# Padded sequence length of the batch (second dimension of input_ids).
seq_length = inputs['input_ids'].shape[1]
# Keep only the first seq_length positions; shape is (1, seq_length).
position_ids = position_ids_all[:, :seq_length]
print(position_ids)

In [None]:
# Recompute the embedding layer by hand: sum word, position and token-type
# embeddings, then apply the module's LayerNorm and dropout.
embedding_sum = (
    bert.embeddings.word_embeddings(inputs['input_ids'])
    + bert.embeddings.position_embeddings(position_ids)
    + bert.embeddings.token_type_embeddings(inputs['token_type_ids'])
)
bert.embeddings.dropout(bert.embeddings.LayerNorm(embedding_sum))

In [None]:
# Feed pre-computed word embeddings into the embedding module via
# `inputs_embeds` instead of passing token ids.
bert_embed_output_custom = bert.embeddings(inputs_embeds=bert.embeddings.word_embeddings(inputs['input_ids']))
# The shape was previously a bare mid-cell expression, which a notebook never
# displays; print it explicitly so it actually shows up.
print(bert_embed_output_custom.shape)
print(bert_embed_output_custom)

In [None]:
# Word embeddings for the batch, looked up directly from the embedding table.
custom_embeds = bert.embeddings.word_embeddings(inputs['input_ids'])
# First-dimension size of each per-example embedding matrix. The batch was
# padded to the longest sentence, so these are padded lengths.
input_lens = list(map(len, custom_embeds))
input_lens

In [None]:
# Forward pass driven by explicit word embeddings (`inputs_embeds`) rather than
# token ids, with the same attention mask, keeping all hidden states.
output_custom = bert(
    output_hidden_states=True,
    attention_mask=inputs['attention_mask'],
    inputs_embeds=bert.embeddings.word_embeddings(inputs['input_ids']),
)

In [None]:
# Exact (bitwise) comparison of the id-driven and embedding-driven runs:
# first the index-12 hidden state, then the index-0 hidden state.
print(
    torch.equal(output.hidden_states[12], output_custom.hidden_states[12]),
    torch.equal(output.hidden_states[0], output_custom.hidden_states[0]),
    sep="\n",
)

In [None]:
# Same exact comparison, restricted to the first sentence of the batch.
print(
    torch.equal(output.hidden_states[12][0], output_custom.hidden_states[12][0]),
    torch.equal(output.hidden_states[0][0], output_custom.hidden_states[0][0]),
    sep="\n",
)

In [None]:
# Same exact comparison, restricted to the second (padded) sentence of the batch.
print(
    torch.equal(output.hidden_states[12][1], output_custom.hidden_states[12][1]),
    torch.equal(output.hidden_states[0][1], output_custom.hidden_states[0][1]),
    sep="\n",
)

In [None]:
# Inspect the index-12 hidden state of the embedding-driven encoder run.
output_custom.hidden_states[12]

In [None]:
# Embedding-driven forward pass through the classification model.
# NOTE(review): the word embeddings are looked up in the standalone `bert`
# model, not in `bert_classifier` itself. Both were loaded from the same
# checkpoint, so the tables are presumably identical here — confirm this
# cross-model lookup is intended.
output_custom_classfier = bert_classifier(inputs_embeds=bert.embeddings.word_embeddings(inputs['input_ids']),
                     attention_mask=inputs['attention_mask'],
                     output_hidden_states=True 
                    )

In [None]:
# Inspect the index-12 hidden state of the embedding-driven classifier run.
output_custom_classfier.hidden_states[12]

In [17]:
# Display the module tree of the classification model.
bert_classifier

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [18]:
# Display the module tree of the bare encoder for comparison.
bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          