In [31]:
# Load IPython's autoreload extension so edits to the local project modules
# are picked up without restarting the kernel.
# NOTE(review): re-running this cell prints the "already loaded" notice shown
# in the output below; `%reload_ext autoreload` is the form that message suggests.
%load_ext autoreload
# Mode 2 — per the IPython autoreload docs this reloads all modules before
# each cell execution.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

from base_bert import BertPreTrainedModel
from utils import get_extended_attention_mask
from bert import BertModel
from config import BertConfig

In [33]:
# Build the project's BertModel through its from_pretrained constructor,
# using the "bert-base-uncased" checkpoint name.
bert = BertModel.from_pretrained("bert-base-uncased")

In [34]:
# Bare expression: the notebook displays the model's repr (its module tree).
bert

BertModel(
  (word_embedding): Embedding(30522, 768, padding_idx=0)
  (pos_embedding): Embedding(512, 768)
  (tk_type_embedding): Embedding(2, 768)
  (embed_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (embed_dropout): Dropout(p=0.1, inplace=False)
  (bert_layers): ModuleList(
    (0-11): 12 x BertLayer(
      (self_attention): BertSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (attention_dense): Linear(in_features=768, out_features=768, bias=True)
      (attention_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (attention_dropout): Dropout(p=0.1, inplace=False)
      (interm_dense): Linear(in_features=768, out_features=3072, bias=True)
      (out_dense): Linear(in_features=3072, out_features=768, bias=

In [None]:
class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The hidden state of every token is projected to queries, keys and values,
    split into ``num_attention_heads`` heads of ``attention_head_size`` each,
    attended per head following eq. (1) of https://arxiv.org/pdf/1706.03762.pdf,
    and finally the heads are concatenated back together.

    Args:
        config: object exposing ``num_attention_heads``, ``hidden_size`` and
            ``attention_probs_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()

        self.num_attention_heads = config.num_attention_heads
        # integer division keeps the per-head size an exact int (no float round trip)
        self.attention_head_size = config.hidden_size // config.num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # initialize the linear transformation layers for key, value, query
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        # this dropout is applied to normalized attention scores following the original implementation of transformer
        # although it is a bit unusual, we empirically observe that it yields better performance
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transform(self, x, linear_layer):
        """Project ``x`` with ``linear_layer`` and split the result into heads.

        Args:
            x: hidden states of shape [bs, seq_len, hidden_size].
            linear_layer: one of ``self.query`` / ``self.key`` / ``self.value``.

        Returns:
            Tensor of shape [bs, num_attention_heads, seq_len, attention_head_size].
        """
        # the corresponding linear_layer of k, v, q are used to project the hidden_state (x)
        bs, seq_len = x.shape[:2]
        proj = linear_layer(x)
        # next, we need to produce multiple heads for the proj
        # this is done by splitting the hidden state to self.num_attention_heads, each of size self.attention_head_size
        proj = proj.view(bs, seq_len, self.num_attention_heads, self.attention_head_size)
        # by proper transpose, we have proj of [bs, num_attention_heads, seq_len, attention_head_size]
        proj = proj.transpose(1, 2)
        return proj

    def attention(self, key, query, value, attention_mask):
        """Scaled dot-product attention over all heads.

        Args:
            key, query, value: tensors of shape
                [bs, num_attention_heads, seq_len, attention_head_size].
            attention_mask: additive mask broadcastable to
                [bs, num_attention_heads, seq_len, seq_len] (typically
                [bs, 1, 1, seq_len]); non-padding tokens are marked with 0 and
                padding tokens with a large negative number. ``None`` disables
                masking.

        Returns:
            Tensor of shape [bs, seq_len, num_attention_heads * attention_head_size].
        """
        # score matrix S of [bs, num_attention_heads, seq_len, seq_len];
        # S[*, i, j, k] is the (unnormalized) attention score between the j-th
        # and k-th token given by the i-th attention head.
        # torch.dot only handles 1-D tensors, so a batched matmul is required here.
        scores = torch.matmul(query, key.transpose(-1, -2))
        # scale by sqrt(d_k) so the softmax does not saturate for large head sizes
        scores = scores / math.sqrt(self.attention_head_size)

        # mask out the padding token scores before normalizing
        if attention_mask is not None:
            scores = scores + attention_mask

        # normalize the scores over the key dimension
        probs = F.softmax(scores, dim=-1)
        probs = self.dropout(probs)

        # multiply the attention probabilities with the values to get V'
        # of [bs, num_attention_heads, seq_len, attention_head_size]
        context = torch.matmul(probs, value)

        # concat the heads and recover the original shape
        # [bs, seq_len, num_attention_heads * attention_head_size = hidden_size];
        # reshape (not view) because the transpose makes the tensor non-contiguous
        bs, _, seq_len, _ = context.shape
        context = context.transpose(1, 2).reshape(bs, seq_len, self.all_head_size)
        return context

    def forward(self, hidden_states, attention_mask):
        """
        hidden_states: [bs, seq_len, hidden_state]
        attention_mask: [bs, 1, 1, seq_len]
        output: [bs, seq_len, hidden_state]
        """
        # first, we have to generate the key, value, query for each token for multi-head attention w/ transform (more details inside the function)
        # of *_layers are of [bs, num_attention_heads, seq_len, attention_head_size]
        key_layer = self.transform(hidden_states, self.key)
        value_layer = self.transform(hidden_states, self.value)
        query_layer = self.transform(hidden_states, self.query)
        # calculate the multi-head attention
        attn_value = self.attention(key_layer, query_layer, value_layer, attention_mask)
        return attn_value

In [6]:
from types import SimpleNamespace

# Attribute-style settings object, built in one step from a literal mapping.
# NOTE(review): a later cell rebinds `config` to a BertConfig instance.
config = SimpleNamespace(
    **{
        "hidden_dropout_prob": 0.3,
        "hidden_size": 768,
        "data_dir": ".",
        # "option": "pretrain",
        "local_files_only": True,
    }
)

In [16]:
# Rebind `config` to a BertConfig built with its constructor defaults
# (this replaces the SimpleNamespace assigned in the previous cell).
config = BertConfig()
# Record the checkpoint name on the config object.
config.name_or_path = 'bert-base-uncased'

In [17]:
# Construct BertModel directly from the config object; this rebinds `bert`
# from the earlier from_pretrained cell.
# NOTE(review): presumably the weights here are freshly initialised rather
# than loaded from a checkpoint — confirm in bert.py.
bert = BertModel(config)

{'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'use_bfloat16': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_size_feed_forward': 0, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'architectures': None, 'finetuning_task': None, 'id2label': None, 'label2id': None, 'num_labels': 2, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': None, 'pad_token_id': 0, 'eos_token_id': None, 'sep_token_

In [23]:
from transformers import BertTokenizer

# Initialize the tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input sentence
sentence = "My name is Ayan Gupta!"

# Tokenize the sentence: pad/truncate to exactly 10 tokens and return tensors
# (the printed result further down shows 1 x 10 tensors)
tokens = tokenizer(sentence, padding='max_length', max_length=10, truncation=True, return_tensors='pt')

# Extract input_ids and attention_mask
# NOTE(review): this attention_mask is the tokenizer's 1/0 mask, whereas
# BertSelfAttention.forward documents an additive [bs, 1, 1, seq_len] mask —
# presumably it must go through get_extended_attention_mask first; confirm.
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']


In [27]:
# Fetch the tokenizer's vocabulary via get_vocab()
# (presumably a token -> id mapping; confirm against the tokenizer docs).
vocab = tokenizer.get_vocab()


In [30]:
# tokens_from_ids = tokenizer.convert_ids_to_tokens(input_ids[0])
# print("Tokens from input IDs:", tokens_from_ids)

# words_from_ids = tokenizer.convert_tokens_to_string(tokens_from_ids)
# print("Words from input IDs:", words_from_ids)

Tokens from input IDs: ['[CLS]', 'my', 'name', 'is', 'a', '##yan', 'gupta', '!', '[SEP]', '[PAD]']
Words from input IDs: [CLS] my name is ayan gupta ! [SEP] [PAD]


In [24]:
from pprint import pprint

# Pretty-print the tokenizer output; the result below shows its
# attention_mask, input_ids and token_type_ids entries.
pprint(tokens)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]),
 'input_ids': tensor([[  101,  2026,  2171,  2003,  1037,  7054, 20512,   999,   102,     0]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
