# Loading Previous BERT model


In [5]:
from transformers import BertTokenizer
from transformers import AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.nn.functional as F

In [3]:

class HangmanNet(nn.Module):
  def __init__(self,checkpoint, vocab_size = 26, hidden_ffn_size = 410, unfreeze_layers = 0): 
    super(HangmanNet,self).__init__() 
    self.num_labels = vocab_size 

    #Load Model with given checkpoint and extract its body
    self.model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
    
    # Freeze all layers in the BERT model
    for param in self.soruce_modelparameters():
        param.requires_grad = False

    # Unfreeze the last `unfreeze_layers` layers
    if unfreeze_layers > 0:
        for layer in self.model.encoder.layer[-unfreeze_layers:]:
            for param in layer.parameters():
                param.requires_grad = True
        

    self.dropout = nn.Dropout(0.1)

    self.classifier = nn.Sequential(
        nn.Linear(768 + vocab_size, hidden_ffn_size),
        nn.ReLU(),
        nn.Linear(hidden_ffn_size, 26)
    )
    
    # self.classifier = nn.Linear(768 + vocab_size,vocab_size) # load and initialize weights
  
  def forward(self, input_ids=None, attention_mask=None, labels=None, prev_guess=None,
              token_type_ids=None):
      outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
      
      sequence_output = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
      sequence_output = self.dropout(sequence_output)

      # Concatenate the previous guesses to the sequence_output
      # (batch_size, sequence_length, hidden_size + vocab_size)
      sequence_output = torch.cat((sequence_output, prev_guess.unsqueeze(1).repeat(1, sequence_output.shape[1], 1)), dim=2)

      logits = self.classifier(sequence_output)  # (batch_size, sequence_length, num_labels)

      
      # Mask the logits to zero out probabilities of previously guessed characters by considering the one-hot encoding in prev guesses
      logits[prev_guess.unsqueeze(1).repeat(1, sequence_output.shape[1], 1) == 1] = -float("inf")

      loss = None
      if labels is not None:
          loss_fct = nn.CrossEntropyLoss()

          # NOTE: I already has the labels in active logits representation
          active_logits = logits.view(-1, self.num_labels)

        #   active_labels = torch.where(active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels))
          loss = loss_fct(active_logits, labels.view(-1))
        
      return TokenClassifierOutput(logits=logits, loss=loss, hidden_states=outputs.hidden_states,attentions=outputs.attentions)



In [4]:
class InferenceTokenizer:
    def __init__(self, tokenizer, max_length=42):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.char_to_index = {chr(i + ord('a')): i for i in range(26)}

    def __call__(self, masked_word, prev_guesses):

        prev_guesses_one_hot = torch.zeros(26, dtype=torch.int64)

        for char in prev_guesses:
            prev_guesses_one_hot[self.char_to_index[char]] = 1


        # NOTE: The labels are not require during inference
        # labels_word = examples['labels']

        # Replace the underscore in masked_word with the special [MASK] token
        masked_word = masked_word.replace('_', '[MASK]')

        # Tokenize the masked_words
        batch = self.tokenizer(masked_word, truncation=True, padding='max_length', return_tensors="pt", max_length=self.max_length)

        batch['prev_guess'] = prev_guesses_one_hot.unsqueeze(0)

        return batch

In [5]:
# NOTE: Example of tokenization
# word input example: "_ p p _ e "
word = ["_", "p", "p", "_", "e" ]

word = " ".join(word)
print(word)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize Custom InferenceTokenizer
tokenizer = InferenceTokenizer(tokenizer, max_length=32)


tokenized_word = tokenizer(word, prev_guesses=['p','e', 'u', 'o', 't'])
tokenized_word

_ p p _ e


{'input_ids': tensor([[ 101,  103, 1052, 1052,  103, 1041,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'prev_guess': tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
         0, 0]])}

In [25]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer.unk_token
tokenizer.mask_token
tokenizer.pad_token

'[PAD]'

In [56]:
import torch

input_ids_example = torch.tensor([[101, 108, 103, 103], [101, 103, 102, 103]])

# mask_idxs = torch.where(input_ids_example == 103)
mask = (input_ids_example == 103)
mask

# per row chose one true randomly and set the rest to false
mask = torch.zeros_like(input_ids_example, dtype=torch.bool)
mask[torch.arange(input_ids_example.size(0)), torch.randint(0, input_ids_example.size(1), (input_ids_example.size(0),))]
mask

tensor([[False, False,  True,  True],
        [False,  True, False,  True]])

In [64]:
encoder_outputs = torch.randn(2, 4, 6)
encoder_outputs

tensor([[[ 2.1635, -0.5571, -0.5617, -1.3012, -0.2051,  0.5336],
         [ 0.4887, -1.3061,  0.3810, -0.1133, -0.7405, -1.0193],
         [ 0.5223,  0.1506, -0.9304,  0.3422,  1.9073, -1.5074],
         [-1.1713,  0.3046, -1.4073,  0.2468,  1.5389,  0.1109]],

        [[-0.0305,  0.8902,  0.1471, -0.7971, -0.9272, -0.5414],
         [-0.4392,  1.1099,  0.0776,  0.4063, -0.1076,  0.6352],
         [-1.1726,  0.1578,  1.1572, -0.3341,  1.0961,  0.0611],
         [ 0.1343,  1.5638, -0.7380,  0.9475,  1.0093,  0.8183]]])

In [67]:
import torch

# Input tensor and mask
input_ids_example = torch.tensor([[101, 108, 103, 103], [101, 103, 102, 103]])
mask = (input_ids_example == 103)

# Create random values for tie-breaking
random_values = torch.rand_like(input_ids_example.float())

# Set random values only where the mask is True
random_values[~mask] = float('-inf')  # Set irrelevant positions to -inf

# Find the index of the maximum random value per row
_, selected_indices = random_values.max(dim=1)

# Create the final mask
final_mask = torch.zeros_like(mask, dtype=torch.bool)
final_mask[torch.arange(mask.size(0)), selected_indices] = True

# Display results
print("Original Mask:\n", mask)
print("Final Mask with One Random True per Row:\n", final_mask)

encoder_outputs[final_mask].shape

Original Mask:
 tensor([[False, False,  True,  True],
        [False,  True, False,  True]])
Final Mask with One Random True per Row:
 tensor([[False, False,  True, False],
        [False, False, False,  True]])


torch.Size([2, 6])

In [54]:
import torch

# Given tuple of indices
row_indices = torch.tensor([0, 0, 1, 1])
col_indices = torch.tensor([2, 3, 1, 3])

# Unique column indices
unique_cols = torch.unique(col_indices)

# Storage for the selected indices
selected_row_indices = []
selected_col_indices = []

num_rows = 2
for row in range(num_rows):
    # Get the row indices corresponding to this row
    mask = row_indices == row
    available_cols = col_indices[mask]
    
    # Randomly pick one column index
    selected_col = available_cols[torch.randint(len(available_cols), (1,))]
    
    # Store the selection
    selected_row_indices.append(row)
    selected_col_indices.append(selected_col.item())

selected_row_indices, selected_col_indices


([0, 1], [2, 3])

In [28]:
# Get the IDs for MASK and PAD tokens
mask_id = tokenizer.mask_token_id
pad_id = tokenizer.pad_token_id
cls_id = tokenizer.cls_token_id
unk_id = tokenizer.unk_token_id

print(f"MASK ID: {mask_id}, PAD ID: {pad_id}, CLS ID: {cls_id}, UNK ID: {unk_id}")

MASK ID: 103, PAD ID: 0, CLS ID: 101, UNK ID: 100


In [39]:
# NOTE: Function to encode with BERT in my environment

def bert_tokenizer_encode_word(masked_word, tokenizer, max_length=32):
    masked_word = " ".join(masked_word)

    # Initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    masked_word = masked_word.replace('_', '[MASK]')

    batch_info = tokenizer(masked_word, truncation=True, padding='max_length', return_tensors="np", max_length=max_length)

    return word, batch_info["input_ids"].astype(np.int32)

word = ["_", "p", "p", "_", "e" ]
_, input_ids = bert_tokenizer_encode_word(word, tokenizer, max_length=32)

In [47]:
input_ids

array([ 101,  103, 1052, 1052,  103, 1041,  102,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
      dtype=int32)

In [5]:
# NOTE: Example of tokenization
# word input example: "_ p p _ e "
word = "_ p p _ e "

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Initialize Custom InferenceTokenizer
tokenizer = InferenceTokenizer(tokenizer, max_length=42)


tokenized_word = tokenizer(word, prev_guesses=['p','e', 'u', 'o', 't'])
tokenized_word

{'input_ids': tensor([[ 101,  103, 1052, 1052,  103, 1041,  102,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'prev_guess': tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0,
         0, 0]])}

In [6]:
# Load the model with the base bert model
model = HangmanNet(checkpoint="bert-base-uncased", vocab_size = 26, unfreeze_layers = 2)

# Load my trained model an perform an inference
# NOTE: Change the path accordingly if you are trying in other environment
checkpoint_path = "models/pretrained_bert/model_epoch_45.pth"
checkpoint = torch.load(checkpoint_path)
model.load_state_dict(checkpoint['model_state_dict'])

soruce_model = checkpoint['model_state_dict']

  checkpoint = torch.load(checkpoint_path)


In [8]:
model

HangmanNet(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [10]:
from utils_model import CustomBERT
# Initialize your CustomBERT model
custom_bert = CustomBERT(
    vocab_size=30,  # Or the appropriate vocab size
    hidden_size=768,
    num_hidden_layers=4,
    num_attention_heads=4,
    max_position_embeddings=512,
    intermediate_size=3072,
    dqn_head=True
)

# Transfer the weights
# custom_bert = transfer_weights(hangman_state_dict, custom_bert)


In [10]:
# # Compare weights

# print(torch.allclose(model.model.embeddings.word_embeddings.weight, custom_bert.embeddings.word_embeddings.weight, atol=1e-6))
# print(torch.allclose(model.model.embeddings.position_embeddings.weight, custom_bert.embeddings.position_embeddings.weight, atol=1e-6))
# print(torch.allclose(model.model.embeddings.token_type_embeddings.weight, custom_bert.embeddings.token_type_embeddings.weight, atol=1e-6))
# print(torch.allclose(model.model.embeddings.LayerNorm.weight, custom_bert.embeddings.LayerNorm.weight, atol=1e-6))
# print(torch.allclose(model.model.embeddings.LayerNorm.bias, custom_bert.embeddings.LayerNorm.bias, atol=1e-6))


In [11]:
# # Save the updated custom bert model
# torch.save(custom_bert.state_dict(), "models/pretrained_bert_epoch_45.pth")

In [1]:
from utils_model import CustomBERT

# Load the updated custom bert model
custom_bert = CustomBERT(
    vocab_size=30,  # Or the appropriate vocab size
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    max_position_embeddings=512,
    intermediate_size=512,
    dqn_head=True
)


  from .autonotebook import tqdm as notebook_tqdm


CustomBERT number of parameters:  4478618


In [2]:
custom_bert

CustomBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affi

In [3]:
from transformers import AutoTokenizer,AutoModel,AutoConfig

# Load the model prajjwal1/bert-tiny
checkpoint = "prajjwal1/bert-tiny"
pretrained_bert = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
pretrained_bert



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-1): 2 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=128, out_features=128, bias=True)
            (key): Linear(in_features=128, out_features=128, bias=True)
            (value): Linear(in_features=128, out_features=128, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=128, out_features=128, bias=True)
            (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [4]:
import torch

def copy_bert_weights(source_model, target_model):
    """
    Copy the weights of the `BertModel` component from `source_model` to `target_model`.
    
    Parameters:
        source_model: The source model (e.g., `BertModel` instance).
        target_model: The target model (e.g., `CustomBERT` instance).
    """
    # Extract the state_dict from the source and target models
    source_state_dict = source_model.state_dict()
    target_state_dict = target_model.bert.state_dict()  # Assuming target model has a .bert attribute
    
    # Filter out weights that are specific to the source or target
    filtered_source_state_dict = {
        k: v for k, v in source_state_dict.items() if k in target_state_dict
    }
    
    # Update the target model with the source weights
    target_state_dict.update(filtered_source_state_dict)
    
    # Load the updated state_dict back into the target model
    target_model.bert.load_state_dict(target_state_dict)
    print("Weights successfully copied from source to target.")
    for k, v in filtered_source_state_dict.items():
        print(f"  {k}")

# Example usage
# Assuming `source_bert_model` is an instance of `BertModel` and
# `custom_bert_model` is an instance of `CustomBERT`
copy_bert_weights(pretrained_bert, custom_bert)

Weights successfully copied from source to target.
  embeddings.word_embeddings.weight
  embeddings.position_embeddings.weight
  embeddings.token_type_embeddings.weight
  embeddings.LayerNorm.weight
  embeddings.LayerNorm.bias
  encoder.layer.0.attention.self.query.weight
  encoder.layer.0.attention.self.query.bias
  encoder.layer.0.attention.self.key.weight
  encoder.layer.0.attention.self.key.bias
  encoder.layer.0.attention.self.value.weight
  encoder.layer.0.attention.self.value.bias
  encoder.layer.0.attention.output.dense.weight
  encoder.layer.0.attention.output.dense.bias
  encoder.layer.0.attention.output.LayerNorm.weight
  encoder.layer.0.attention.output.LayerNorm.bias
  encoder.layer.0.intermediate.dense.weight
  encoder.layer.0.intermediate.dense.bias
  encoder.layer.0.output.dense.weight
  encoder.layer.0.output.dense.bias
  encoder.layer.0.output.LayerNorm.weight
  encoder.layer.0.output.LayerNorm.bias
  encoder.layer.1.attention.self.query.weight
  encoder.layer.1.atten

In [5]:
custom_bert.bert.embeddings.word_embeddings.weight

Parameter containing:
tensor([[-4.1018e-03, -3.0695e-02, -3.5295e-03,  ...,  1.8925e-02,
          3.7396e-03, -2.9233e-03],
        [-4.2748e-04, -3.6929e-02, -1.7168e-02,  ...,  2.9314e-02,
         -1.0398e-02,  2.6772e-02],
        [ 5.9418e-03,  4.2119e-03, -1.9566e-02,  ...,  1.6799e-02,
         -2.7802e-02, -6.9017e-03],
        ...,
        [ 3.5573e-02, -1.5891e-02,  4.9951e-03,  ...,  5.4071e-03,
         -1.1270e-02, -6.9528e-05],
        [-8.7018e-03, -2.2516e-02,  3.1993e-03,  ...,  2.7591e-02,
         -1.9554e-02,  2.4023e-03],
        [-7.8904e-02, -7.5407e-02, -4.6660e-03,  ..., -5.3340e-03,
         -4.4993e-02,  5.9842e-02]], requires_grad=True)

In [7]:
# torch.save(custom_bert.state_dict(), "models/bert_tiny.pth")

In [3]:
from utils_model import CustomBERT
import torch
# Load from the saved model
custom_bert = CustomBERT(
    vocab_size=30,  # Or the appropriate vocab size
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    max_position_embeddings=512,
    intermediate_size=512,
    dqn_head=True
)

# Load the updated custom bert model
custom_bert.load_state_dict()


CustomBERT number of parameters:  4460156


  custom_bert.load_state_dict(torch.load("models/bert_tiny.pth"))


<All keys matched successfully>

In [70]:
# # Freeze all parameters
# for param in custom_bert.parameters():
#     param.requires_grad = False

# # Unfreeze the last layer of the encoder
# # for param in custom_bert.encoder.layer[-1].parameters():
# #     param.requires_grad = True

# # Unfreeze the head
# for param in custom_bert.head.parameters():
#     param.requires_grad = True

# # Count trainable parameters
# def count_trainable_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)

# trainable_params = count_trainable_parameters(custom_bert)
# print(f"Number of trainable parameters: {trainable_params}")


In [71]:
import torch
import torch.nn as nn
import math

class LoRALinear(nn.Module):
    def __init__(self, in_features, out_features, r=8, alpha=16):
        """
        LoRA linear layer.
        :param in_features: Input size of the linear layer.
        :param out_features: Output size of the linear layer.
        :param r: Rank of the low-rank adaptation.
        :param alpha: Scaling factor for the low-rank matrices.
        """
        super().__init__()
        self.r = r
        self.alpha = alpha
        self.scaling = alpha / r
        self.weight = nn.Parameter(torch.zeros(out_features, in_features))
        self.lora_A = nn.Parameter(torch.randn(r, in_features))
        self.lora_B = nn.Parameter(torch.randn(out_features, r))
        self.bias = nn.Parameter(torch.zeros(out_features))

        nn.init.zeros_(self.weight)
        nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
        nn.init.zeros_(self.lora_B)

    def forward(self, x):
        return (
            torch.nn.functional.linear(x, self.weight, self.bias)
            + self.scaling * torch.nn.functional.linear(
                torch.nn.functional.linear(x, self.lora_A), self.lora_B
            )
        )


In [72]:
def apply_lora(bert_model, last_layers = 1, r=4, alpha=16):
    for i in range(-last_layers, 0):  # Replace the last two layers
        layer = bert_model.encoder.layer[i]
        
        # Replace attention linear layers with LoRA versions
        layer.attention.self.query = LoRALinear(
            in_features=layer.attention.self.query.in_features,
            out_features=layer.attention.self.query.out_features,
            r=r,
            alpha=alpha
        )
        layer.attention.self.key = LoRALinear(
            in_features=layer.attention.self.key.in_features,
            out_features=layer.attention.self.key.out_features,
            r=r,
            alpha=alpha
        )
        layer.attention.self.value = LoRALinear(
            in_features=layer.attention.self.value.in_features,
            out_features=layer.attention.self.value.out_features,
            r=r,
            alpha=alpha
        )
        
        # Replace intermediate and output dense layers with LoRA versions
        # layer.intermediate.dense = LoRALinear(
        #     in_features=layer.intermediate.dense.in_features,
        #     out_features=layer.intermediate.dense.out_features,
        #     r=r,
        #     alpha=alpha
        # )
        # layer.output.dense = LoRALinear(
        #     in_features=layer.output.dense.in_features,
        #     out_features=layer.output.dense.out_features,
        #     r=r,
        #     alpha=alpha
        # )

    return bert_model


In [73]:
def freezing_layers_and_LoRA(custom_bert):
    # Freeze all parameters
    for param in custom_bert.parameters():
        param.requires_grad = False

    # Unfreeze the last layer of the encoder
    # for param in custom_bert.encoder.layer[-1].parameters():
    #     param.requires_grad = True

    # Unfreeze the head
    for param in custom_bert.head.parameters():
        param.requires_grad = True

    # Apply LoRA to the last two layers
    custom_bert = apply_lora(custom_bert, r=4, alpha=16)


    # Count trainable parameters
    def count_trainable_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    trainable_params = count_trainable_parameters(custom_bert)
    print(f"Number of trainable parameters: {trainable_params}")

    return custom_bert

In [74]:
custom_bert = freezing_layers_and_LoRA(custom_bert)

Number of trainable parameters: 2126844


In [None]:
# Load the following model prajjwal1/bert-tiny

# Load the model with the base bert model
model = HangmanNet(checkpoint="prajjwal1/bert-tiny", vocab_size = 26, unfreeze_layers = 2)

In [41]:


# # Check if the encoder layers are approximately equal
# for i in range(12):
#     for key in ['attention.self.query.weight', 'attention.self.query.bias',
#                 'attention.self.key.weight', 'attention.self.key.bias',
#                 'attention.self.value.weight', 'attention.self.value.bias',
#                 'attention.output.dense.weight', 'attention.output.dense.bias',
#                 'attention.output.LayerNorm.weight', 'attention.output.LayerNorm.bias',
#                 'intermediate.dense.weight', 'intermediate.dense.bias',
#                 'output.dense.weight', 'output.dense.bias',
#                 'output.LayerNorm.weight', 'output.LayerNorm.bias']:
#         print(torch.allclose(model.model.encoder.layer[i].__getattr__(key), custom_bert.encoder.layer[i].__getattr__(key), atol=1e-6))


True
True
True
True
True


AttributeError: 'BertLayer' object has no attribute 'attention.self.query.weight'

In [40]:
model.model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSdpaSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [36]:
custom_bert.encoder.layer

ModuleList(
  (0-11): 12 x BertLayer(
    (attention): BertAttention(
      (self): BertSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (output): BertSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (intermediate): BertIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
      (intermediate_act_fn): GELUActivation()
    )
    (output): BertOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

Parameter containing:
tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
        [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
        ...,
        [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
        [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
        [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
       requires_grad=True)