In [None]:
!pip install accelerate
!pip install datasets
from datasets import load_dataset
# !pip install evaluate
!pip install transformers
# !pip install git+https://github.com/huggingface/transformers
import numpy as np
import torch
import matplotlib.pyplot as plt
import functools
import pickle
# Seed NumPy's legacy global RNG so that np.random.* calls are repeatable across runs.
# Generators created with np.random.default_rng() are independent of this seed.
np.random.seed(2024)



In [None]:
# Modified code below to move model out of function
# Load the pretrained config, tokenizer and encoder once at module level so later cells can share them.

import torch
# from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import BertModel, BertConfig, BertTokenizer

# Hugging Face Hub identifier shared by the config, tokenizer and weights below.
model_name = "huawei-noah/TinyBERT_General_4L_312D"
config = BertConfig.from_pretrained(model_name)#, output_hidden_states=True, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: this module-level model is not moved to `device` in this cell.
model = BertModel.from_pretrained(model_name, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the paragraph dataset from the Hugging Face Hub; later cells read the
# 'text' column of its 'train' split.
dataset = load_dataset("abokbot/wikipedia-first-paragraph")

In [None]:
def get_inputs(model, texts):
    """Push every text through `model`, one text per forward pass.

    Nothing is returned: the function exists purely for the side effects of
    the forward passes (for example, forward hooks registered on `model`).
    Relies on the module-level `tokenizer` and `device`.

    Args:
        model: callable accepting the tokenizer's output as keyword arguments.
        texts: iterable of strings; each one is truncated to at most 512 tokens.
    """
    # Gradients are never needed here, so disable autograd for the whole sweep.
    with torch.no_grad():
        for paragraph in texts:
            encoded = tokenizer(paragraph, return_tensors="pt", max_length=512, truncation=True)
            # Moves the encoded tensors onto the target device in place.
            encoded.to(device)
            model(**encoded)

    return None

def sample_X(num_paragraphs, num_rows, text_file_name, idxs_file_name, layer):
    """Capture the inputs of five dense sub-modules of one encoder layer and sub-sample rows.

    A fresh copy of the pretrained encoder is loaded and forward pre-hooks are
    attached to the query, key, attention-output, intermediate and output dense
    modules of ``model.encoder.layer[layer]``. ``num_paragraphs`` randomly chosen
    paragraphs from ``dataset['train']`` are then run through the model, the
    captured inputs are concatenated across paragraphs along the first axis,
    and ``num_rows`` rows are sampled using the same row indices for every
    captured module.

    Both random draws use NumPy's global RNG, so a prior ``np.random.seed(...)``
    makes the selection repeatable.

    Args:
        num_paragraphs: number of paragraphs to draw from the training split.
        num_rows: number of rows to keep for each captured module.
        text_file_name: path where the list of chosen paragraphs is pickled.
        idxs_file_name: path where the sampled row indices (a torch tensor) are pickled.
        layer: index into ``model.encoder.layer`` selecting the layer to instrument.

    Returns:
        dict mapping ``attention_{layer}_Q``, ``attention_{layer}_K``,
        ``attention_{layer}_FF``, ``FF_intermediate_{layer}`` and
        ``FF_output_{layer}`` to a tensor holding the ``num_rows`` sampled rows.
    """
    model = BertModel.from_pretrained(model_name, config=config, device_map=device)
    model = model.to(device)

    encoder_layer = model.encoder.layer[layer]
    # Capture name -> module whose (pre-forward) input should be recorded.
    hooked_modules = {
        f'attention_{layer}_Q': encoder_layer.attention.self.query,
        f'attention_{layer}_K': encoder_layer.attention.self.key,
        f'attention_{layer}_FF': encoder_layer.attention.output.dense,
        f'FF_intermediate_{layer}': encoder_layer.intermediate.dense,
        f'FF_output_{layer}': encoder_layer.output.dense,
    }
    inputs = {name: list() for name in hooked_modules}

    def get_input(name):
        def hook(module, args):
            # `args` is the tuple of positional inputs; keep the first argument's
            # first batch entry (get_inputs feeds one text per forward pass).
            inputs[name].append(args[0][0])
        return hook

    handles = [
        module.register_forward_pre_hook(get_input(name))
        for name, module in hooked_modules.items()
    ]
    try:
        # Draw paragraph indices instead of shuffling dataset['train']['text']:
        # indexing the column builds a fresh object on every access, so an
        # in-place shuffle of it never affected the slice taken afterwards.
        train_split = dataset['train']
        paragraph_ids = np.random.permutation(len(train_split))[:num_paragraphs]
        input_text = list(train_split.select(paragraph_ids)['text'])
        with open(text_file_name, 'wb') as f:
            pickle.dump(input_text, f)

        get_inputs(model, input_text)
    finally:
        # Detach the hooks so they stop recording once the sweep is over.
        for handle in handles:
            handle.remove()

    Xs_complete = {key: torch.cat(tuple(values)) for key, values in inputs.items()}

    print(*(Xs_complete[key].shape for key in hooked_modules))

    total_rows = Xs_complete[f'attention_{layer}_Q'].shape[0]
    # Use the seeded global RNG (not a fresh unseeded Generator) and avoid
    # duplicate rows whenever enough distinct rows are available.
    idxs = np.random.choice(total_rows, num_rows, replace=num_rows > total_rows)
    idxs = torch.tensor(idxs)
    with open(idxs_file_name, 'wb') as f:
        pickle.dump(idxs, f)

    # Apply the same row selection to every captured module.
    Xs_small = {key: values[idxs] for key, values in Xs_complete.items()}
    return Xs_small

In [None]:
# Weight matrix of encoder layer 0's intermediate dense module, as a NumPy array.
# NOTE(review): W is taken from layer 0 while the samples below are captured at layer 3 — confirm this is intended.
W = model.encoder.layer[0].intermediate.dense.weight.detach().cpu().numpy()
# Capture layer-3 inputs: 5000 paragraphs / 5000 sampled rows for train, 500 / 500 for test.
X_train = sample_X(5000, 5000, 'train_input_text.pkl', 'Xs_train_small_idxs.pkl', 3)
X_test = sample_X(500, 500, 'test_input_text.pkl', 'Xs_test_small_idxs.pkl', 3)

torch.Size([459971, 312]) torch.Size([459971, 312]) torch.Size([459971, 312]) torch.Size([459971, 312]) torch.Size([459971, 1200])
torch.Size([44748, 312]) torch.Size([44748, 312]) torch.Size([44748, 312]) torch.Size([44748, 312]) torch.Size([44748, 1200])


In [None]:
# Persist each sampled matrix as its own .npy file, named after the split and
# the captured module; tensors are moved to the CPU before saving.
for key, value in X_train.items():
  np.save(f"X_train_{key}.npy", value.cpu())

for key, value in X_test.items():
  np.save(f"X_test_{key}.npy", value.cpu())