In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '3'
from transformers import BloomForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m", add_prefix_space=True)

# model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m-intermediate", revision='global_step10000')


In [2]:
import os

# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

from torch import nn
import torch


def compute_loss_labelsmoothed(logits, labels, ignore_index=-100, epsilon=0.1):
    logits = logits[..., :-1, :].contiguous()
    labels = labels[..., 1:].contiguous()

    log_probs = -nn.functional.log_softmax(logits, dim=-1)
    if labels.dim() == log_probs.dim() - 1:
        labels = labels.unsqueeze(-1)

    padding_mask = labels.eq(ignore_index)

    labels = torch.clamp(labels, min=0)
    nll_loss = log_probs.gather(dim=-1, index=labels)

    smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)

    nll_loss.masked_fill_(padding_mask, 0.0)
    smoothed_loss.masked_fill_(padding_mask, 0.0)

    num_active_elements = padding_mask.numel() - padding_mask.long().sum()
    nll_loss = nll_loss.sum() / num_active_elements
    smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
    return (1 - epsilon) * nll_loss + epsilon * smoothed_loss

    
class MyNet(nn.Module): #t5 сложнее создавать создавать датасет для лм так как много пришлось бы возиться с префиксами
    def __init__(self, revision, layers):
        super().__init__()
#         self.transformer = GPT2LMHeadModel.from_pretrained('gpt2')
        self.transformer = BloomForCausalLM.from_pretrained(
            "bigscience/bloom-560m-intermediate",
            revision=revision,
        )
        
        hid_size = self.transformer.config.hidden_size
        self.voc_size = self.transformer.config.vocab_size
        
        self.early_exits = nn.ModuleList([
            nn.Linear(hid_size, self.voc_size) for _ in layers
        ])
        self._hidden_indices = layers
        print(f'will be using outputs of {self._hidden_indices} layers')
        self.ce = nn.CrossEntropyLoss()
    
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True,
        )
        
        hidden_states = output.hidden_states[1:-1]
        heads_outputs = [
            self.early_exits[idx](hidden_states[self._hidden_indices[idx]]) 
            for idx in range(0, len(self.early_exits))
        ]
                
        if labels is None:
            heads_outputs = [
                torch.softmax(head_output, dim=-1) for head_output in heads_outputs
            ] # h_os[i][bs][seqlen][tok_num] = P(из i-го слоя на seqlen месте стоит токен tok_num)

            return {'head_outputs': heads_outputs, 'last_head': torch.softmax(output.logits, dim=-1)}
                
        losses = [
            compute_loss_labelsmoothed(head_output, labels)
            for head_output in heads_outputs
        ]
        
        losses = torch.stack(losses,)
        total_loss = torch.sum(losses)
        
        heads_outputs = [
            torch.softmax(head_output, dim=-1).detach() for head_output in heads_outputs
        ] # h_os[i][bs][seqlen][tok_num] = P(из i-го слоя на seqlen месте стоит токен tok_num)

        return {
            'loss': total_loss, 
            'head_outputs': heads_outputs,  # [num_layers, bs=1, seq_len, vocab_size] # {token: {layer_num: [probabilities, ... ]}}
            'last_head': torch.softmax(output.logits, dim=-1).detach(),
        }
        

In [3]:
dataset_path = 'files/dataset_test'
dataset_cache = 'files/.cache'

In [4]:
from datasets import load_dataset
import numpy as np

# dataset = load_dataset("wikipedia", "20220301.en", dataset_cache)

# rand_idx = np.random.choice(np.arange(len(dataset['train'])), size=100_000, replace=False)

# # import json
# # rand_idx = json.load(open('indices.json', 'r'))

# dataset = dataset['train'].select(rand_idx, )

# # # import json

# # # json.dump(rand_idx.tolist(), open('indices.json', 'w'),)

# # def tokenize_data(example):
# #     return tokenizer(example['text'], max_length=512, truncation=True)

# # dataset = dataset.map(
# #     tokenize_data, remove_columns=['text', 'id', 'url', 'title'], batched=True, num_proc=10
# # )

# dataset.save_to_disk(dataset_path)


from datasets import load_from_disk

dataset = load_from_disk(dataset_path)

In [5]:
from transformers import DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# net = MyNet("global_step100000", [3, 14, 21]).to('cuda')
# net = net.eval()

from transformers import Trainer

# net.load_state_dict(torch.load('../mnt/bloom-models/bloom-final-global-step-100000/pytorch_model.bin'))

# net.eval();

2023-05-15 18:17:57.240480: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-15 18:17:57.318363: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from torch.nn import CrossEntropyLoss

In [7]:
loss_ce = CrossEntropyLoss()

In [8]:
device = 'cuda'

In [9]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [15]:
def get_sentence_grads(net, sent, layer_num):
    words = sent.lower().split()
    optimizer = torch.optim.SGD(net.parameters(), lr=1e-1)
    
    tokenized = tokenizer(
        words, truncation=True, max_length=512, is_split_into_words=True, return_tensors='pt'
    ).to(device)
    
    outputs = net(**tokenized)
    heads_probs = outputs['head_outputs'] + [outputs['last_head']]
    heads_probs = torch.stack(heads_probs)  # [num_layers, 1, seq_len, voc_size]
    
    return_dict = defaultdict(lambda: defaultdict(list))
    
    word_ids = tokenized.word_ids()
    
    heads_preds_per_words = []
    
    per_head_norm = []
    for i in range(len(word_ids) - 1):
        word_id = word_ids[i + 1]

        gt_token_id = tokenized['input_ids'][0, i + 1]
        heads_predictions = heads_probs[layer_num, 0, i, :]

        assert heads_predictions.requires_grad
        head_loss = loss_ce(heads_predictions, gt_token_id)
        head_loss.backward(retain_graph=True)

        norm = torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
        optimizer.zero_grad()

        per_head_norm.append(norm)

    heads_preds_per_words.append(torch.stack(per_head_norm).cpu().numpy().tolist())
    
    # heads_preds_per_words: [num_layers, len(word_ids) - 1]
    
    heads_preds_per_words = np.array(heads_preds_per_words)

    prev_word_id = 0
    cur_prob = np.zeros(heads_preds_per_words.shape[0]) # [num_layers, ]
    cur_count = 0
    
    for i in range(len(word_ids) - 1):
        word_id = word_ids[i + 1]
        heads_prediction = heads_preds_per_words[:, i] # [num_layers, ]
        
        if word_id == prev_word_id:
            cur_count += 1
            for layer in range(len(heads_prediction)):
                cur_prob[layer] += heads_prediction[layer]
        else:
        
            word_key = lemmatizer.lemmatize(words[word_id])
            
            layer=0
            return_dict[word_key][layer].append(cur_prob[layer] / min(cur_count, 1))
            
            cur_prob = heads_prediction
            cur_count = 1
            
    for k, v in return_dict.items():
        for kv, arrs in v.items():
            return_dict[k][kv] = np.mean(np.stack(arrs), axis=0)
            
    return return_dict
    # {word: {layer: [probs, ...]}}

In [16]:
from tqdm.auto import tqdm
import joblib
from collections import defaultdict

In [12]:
for model_path, name in zip([
#     '../mnt/bloom-models/bloom-final-global-step-100000/pytorch_model.bin',
#      '../mnt/bloom-models/bloom-final-global-step-300000/pytorch_model.bin',
     '../mnt/bloom-models/bloom-final-global-step-600000/pytorch_model.bin',
     '../mnt/bloom-models/bloom-10000/checkpoint-91000/pytorch_model.bin',
], ['bloom-600000', 'bloom-10000']): #['bloom-100000', 'bloom-300000', 'bloom-600000', 'bloom-10000']):
    torch.cuda.empty_cache()

    net = MyNet("global_step100000", [3, 14, 21]).to(device)
    net.load_state_dict(torch.load(model_path))
    
    for name, param in net.named_parameters():
        if 'early_exits.0' not in name:
            param.requires_grad_(False)

    word_stats = defaultdict(lambda: defaultdict(list))

    for i in tqdm(range(2_000)):
        example = dataset[i]

        return_dict = get_sentence_grads(net, example['text'])

        for word in return_dict.keys():
            for layer in return_dict[word].keys():
                word_stats[word][layer].append(return_dict[word][layer])

    word_stats_dict = {k: dict(v) for k, v in word_stats.items()}
    joblib.dump(word_stats_dict, f'../mnt/{name}_grads_layer3.joblib')
    del net


will be using outputs of [3, 14, 21] layers


  0%|          | 0/2000 [00:00<?, ?it/s]

  return_dict[word_key][layer].append(cur_prob[layer] / min(cur_count, 1))


will be using outputs of [3, 14, 21] layers


  0%|          | 0/2000 [00:00<?, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [17]:
def get_run(model_path, layer_name, save_name, layer_num):
    torch.cuda.empty_cache()

    net = MyNet("global_step100000", [3, 14, 21]).to(device)
    net.load_state_dict(torch.load(model_path))
    
    for name, param in net.named_parameters():
        if layer_name not in name:
            param.requires_grad_(False)
        else:
            print(name)

    word_stats = defaultdict(lambda: defaultdict(list))

    for i in tqdm(range(1_000)):
        example = dataset[i]

        return_dict = get_sentence_grads(net, example['text'], layer_num)

        for word in return_dict.keys():
            for layer in return_dict[word].keys():
                word_stats[word][layer].append(return_dict[word][layer])
#                 assert 0 not in word_stats[word][layer]

    word_stats_dict = {k: dict(v) for k, v in word_stats.items()}
    joblib.dump(word_stats_dict, save_name)
    del net

In [18]:
get_run('../mnt/bloom-models/bloom-10000/checkpoint-91000/pytorch_model.bin',
        layer_name='early_exits.1', save_name='../mnt/grads_layer14_bloom-10000.joblib', layer_num=1)

get_run('../mnt/bloom-models/bloom-final-global-step-600000/pytorch_model.bin',
        layer_name='early_exits.0', save_name='../mnt/grads_layer3_bloom-600k.joblib', layer_num=0)

get_run( '../mnt/bloom-models/bloom-final-global-step-600000/pytorch_model.bin',
        layer_name='early_exits.1', save_name='../mnt/grads_layer14_bloom-600k.joblib', layer_num=1)

will be using outputs of [3, 14, 21] layers
early_exits.1.weight
early_exits.1.bias


  0%|          | 0/1000 [00:00<?, ?it/s]

  return_dict[word_key][layer].append(cur_prob[layer] / min(cur_count, 1))


will be using outputs of [3, 14, 21] layers
early_exits.0.weight
early_exits.0.bias


  0%|          | 0/1000 [00:00<?, ?it/s]

will be using outputs of [3, 14, 21] layers
early_exits.1.weight
early_exits.1.bias


  0%|          | 0/1000 [00:00<?, ?it/s]