In [1]:
import os

# Expose only GPU index 3 to CUDA. This is assigned before the transformers
# import below, presumably so it takes effect before any CUDA initialisation
# happens in this kernel -- keep this ordering.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
from transformers import BloomForCausalLM, AutoTokenizer

# Tokenizer of the released bloom-560m checkpoint; it is shared by every cell
# below (data collator, Trainer).
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")



In [2]:
# NOTE(review): this cell repeats the import and the tokenizer construction
# from the first cell; it rebinds `tokenizer` to an equivalent object and can
# be dropped once the notebook is cleaned up.
from transformers import BloomForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")


In [3]:
import os


# Reuse the end-of-sequence token as the padding token. The tokenizer is later
# handed to DataCollatorForLanguageModeling, which pads batches.
# NOTE(review): with pad == eos the collator presumably also masks genuine EOS
# positions out of the labels -- confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

from torch import nn
import torch


def compute_loss_labelsmoothed(logits, labels, ignore_index=-100, epsilon=0.1):
    """Label-smoothed next-token cross-entropy for a causal language model.

    The logits are shifted by one position against the labels (the logits at
    position ``t`` are scored against the label at position ``t + 1``), so
    callers pass unshifted tensors.

    Args:
        logits: tensor whose last dimension is the vocabulary and whose
            second-to-last dimension is the sequence position.
        labels: integer tensor of target ids with one dimension fewer than
            ``logits`` (or the same rank with a trailing dimension of 1).
            Positions equal to ``ignore_index`` do not contribute to the loss.
        ignore_index: label value marking positions to skip.
        epsilon: smoothing weight; the result is
            ``(1 - epsilon) * nll + epsilon * mean_over_vocab(-log p)``.

    Returns:
        A scalar tensor. If no position is active (every shifted label equals
        ``ignore_index``, or the sequence has a single position) the result is
        0 rather than NaN.
    """
    # Drop the last prediction and the first label so that position t predicts t + 1.
    logits = logits[..., :-1, :].contiguous()
    labels = labels[..., 1:].contiguous()

    # Negative log-probabilities: larger value == less likely token.
    log_probs = -nn.functional.log_softmax(logits, dim=-1)
    if labels.dim() == log_probs.dim() - 1:
        labels = labels.unsqueeze(-1)

    padding_mask = labels.eq(ignore_index)

    # gather() cannot index with a negative ignore_index; clamp to a valid id.
    # The affected positions are zeroed out through padding_mask below.
    labels = torch.clamp(labels, min=0)
    nll_loss = log_probs.gather(dim=-1, index=labels)

    # Accumulate over the vocabulary in float32 to stay accurate under fp16.
    smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32)

    nll_loss.masked_fill_(padding_mask, 0.0)
    smoothed_loss.masked_fill_(padding_mask, 0.0)

    # Guard against 0 / 0 == NaN when every target is ignored: both numerators
    # are exactly 0 in that case, so dividing by 1 yields a clean zero loss.
    num_active_elements = (
        padding_mask.numel() - padding_mask.long().sum()
    ).clamp(min=1)
    nll_loss = nll_loss.sum() / num_active_elements
    smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1])
    return (1 - epsilon) * nll_loss + epsilon * smoothed_loss

    
class MyNet(nn.Module):
    """BLOOM causal LM with additional linear "early exit" vocabulary heads.

    One ``nn.Linear(hidden_size, vocab_size)`` head is attached to each
    selected intermediate hidden state of the wrapped transformer. With
    ``labels`` the forward pass returns the summed label-smoothed loss of all
    heads; without ``labels`` it only returns token probabilities.
    """

    def __init__(self, revision, layers):
        """
        Args:
            revision: revision of "bigscience/bloom-560m-intermediate" to load.
            layers: iterable of indices into ``output.hidden_states[1:-1]``;
                one early-exit head is created per index, in the given order.
        """
        super().__init__()
        self.transformer = BloomForCausalLM.from_pretrained(
            "bigscience/bloom-560m-intermediate",
            revision=revision,
        )

        hid_size = self.transformer.config.hidden_size
        self.voc_size = self.transformer.config.vocab_size

        # Materialise the indices once: `layers` is iterated to build the heads
        # and indexed again in forward(), which would break for a one-shot
        # iterable (and would alias a caller-owned list).
        self._hidden_indices = list(layers)
        self.early_exits = nn.ModuleList([
            nn.Linear(hid_size, self.voc_size) for _ in self._hidden_indices
        ])
        print(f'will be using outputs of {self._hidden_indices} layers')
        # Not used by forward(); kept so the attribute remains available.
        self.ce = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the transformer and every early-exit head.

        Args:
            input_ids: token ids passed to the wrapped transformer.
            attention_mask: attention mask passed to the wrapped transformer.
            labels: optional target ids; when given, a loss is returned.

        Returns:
            dict with
              * ``'head_outputs'``: list (one entry per early-exit head) of
                softmax probabilities over the vocabulary,
              * ``'last_head'``: softmax of the transformer's own logits,
              * ``'loss'`` (only when ``labels`` is given): sum over heads of
                ``compute_loss_labelsmoothed``.
            With ``labels`` the probabilities are detached from the graph;
            without ``labels`` they keep their gradient history.
        """
        # `labels` is intentionally not forwarded: the transformer's own LM
        # loss was never read, so computing it was a wasted full-vocabulary
        # cross-entropy on every step.
        output = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )

        # Drop the first and the last hidden state (presumably the embedding
        # output and the final block's output -- confirm); `_hidden_indices`
        # index into what remains.
        hidden_states = output.hidden_states[1:-1]
        heads_logits = [
            head(hidden_states[layer_idx])
            for head, layer_idx in zip(self.early_exits, self._hidden_indices)
        ]

        if labels is None:
            # Probabilities stay attached to the graph so callers can
            # backpropagate through them.
            probs = self._to_probabilities(heads_logits, output.logits)
            return {'head_outputs': probs[0], 'last_head': probs[1]}

        total_loss = torch.stack([
            compute_loss_labelsmoothed(head_logits, labels)
            for head_logits in heads_logits
        ]).sum()

        # Only reported alongside the loss, so no graph is needed here.
        with torch.no_grad():
            heads_outputs, last_head = self._to_probabilities(heads_logits, output.logits)

        return {
            'loss': total_loss,
            'head_outputs': heads_outputs,  # [num_heads][bs][seq_len][vocab_size]
            'last_head': last_head,
        }

    @staticmethod
    def _to_probabilities(heads_logits, last_logits):
        """Softmax every head's logits and the final logits over the vocabulary.

        heads_outputs[i][bs][seqlen][tok_num] = P(head i puts token tok_num at
        position seqlen).
        """
        heads_outputs = [torch.softmax(logits, dim=-1) for logits in heads_logits]
        return heads_outputs, torch.softmax(last_logits, dim=-1)
        

In [4]:
import torch

In [5]:
# heads_output = torch.randn(12, 1, 10, 100)

# tokenized_input = {'input_ids': torch.randint(high=100, size=(1, 10))}

# # head_outputs[i][bs][seqlen][tok_num] = P(из i-го слоя на seqlen месте стоит токен tok_num)

# # (хотим взять для 5го токена градиенты из первой головы)

# # head_outputs = model(**tokenized_input)['head_outputs']
# # 
# # готовим вероятности и one-hot target
# ho = heads_output[1]
# token_probs = ho[:, 5 - 1]
# token = tokenized_input['input_ids'][:, 5]
# # # one_hot_token = torch.nn.functional.one_hot(token, num_classes = 100)

# loss_fn = torch.nn.CrossEntropyLoss()

# loss = loss_fn(token_probs, token, )

# # optimizer = torch.optim.Adam(model.parameters())
# # optimizer.zero_grad()
# # loss.backward()

# for param_name, param in model.named_parameters():
#     print(f'gradient for {param_name} =', param.grad)



In [6]:
# I have a dictionary with the probabilities of predicting the correct token.
# Next, I pick word tokens and look at when they were predicted well and when they were not.

In [7]:
from transformers import DataCollatorForLanguageModeling

2023-04-18 21:04:00.673241: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-18 21:04:02.909495: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Directory the tokenized dataset is read from with load_from_disk (and the
# target of the commented-out save_to_disk call in the next cell).
dataset_path = 'files/dataset'
# cache_dir for the commented-out load_dataset call in the next cell.
dataset_cache = 'files/.cache'

In [9]:
from datasets import load_dataset
import numpy as np

# One-off preprocessing, kept commented out for reference. It loaded the
# "wikipedia" / "20220301.en" dataset, selected 500_000 random training rows
# (no seed is set), tokenized them with truncation to 512 tokens and saved the
# result to `dataset_path`. Only the load_from_disk call at the bottom of this
# cell is executed.

# dataset = load_dataset("wikipedia", "20220301.en", cache_dir=dataset_cache)

# rand_idx = np.random.choice(np.arange(len(dataset['train'])), size=500_000, replace=False)

# # import json
# # rand_idx = json.load(open('indices.json', 'r'))

# dataset = dataset['train'].select(rand_idx, )

# # import json

# # json.dump(rand_idx.tolist(), open('indices.json', 'w'),)

# def tokenize_data(example):
#     return tokenizer(example['text'], max_length=512, truncation=True)

# dataset = dataset.map(
#     tokenize_data, remove_columns=['text', 'id', 'url', 'title'], batched=True, num_proc=10
# )

# dataset.save_to_disk(dataset_path)


from datasets import load_from_disk

# Load the dataset stored under `dataset_path`; it is passed to the Trainer
# as train_dataset further down.
dataset = load_from_disk(dataset_path)

In [10]:
# 10000, 100000, 300000, 400000, 500000, 600000

In [None]:
import gc

from transformers import Trainer, TrainingArguments

# Causal-LM collator (mlm=False): pads each batch and builds the labels.
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Release cached blocks left over from earlier cells before the first model loads.
torch.cuda.empty_cache()

# Train a separate set of early-exit heads on top of each intermediate
# BLOOM checkpoint.
for global_step in [100000, 300000, 600000]:
    print(f'doing for {global_step=}',)

    net = MyNet(f'global_step{global_step}',  [3, 14, 21]).to('cuda')
    # NOTE(review): Trainer.train() presumably switches the model back to
    # train mode, so this eval() has no lasting effect -- confirm.
    net = net.eval()

    # Freeze the wrapped transformer; only the early-exit heads are trained.
    net.transformer.requires_grad_(False)

    trainer = Trainer(
        model=net,
        args=TrainingArguments(
            f'/workspace/mnt/bloom-models/bloom-{global_step}',
            # max_steps below is positive, so it determines the length of the
            # run rather than num_train_epochs.
            num_train_epochs=1,
            # 2 sequences x 2 accumulation steps = 4 sequences per optimizer
            # step on each device.
            per_device_train_batch_size=2,
            gradient_accumulation_steps=2,
            save_total_limit=2,
            save_steps=1000,
            fp16=True,
            logging_steps=100,
            max_steps=80_000,
        ),
        train_dataset=dataset,
        data_collator=collator,
        tokenizer=tokenizer,
    )
    trainer.train()
    trainer.save_model(f"/workspace/mnt/bloom-models/bloom-final-global-step-{global_step}")

    # The trainer holds its own reference to the model (and owns the optimizer
    # state), so `del net` alone would keep everything on the GPU while the
    # next checkpoint is being loaded. Drop both, collect reference cycles,
    # then return the cached blocks to the driver.
    del trainer, net
    gc.collect()
    torch.cuda.empty_cache()

doing for global_step=100000


Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

will be using outputs of [3, 14, 21] layers


You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,40.9998
200,34.1309
300,29.9115
400,27.3692
500,25.4171
600,24.3997
700,23.4564
800,23.0436
900,22.5167
1000,22.2357


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



doing for global_step=600000


Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

will be using outputs of [3, 14, 21] layers




Step,Training Loss
100,35.0118
200,29.4202
300,27.6972
400,26.8259
500,26.1565
600,25.918
700,25.5563
800,25.4904
900,25.2489
1000,25.1841


IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed