In [1]:
from datasets import load_dataset
import torch.nn as nn
# from transformers.models.gpt2.modeling_gpt2 import GPT2Attention

In [2]:
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")['test']

In [3]:
import torch
class CustomGPT2Attention(torch.nn.Module):
    def __init__(self, nx, n_ctx, config, scale=False, layer_idx=None):
        super().__init__()

        n_state = nx  # hidden_dim (n_embd)
        assert n_state % config.n_head == 0, "n_state must be divisible by n_head"
        self.n_head = config.n_head
        self.head_dim = n_state // config.n_head  # Размер одного "head"
        self.scale = scale
        self.layer_idx = layer_idx
        self.config = config

        # Линейные слои для Q, K, V
        self.q_attn = torch.nn.Linear(nx, n_state, bias=True)
        self.k_attn = torch.nn.Linear(nx, n_state, bias=True)
        self.v_attn = torch.nn.Linear(nx, n_state, bias=True)

        # Проекция после внимания
        self.c_proj = torch.nn.Linear(n_state, nx, bias=True)

        # Dropout
        self.attn_dropout = torch.nn.Dropout(config.attn_pdrop)
        self.resid_dropout = torch.nn.Dropout(config.resid_pdrop)

        # Маски
        self.register_buffer(
            "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx)
        )
        self.register_buffer("masked_bias", torch.tensor(-1e4))
    
    def split_heads(self, x):
        """
        Split heads without separate logic for key or query to ensure compatibility.
        """
        new_shape = x.size()[:-1] + (self.n_head, self.head_dim)
        x = x.view(*new_shape)
        return x.permute(0, 2, 1, 3)  # [batch, num_heads, seq_length, head_dim]

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_shape = x.size()[:-2] + (self.n_head * self.head_dim,)
        return x.view(*new_shape)

    def _attn(self, q, k, v, attention_mask=None, head_mask=None, output_attentions=False):
        w = torch.matmul(q, k)

        # Scaling
        if self.scale:
            w = w / (self.head_dim ** 0.5)
        if getattr(self.config, "scale_attn_by_inverse_layer_idx", False):
            w = w / float(self.layer_idx + 1)

        # Apply causal mask
        mask = self.bias[:, :, : w.size(-2), : w.size(-1)]
        w = torch.where(mask.bool(), w, self.masked_bias.to(w.dtype))

        if attention_mask is not None:
            w = w + attention_mask

        w = nn.functional.softmax(w, dim=-1)
        w = self.attn_dropout(w)

        if head_mask is not None:
            w = w * head_mask

        outputs = (torch.matmul(w, v),)
        if output_attentions:
            outputs += (w,)
        return outputs

    def forward(self, hidden_states, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False):
        query = self.q_attn(hidden_states)
        key = self.k_attn(hidden_states)
        value = self.v_attn(hidden_states)
    
        query = self.split_heads(query)
        key = self.split_heads(key)
        value = self.split_heads(value)
    
        if layer_past is not None:
            past_key, past_value = layer_past
            key = torch.cat((past_key, key), dim=-2)
            value = torch.cat((past_value, value), dim=-2)
    
        if use_cache:
            present = (key, value)
        else:
            present = None
    
        attn_outputs = self._attn(query, key.transpose(-1, -2), value, attention_mask, head_mask, output_attentions)
        a = attn_outputs[0]
    
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
    
        outputs = (a, present) + attn_outputs[1:]
        return outputs

In [4]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
device = "cuda"
model_id = "openai-community/gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

config = model.config
config

GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "openai-community/gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.46.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [5]:
for block in model.transformer.h:
    c_attn_weight = block.attn.c_attn.weight.clone()
    c_attn_bias = block.attn.c_attn.bias.clone()
    c_proj_weight = block.attn.c_proj.weight.clone()
    c_proj_bias = block.attn.c_proj.bias.clone()
    
    
    q_weight, k_weight, v_weight = torch.chunk(c_attn_weight, chunks=3, dim=1)
    q_bias, k_bias, v_bias = torch.chunk(c_attn_bias, chunks=3, dim=0)

    custom_attn = CustomGPT2Attention(nx=config.n_embd, n_ctx=config.n_ctx, config=config)
    custom_attn.q_attn.weight.data = q_weight.T.clone()
    custom_attn.k_attn.weight.data = k_weight.T.clone()
    custom_attn.v_attn.weight.data = v_weight.T.clone()
 
    
    custom_attn.q_attn.bias.data = q_bias.clone()
    custom_attn.k_attn.bias.data = k_bias.clone()
    custom_attn.v_attn.bias.data = v_bias.clone()
    custom_attn.c_proj.weight.data = c_proj_weight.T.clone()
    custom_attn.c_proj.bias.data = c_proj_bias.clone()

    block.attn = custom_attn

In [5]:
context_length = model.config.n_positions
eos_token = tokenizer.eos_token_id
context_length, eos_token

(1024, 50256)

In [6]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.5.1+cu124
True


In [7]:
filtered_data = [text for text in dataset['text'] if len(text) > 3]
filtered_data = [line.replace("\n", " ").strip() for line in filtered_data]

In [8]:
context_length = 512
current_sequence = torch.tensor([], dtype=torch.long)
input_batch = []

for element in filtered_data:
    outputs = tokenizer(
        element,
        return_tensors="pt",
    )
    input_ids = outputs["input_ids"].squeeze(0)

    current_sequence = torch.cat([current_sequence, input_ids])

    while len(current_sequence) >= context_length:
        input_batch.append(current_sequence[:context_length])
        current_sequence = current_sequence[context_length:]

if len(current_sequence) > 0:
    input_batch.append(
        torch.cat([current_sequence, torch.tensor([eos_token])])
    )

In [9]:
input_batch = input_batch[:-1] # last token is less than 1024

In [14]:
print(type(input_batch))
print(len(input_batch))
print(type(input_batch[0]))

<class 'list'>
542
<class 'torch.Tensor'>


In [10]:
from torch.utils.data import TensorDataset, DataLoader

input_data = torch.stack(input_batch)
labels = input_data.clone()

dataset = TensorDataset(input_data, labels)

In [11]:
dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

In [12]:
from transformers import Adafactor

optimizer = Adafactor(
    model.parameters(),
    scale_parameter=True,
    relative_step=True,
    warmup_init=True,
    lr=None
)

In [13]:
torch.cuda.empty_cache()

model = model.to("cuda")

In [14]:
from tqdm import tqdm

epochs = 10
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}"):
        inputs, labels = batch
        inputs, labels = inputs.to("cuda"), labels.to("cuda")

        optimizer.zero_grad()

        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} - Average Loss: {avg_loss:.4f}")

Epoch 1:   0%|          | 0/136 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 392.00 MiB. GPU 0 has a total capacity of 3.70 GiB of which 120.62 MiB is free. Including non-PyTorch memory, this process has 3.05 GiB memory in use. Of the allocated memory 2.90 GiB is allocated by PyTorch, and 52.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Changing Attantion Block in GPT2 model.

In [47]:
model2 = model.to("cuda")
generated = model2.generate(
    inputs.input_ids.to("cuda"),
    max_length=40,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    repetition_penalty=1.5,
)
print(tokenizer.decode(generated[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


I love this world because of   in that, or not ". I and
 for with- it a 2_s is from there ( an so all to have the on by also's


## How quantization works

In [31]:
import torch

# Исходные веса в FP32
weights = torch.randn((4, 4), dtype=torch.float32)

# Преобразование в INT4
scale = 15 / weights.abs().max()  # Масштабируем до диапазона INT4
weights_int4 = torch.round(weights * scale).clamp(-15, 15).to(torch.int8)

# Преобразование в INT6
scale = 31 / weights.abs().max()  # Масштабируем до диапазона INT6
weights_int6 = torch.round(weights * scale).clamp(-31, 31).to(torch.int8)


In [32]:
weights

tensor([[ 1.2927,  0.1660, -0.6576,  1.4048],
        [ 0.3357, -1.2038, -0.5292,  1.2458],
        [-0.0752,  0.0526,  1.4541,  0.2409],
        [ 0.2059, -0.1143, -0.9621, -1.1691]])

In [33]:
weights_int4

tensor([[ 13,   2,  -7,  14],
        [  3, -12,  -5,  13],
        [ -1,   1,  15,   2],
        [  2,  -1, -10, -12]], dtype=torch.int8)

In [34]:
weights_int6

tensor([[ 28,   4, -14,  30],
        [  7, -26, -11,  27],
        [ -2,   1,  31,   5],
        [  4,  -2, -21, -25]], dtype=torch.int8)

In [35]:
weights_restored = weights_int6 / scale
weights_restored

tensor([[ 1.3133,  0.1876, -0.6567,  1.4072],
        [ 0.3283, -1.2195, -0.5160,  1.2664],
        [-0.0938,  0.0469,  1.4541,  0.2345],
        [ 0.1876, -0.0938, -0.9850, -1.1726]])

In [36]:
weights_restored - weights

tensor([[ 0.0207,  0.0216,  0.0009,  0.0023],
        [-0.0073, -0.0158,  0.0132,  0.0207],
        [-0.0186, -0.0057,  0.0000, -0.0063],
        [-0.0182,  0.0205, -0.0229, -0.0035]])