### Low-Rank Adaptation (LoRA)

<img src="https://heidloff.net/assets/img/2023/08/lora.png" alt="drawing" width="600"/>

__Задание 1 (3 балла).__ Реализуйте самостоятельно модуль LoRA для эффективного обучения LLM по схеме, описанной в [статье](https://arxiv.org/pdf/2106.09685). Встройте его в свою любимую LLM и убедитесь, что ошибка убывает при обучении параметров LoRA на безусловную генерацию. Для этого возьмите любые данные на свой выбор. Замерьте насколько уменьшилось число обучаемых параметров, как изменилась скорость во время forward и backward процессов и как изменились затраты по памяти. Сделайте выводы и напишите о них в отчете.

In [3]:
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from transformers.models.gpt2.modeling_gpt2 import Conv1D

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

In [5]:
dataset = load_dataset("stas/openwebtext-10k", split="train", trust_remote_code=True)
dataset

Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['text'],
    num_rows: 10000
})

In [6]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=256,
    )

tokenized = dataset.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

train_dataset = tokenized

collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [7]:
class LoRAConv1D(nn.Module):
    def __init__(self, base: Conv1D, r=8, alpha=16):
        super().__init__()
        self.base = base
        in_f, out_f = base.weight.shape
        self.r = r
        self.alpha = alpha
        self.scale = alpha / r

        self.lora_A = nn.Parameter(torch.randn(in_f, r) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(r, out_f))

        for p in base.parameters():
            p.requires_grad = False

    def forward(self, x):
        out = self.base(x)
        x2 = x.view(-1, x.shape[-1])
        lora = x2 @ self.lora_A @ self.lora_B
        lora = lora.view(*x.shape[:-1], -1)
        return out + self.scale * lora

In [8]:
def add_lora(model, r=8, alpha=16):
    for name, module in model.named_modules():
        if name.endswith(".attn"):
            if isinstance(module.c_attn, Conv1D):
                module.c_attn = LoRAConv1D(module.c_attn, r, alpha)
            if isinstance(module.c_proj, Conv1D):
                module.c_proj = LoRAConv1D(module.c_proj, r, alpha)

    for p in model.parameters():
        p.requires_grad = False

    for n, p in model.named_parameters():
        if "lora_" in n:
            p.requires_grad = True

    return model

In [9]:
def print_params(model, title=""):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"{title}TOTAL PARAMS: {total:,}")
    print(f"{title}TRAINABLE PARAMS: {trainable:,}")
    print(f"{title}PERCENT TRAINABLE: {100 * trainable / total:.4f}%")

In [10]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

model = add_lora(model, r=8, alpha=16)
model.to("cuda")

print_params(model, "[LoRA] ")

[LoRA] TOTAL PARAMS: 124,882,176
[LoRA] TRAINABLE PARAMS: 442,368
[LoRA] PERCENT TRAINABLE: 0.3542%


In [11]:
args = TrainingArguments(
    output_dir="./lora_output",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-4,
    logging_steps=50,
    save_steps=1000000,
    report_to="none"
)


In [12]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=collator,
)

In [13]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,3.42
100,3.3439
150,3.2939
200,3.2554
250,3.2535
300,3.2671


TrainOutput(global_step=313, training_loss=3.303958746571891, metrics={'train_runtime': 66.283, 'train_samples_per_second': 150.868, 'train_steps_per_second': 4.722, 'total_flos': 1313254932480000.0, 'train_loss': 3.303958746571891, 'epoch': 1.0})

In [14]:
import time

def measure_speed_and_memory(model, tokenizer, collator, dataset, steps=30):
    model.eval()
    model.to("cuda")

    from torch.utils.data import DataLoader
    dl = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=collator)

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    it = iter(dl)

    times = []
    for _ in range(steps):
        batch = next(it)

        for k in batch:
            batch[k] = batch[k].to("cuda")

        torch.cuda.synchronize()
        start = time.time()

        model.zero_grad(set_to_none=True)
        out = model(**batch)
        loss = out.loss
        loss.backward()

        torch.cuda.synchronize()
        end = time.time()

        times.append(end - start)

    avg_step = sum(times) / len(times)
    max_mem = torch.cuda.max_memory_allocated() / 1024**2

    print(f"Average step time (forward+backward): {avg_step:.4f} sec")
    print(f"Peak GPU memory: {max_mem:.1f} MB")

    return avg_step, max_mem


In [15]:
print("=== Measuring LoRA model performance ===")
avg_time, peak_mem = measure_speed_and_memory(model, tokenizer, collator, train_dataset)


=== Measuring LoRA model performance ===
Average step time (forward+backward): 0.0486 sec
Peak GPU memory: 2137.6 MB


In [16]:
model_full = GPT2LMHeadModel.from_pretrained("gpt2")
model_full.config.pad_token_id = tokenizer.pad_token_id
model_full.config.use_cache = False

# full finetuning — все параметры trainable
for p in model_full.parameters():
    p.requires_grad = True

model_full.to("cuda")

print("=== Full FT parameters ===")
print_params(model_full, "[FULL] ")

print("=== Measuring FULL FT speed ===")
full_time, full_mem = measure_speed_and_memory(model_full, tokenizer, collator, train_dataset)


'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 31571926-422a-405c-bece-dcdbddcb7c26)')' thrown while requesting HEAD https://huggingface.co/gpt2/resolve/main/config.json
Retrying in 1s [Retry 1/5].


=== Full FT parameters ===
[FULL] TOTAL PARAMS: 124,439,808
[FULL] TRAINABLE PARAMS: 124,439,808
[FULL] PERCENT TRAINABLE: 100.0000%
=== Measuring FULL FT speed ===
Average step time (forward+backward): 0.0629 sec
Peak GPU memory: 2820.2 MB


In [17]:

print_params(model, "[LoRA] ")

print(f"\nLoRA avg step time:   {avg_time:.4f} sec")
print(f"LoRA peak memory:     {peak_mem:.1f} MB")


[LoRA] TOTAL PARAMS: 124,882,176
[LoRA] TRAINABLE PARAMS: 442,368
[LoRA] PERCENT TRAINABLE: 0.3542%

LoRA avg step time:   0.0486 sec
LoRA peak memory:     2137.6 MB


In [18]:
print(f"\nFull FT avg step time: {full_time:.4f} sec")
print(f"Full FT peak memory:   {full_mem:.1f} MB")

reduction_ratio = 100 * (1 - avg_time/full_time)
print(f"Speed improvement:     {reduction_ratio:.1f}% faster")



Full FT avg step time: 0.0629 sec
Full FT peak memory:   2820.2 MB
Speed improvement:     22.7% faster
