In [1]:
from typing import Any
import torch

import torch.nn.functional as F

from nnsight import LanguageModel, util
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class LORA(torch.nn.Module):
    def __init__(self, r: int) -> None:
        super(LORA, self).__init__()
        self.r = r

        self.WA = torch.nn.Parameter(torch.empty(768, self.r), requires_grad=True)
        self.WB = torch.nn.Parameter(torch.empty(self.r, 768), requires_grad=True)

    def __call__(self, module_input, module_output, alpha:float=1.0) -> Any:

        inp = module_input[0][0]
        out = module_output

        return (torch.matmul(torch.matmul(inp, self.WA), self.WB) + out) * alpha

In [3]:
model = LanguageModel("gpt2", device_map="cuda:0")

n_tokens = 10
epochs = 3
answer = "Paris"
answer_tokens = model.tokenizer(answer)
answer_token = answer_tokens["input_ids"][0]

lora = LORA(10)
lora.to("cuda:0")

optimizer = torch.optim.AdamW(lora.parameters(), lr=.1)
dataset = [[" ".join(["_"] * n_tokens), answer_token]] * 100
dataloader = DataLoader(dataset, batch_size=10)

In [4]:
for epoch in range(epochs):
    print(epoch)

    for i, (inputs, targets) in enumerate(dataloader):
        print(f"  {i}")

        optimizer.zero_grad()

        with model.trace(inputs) as tracer:

            model.transformer.h[0].mlp.output = lora(model.transformer.h[0].mlp.input, model.transformer.h[0].mlp.output)

            logits = model.lm_head.output.save()

        loss = F.cross_entropy(logits.value[:, -1], targets.to("cuda:0"))

        print(loss)

        loss.backward()

        optimizer.step()

0
  0


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor(20.6252, device='cuda:0', grad_fn=<NllLossBackward0>)
  1
tensor(20.6252, device='cuda:0', grad_fn=<NllLossBackward0>)
  2
tensor(20.6252, device='cuda:0', grad_fn=<NllLossBackward0>)
  3
tensor(20.6252, device='cuda:0', grad_fn=<NllLossBackward0>)
  4
tensor(20.5754, device='cuda:0', grad_fn=<NllLossBackward0>)
  5
tensor(12.2831, device='cuda:0', grad_fn=<NllLossBackward0>)
  6
tensor(5.1909, device='cuda:0', grad_fn=<NllLossBackward0>)
  7
tensor(4.9941, device='cuda:0', grad_fn=<NllLossBackward0>)
  8
tensor(1.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
  9
tensor(0.0077, device='cuda:0', grad_fn=<NllLossBackward0>)
1
  0
tensor(0.0002, device='cuda:0', grad_fn=<NllLossBackward0>)
  1
tensor(1.5223e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
  2
tensor(1.7881e-06, device='cuda:0', grad_fn=<NllLossBackward0>)
  3
tensor(2.3842e-07, device='cuda:0', grad_fn=<NllLossBackward0>)
  4
tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>)
  5
tensor(0., device='cuda:

In [5]:
with model.generate() as generator:
    with generator.invoke(dataset[0][0]) as invoker:
        pass

print(model.tokenizer.decode(generator.output[0]))


with model.generate() as generator:
    with generator.invoke(dataset[0][0]) as invoker:
        lora()

print(model.tokenizer.decode(generator.output[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


AttributeError: 'GPT2LMHeadModel' object has no attribute 'output'