In [1]:
import torch
if torch.cuda.get_device_capability() < (7, 5):
  raise ValueError(f"You got a GPU with capability {torch.cuda.get_device_capability()}, need at least (7, 5)")
else: print("OK")

%pip install --quiet bitsandbytes==0.35.4 transformers==4.24.0 datasets==2.7.0 accelerate==0.14.0 loralib

OK
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

cache_dir = '/media/tfsservices/DATA/NLP/cache/'

# # note: these flags slow down the code to save RAM; remove them if you have >32GB RAM
# model = AutoModelForCausalLM.from_pretrained(
#     "facebook/opt-6.7b", load_in_8bit=True, device_map='auto',
#     low_cpu_mem_usage=True, torch_dtype=torch.float16, offload_state_dict=True, cache_dir=cache_dir)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b", load_in_8bit=True, device_map='auto',
    torch_dtype=torch.float16, cache_dir=cache_dir)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b")

for module in model.modules():
    if isinstance(module, bnb.nn.Linear8bitLt):
        module.state.memory_efficient_backward = True

for param in model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.model.decoder.project_in = lambda x: x.requires_grad_(True)

# cast model outputs to float32 to unfuck the top-k sampler
class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
model.lm_head = CastOutputToFloat(model.lm_head)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
batch = tokenizer("Mark Zuckerberg is", return_tensors='pt').to('cuda')
# batch = tokenizer("Amat/Applied materials P5000 is", return_tensors='pt')
# note to self: find a less controversial example

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, min_length=30, max_length=150, do_sample=True)

print('\n\n', tokenizer.decode(output_tokens[0].cpu().numpy()))





 </s>Mark Zuckerberg is the man who is most influential in world affairs. Just ask anyone in the Arab world about the “Arab Spring.” Or ask the Kurds about what happened to Syria. Or ask Pakistanis what’s happening with their country. People do listen to what he says. And he’s not saying that the Arab Spring was a mistake. He has said that he was wrong to think Facebook is a force for good in the Middle East.

Is Mark Zuckerberg making any changes and improvements to deal with these kinds of consequences?

There’s definitely been an acknowledgement of the fact that Facebook is not in charge of the Middle East. So Zuckerberg actually wrote a post on Facebook saying, “


In [5]:
batch = tokenizer("Hi {FirstName} ", return_tensors='pt').to('cuda')
with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, min_length=30, max_length=150, do_sample=True)

print('\n\n', tokenizer.decode(output_tokens[0].cpu().numpy()))



 </s>Hi {FirstName}  We've updated this subreddit to support the new, mobile-friendly reddit design. Please take some time to discover our new design, and to test it out with some of our links!   Thank you.
I tried to read this but the font and the content are too small and can't be read :/</s>


#### Custom generation 

In [12]:
prefix = "Mark Zuckerberg is"
print(prefix, end = ' ')

batch = tokenizer(prefix, return_tensors='pt')
past_key_values = None

temperature = 0.8

for i in range(100):
    with torch.cuda.amp.autocast():
        outputs = model.forward(**batch, use_cache=True, past_key_values=past_key_values)
        probs = outputs.logits[0, -1].div(temperature).softmax(-1)
        token = torch.multinomial(probs, 1).view([])

        print(tokenizer.decode(token), end='', flush=True)
        past_key_values = outputs.past_key_values
        batch = dict(input_ids=token.reshape(1,1),
                     attention_mask=torch.ones(1, past_key_values[0][0].shape[-2]+1, 
                     device='cuda'))



Mark Zuckerberg is  poised to testify before lawmakers about how his company failed to protect users' data.

"Facebook CEO Mark Zuckerberg will testify before a joint hearing of the U.S. Senate Judiciary and Commerce Committees on April 11th," said Senate Judiciary Chairman Chuck Grassley in a statement.

The hearing will focus on data privacy, Facebook's use and misuse of user data and the social network's effect on "election integrity."

The hearing comes after Facebook disclosed earlier this month that the data of 87

In [4]:
print(outputs['past_key_values'])

((tensor([[[[-1.8818,  0.1099,  0.5370,  ..., -0.5458, -0.5813, -0.0637],
          [-1.1682, -0.2850,  0.0319,  ..., -0.0845,  0.3852, -0.7236],
          [-0.3081,  0.4722,  0.5135,  ..., -1.3965,  0.7854, -1.3075],
          ...,
          [ 0.0703,  0.3601, -0.1473,  ..., -1.1992,  0.3462, -0.8950],
          [ 0.0099, -0.9126, -0.3528,  ..., -0.3584,  1.0117, -0.8135],
          [-0.2688, -0.7275,  0.3240,  ..., -1.1367,  0.8931, -0.5054]],

         [[ 0.9610, -0.6360,  0.4149,  ...,  0.4170, -0.3887, -0.7012],
          [-0.1329, -0.5032, -0.4677,  ...,  0.1764, -0.2458,  0.9876],
          [ 0.4849, -0.3130, -0.9736,  ...,  0.5302,  0.3323, -0.1196],
          ...,
          [ 0.2179,  0.4158, -0.0125,  ..., -0.0492, -0.8452, -0.4556],
          [ 0.2269,  0.3584, -0.3877,  ..., -0.6255, -0.8159,  0.0338],
          [-0.9668, -0.6157, -1.0195,  ..., -1.5264,  0.7983, -1.1846]],

         [[ 0.4381, -0.6630, -0.1808,  ...,  0.4689, -0.5411,  0.3177],
          [ 0.4337,  0.2476,

#### Parameter-efficient finetuning: OWN LoRA

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LoRALayer(nn.Module):
    """Wraps a linear layer with LoRA-like adapter. Wraps an existing OPT linear layer"""
    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        self.module = module
        self.adapter_A = nn.Parameter(torch.empty(module.in_features, rank, device=module.weight.device))
        nn.init.kaiming_uniform_(self.adapter_A, a=5 ** 0.5)
        self.adapter_B = nn.Parameter(torch.zeros(rank, module.out_features, device=module.weight.device))

    def forward(self, input):
        result = self.module(input)
        result += input @ self.adapter_A @ self.adapter_B

        return result


In [6]:
# test your implementation
test_linear = nn.Linear(128, 128)
test_linear.weight.data[...] = torch.eye(128)
test_adapter = LoRALayer(test_linear, rank=8)

assert torch.allclose(test_adapter(torch.ones(1, 1, 128)), test_linear.bias + 1), "please check your forward pass"

test_adapter.adapter_A.data[...] = torch.linspace(0.1, -0.5, 128 * 8).view(128, 8)
test_adapter.adapter_B.data[...] = torch.linspace(0.5, -0.1, 128 * 8).view(8, 128)
test_linear.bias.data[...] = torch.linspace(1., -1., 128)

dummy_loss = F.mse_loss(test_adapter(torch.ones(1, 128) / 128), torch.linspace(-1, 1, 128).unsqueeze(0))
assert torch.allclose(dummy_loss, torch.tensor(1.3711389), rtol=0, atol=1e-4)
dummy_loss.backward()
assert all(w.grad is not None for w in [test_adapter.adapter_A, test_adapter.adapter_B]), "some adapter weights have no grad"
assert torch.allclose(test_adapter.adapter_A.grad.sum(), torch.tensor(-0.60158), rtol=0, atol=1e-4), "bad grad w.r.t. A"
assert torch.allclose(test_adapter.adapter_B.grad.sum(), torch.tensor(0.9931), rtol=0, atol=1e-4), "bad grad w.r.t. B"
# note: bad grad means that your code is different from LoRA paper OR that your code is not autograd-friendly (e.g. no_grad)
del dummy_loss, test_linear, test_adapter
print("All tests passed!")

All tests passed!


### Apply LoRA to the model

The code below applies LoRA adapters on top of Q/K/V linear layers in OPT attention. You may also choose to modify other layers:
* OPTAttention.out_proj - attention output projection
* OPTDecoderLayer.fc1, .fc2 - transformer feedforward layers
* OPTForCausalLM.lm_head - output LM head

In [7]:
model.eval()

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 4096, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 4096)
      (final_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-31): 32 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
            (out_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear8bitLt(in_features=4096, out_features=16384, bias=True)
          (fc2): Linear8bitLt(in_features=16384, out_features=4096, bias=True

In [8]:
# Apply Own LoRa layer

for name, module in model.named_modules():
  if 'OPTAttention' in repr(type(module)):
    module.q_proj = LoRALayer(module.q_proj, rank=8)
    module.k_proj = LoRALayer(module.k_proj, rank=8)
    module.v_proj = LoRALayer(module.v_proj, rank=8)

assert sum(isinstance(module, LoRALayer) for module in model.modules()) == 96  # for opt-6b7

In [9]:
batch = tokenizer("Mark Zuckerberg is", return_tensors='pt')
# test a single training step, make sure we get meaningful gradients
with torch.cuda.amp.autocast():
  out = model.forward(**batch)
  out.logits.norm().backward()

for module in model.modules():
  if isinstance(module, LoRALayer):
    assert module.adapter_B.grad is not None
    assert module.adapter_B.grad.norm().item() > 0

model.zero_grad(set_to_none=True)



### To train your model (dummy dataset)

The example below shows how to train the LoRA adapters on a dummy dataset. You will need to run a _similar_ training task later.

__Note:__ please scroll down for the homework task

In [13]:
import transformers
from datasets import load_dataset
data = load_dataset("Abirate/english_quotes")
data = data.map(lambda samples: tokenizer(samples['quote']), batched=True)

trainer = transformers.Trainer(
    model=model, train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4, gradient_accumulation_steps=4,
        warmup_steps=250, max_steps=1000, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs'),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Using custom data configuration Abirate--english_quotes-6e72855d06356857
Found cached dataset json (/home/tfsservices/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 282.05it/s]
Loading cached processed dataset at /home/tfsservices/.cache/huggingface/datasets/Abirate___json/Abirate--english_quotes-6e72855d06356857/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-a48b8d6b51bfd590.arrow
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `OPTForCausalLM.forward` and have been ignored: tags, author, quote. If tags, author, quote are not expected by `OPTForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2508
  Num Epochs = 7
  Inst

Step,Training Loss
1,2.3502


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json


TrainOutput(global_step=1000, training_loss=1.7311577512621878, metrics={'train_runtime': 3165.2821, 'train_samples_per_second': 5.055, 'train_steps_per_second': 0.316, 'total_flos': 5.710897015455744e+16, 'train_loss': 1.7311577512621878, 'epoch': 6.41})

In [15]:
batch = tokenizer("It is our choices", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, min_length=30, max_length=150, do_sample=True)

print('\n\n', tokenizer.decode(output_tokens[0].numpy()))





 </s>It is our choices... that show what we truly are, far more than our actions.” J. K. Rowling

“A woman who knows the value of a good friend will go to the ends of the earth to find one.” Eleanor Roosevelt

“No one ever really knows another person. I wish I did. But then I wouldn't wonder why people seemed so strange to me.” Marilyn Monroe

“Friendship is born at the moment when one man says to another: 'What! you too? I thought that no one but myself...'” Italian Proverb

“To find a friend in one's own soul is to be found at the moment of


In [16]:
batch = tokenizer("It is", return_tensors='pt')

with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, min_length=30, max_length=60, do_sample=True)

print('\n\n', tokenizer.decode(output_tokens[0].numpy()))



 </s>It is not just that the church has had its setbacks, that it hasn't managed the situation on the whole as well as it should. The church has also failed its own promises. It is often promised an open heaven. Nothing is more absurd: if heaven is the open-air paradise of


In [19]:
# data.shape
data

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

#### Own dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="./datasets/marketing.csv")
dataset = dataset.map(lambda samples: tokenizer(samples['text']), batched=True)

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-9329f19ebd7265ee
Found cached dataset csv (/home/tfsservices/.cache/huggingface/datasets/csv/default-9329f19ebd7265ee/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 929.59it/s]
  0%|          | 0/1 [00:00<?, ?ba/s]


NameError: name 'tokenizer' is not defined

In [None]:
import transformers

trainer = transformers.Trainer(
    model=model, train_dataset=data['train'],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4, gradient_accumulation_steps=4,
        warmup_steps=250, max_steps=1000, learning_rate=2e-4, fp16=True,
        logging_steps=1, output_dir='outputs-own'),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()