# Finetuning Llama 2 with _Adapters_ and QLoRA

This notebook is based on the [qlora-minimal notebook by geronimi73](https://github.com/geronimi73/qlora-minimal/blob/main/qlora-minimal.ipynb).

In [1]:
# !pip install -qq -U adapters accelerate bitsandbytes datasets

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["WANDB_ENTITY"] = "clif"
os.environ["WANDB_PROJECT"] = "adapters"

## Load Open Assistant dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("OpenAssistant/oasst_top1_2023-08-25")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 12947
    })
    test: Dataset({
        features: ['text'],
        num_rows: 690
    })
})

In [5]:
print(dataset["train"][0]["text"])

<|im_start|>user
Consigliami 5 nomi per il mio cucciolo di dobberman<|im_end|>
<|im_start|>assistant
Ecco 5 nomi per il tuo cucciolo di dobermann:

- Zeus
- Apollo
- Thor
- Athena
- Odin<|im_end|>



## Load and prepare model and tokenizer

In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig

modelpath="meta-llama/Llama-2-7b-hf"

# Load 4-bit quantized model
model = AutoModelForCausalLM.from_pretrained(
    modelpath,    
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    ),
    torch_dtype=torch.bfloat16,
)
model.config.use_cache = False

# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.51s/it]


In [7]:
# # Add LoRA adapters to model
# model = prepare_model_for_kbit_training(model)
# config = LoraConfig(
#     r=64, 
#     lora_alpha=16, 
#     target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
#     lora_dropout=0.1, 
#     bias="none", 
#     modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
#     task_type="CAUSAL_LM"
# )
# model = get_peft_model(model, config)
# model.config.use_cache = False

In [8]:
# for param in model.parameters():
#   # param.requires_grad = False  # freeze the model - train adapters later
#   if param.ndim == 1:
#     # cast the small parameters (e.g. layernorm) to fp32 for stability
#     param.data = param.data.to(torch.float32)

# # model.gradient_checkpointing_enable()  # reduce number of stored activations
# model.enable_input_require_grads()

# class CastOutputToFloat(torch.nn.Sequential):
#   def forward(self, x): return super().forward(x).to(torch.float32)
# model.lm_head = CastOutputToFloat(model.lm_head)

# # Hack to prevent HF Trainer from throwing an error due to peft missing.
# model._hf_peft_config_loaded = True

In [9]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [10]:
import adapters
from adapters import LoRAConfig

adapters.init(model)

config = LoRAConfig(alpha=16, r=64, dropout=0.1)
model.add_adapter("assistant_adapter", config=config)
model.train_adapter("assistant_adapter")

print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
assistant_adapter        lora             33,554,432       0.996       1       1
--------------------------------------------------------------------------------
Full model                              3,369,349,120     100.000               0


In [11]:
# Verifying the datatypes.
dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    if dtype not in dtypes:
        dtypes[dtype] = 0
    dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items():
    total += v
for k, v in dtypes.items():
    print(k, v, v / total)

torch.float32 295981056 0.08375280630606093
torch.uint8 3238002688 0.916247193693939


## Prepare data for training

In [12]:
import os 

def tokenize(element):
    return tokenizer(
        element["text"],
        truncation=True,
        max_length=512,
        add_special_tokens=False,
    )

dataset_tokenized = dataset.map(
    tokenize, 
    batched=True, 
    num_proc=os.cpu_count(),    # multithreaded
    remove_columns=["text"]     # don't need this anymore, we have tokens from here on
)

In [13]:
dataset_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 12947
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 690
    })
})

## Training

In [14]:
args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy="epoch",
    logging_steps=50,
    save_steps=500,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    learning_rate=0.0002,
    group_by_length=True,
    fp16=True,
    max_steps=100,
)

In [15]:
from adapters import AdapterTrainer
from transformers import DataCollatorForLanguageModeling

trainer = AdapterTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    args=args,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mclif[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Merge LoRA weights

In [None]:
model.merge_adapter("assistant_adapter")

## Inference

In [None]:
batch = tokenizer("""<|im_start|>user
Explain Calculus to a primary school student<|im_end|>
<|im_start|>assistant
""",
return_tensors="pt")

model.eval()
with torch.cuda.amp.autocast():
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))





 <|im_start|> user
Explain Calculus to a primary school student
 <|im_start|> assistant
 datdatasudionof &вы merunaвыdatas Hou dip![ Lux""unaвы""unaigesargo cig Nouna sellesiowanвы No Speed hideвы""ientNE raw Camil« MAC MACaucoup dispos cig hooknou moiaucoup MAC Kenn
