# Finetuning

This module contains helper functions for finetuning models.

In [1]:
#| default_exp finetune

## Implementation

In [2]:
#| export
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# These imports are only used in test
from llama_4bit_wrapper import import_llama
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch
import os

In [4]:
#| export
def lora_model_zeros_and_scales_to_half(
        model: PeftModel # Model whose quantized layers are converted in place
    ) -> PeftModel: # The same model object that was passed in
    """
    Cast `scales` (and, for v1 models, `zeros`) of the 4-bit quantized linear layers to half-precision
    """
    # Layers are recognised by a substring of their type's string representation.
    quant_layer_markers = ("Autograd4bitQuantLinear", "Linear4bitLt")
    for _name, module in model.named_modules():
        type_repr = str(type(module))
        if not any(marker in type_repr for marker in quant_layer_markers):
            continue
        # `zeros` is only converted when the layer flags itself as a v1 model.
        if getattr(module, "is_v1_model", False):
            module.zeros = module.zeros.half()
        module.scales = module.scales.half()
    return model

## Testing

To make a *very* simple test here I will just try to:

- measure how confident the pretrained Vicuna model is on the sample text file (the average top-1 token probability, used here as a rough stand-in for perplexity)
- train the LoRA adapters on top of Vicuna
- measure the same score for the finetuned model

In [5]:
# `import_llama` returns a tuple of helpers; only the entries used in this notebook are
# bound to names, the remaining positions are discarded with `_`.
# The flags select the CUDA 4-bit autograd implementation and leave flash attention,
# xformers and the Triton implementation switched off.
_, train_data, load_llama_model_4bit_low_ram, _, model_to_half, _, apply_gradient_checkpointing, _, AMPWrapper = import_llama(
    use_flash_attention=False,
    use_xformers=False,
    autograd_4bit_cuda=True,
    autograd_4bit_triton=False
)

Triton not found. Please run "pip install triton".
Using CUDA implementation.


### Pretrained state

In [6]:
# Fetch the checkpoint repository only when it is not already present one directory
# above the current working directory.
if not os.path.exists("../vicuna-13b-GPTQ-4bit-128g"):
    # The clone lands in the working directory first and is then moved up one level,
    # so it ends up at the path tested above.
    !git clone "https://huggingface.co/anon8231489123/vicuna-13b-GPTQ-4bit-128g"
    !mv "vicuna-13b-GPTQ-4bit-128g" ..

In [7]:
# Load the pretrained checkpoint and its tokenizer; this copy is only used to compute
# the baseline score before any finetuning.
model_pretrained, tokenizer = load_llama_model_4bit_low_ram(
    config_path="../vicuna-13b-GPTQ-4bit-128g/",
    model_path="../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors",
    groupsize=128,
    is_v1_model=False,
)
# NOTE(review): presumably the tokenizer has no usable pad token by default — confirm
# that id 0 is the intended padding id.
tokenizer.pad_token_id = 0

Loading Model ...


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
The safetensors archive passed at ../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


Loaded the model in 6.77 seconds.


In [8]:
# Build the training set from the sample text file, with no validation split
# (`val_set_size=0`).
# NOTE(review): `cutoff_len=256` presumably caps the number of tokens per sample —
# confirm in `train_data.TrainTxt`.
dataset = train_data.TrainTxt(
    dataset="01_alpaca_text.txt",
    val_set_size=0,
    tokenizer=tokenizer,
    cutoff_len=256,
)
# NOTE(review): the meaning of `thd=-1` and `use_eos_token=1` is defined inside
# `prepare_data` and is not visible from this notebook — confirm before changing.
dataset.prepare_data(thd=-1, use_eos_token=1)

                                                  

Train Data: 0.00% outliers


In [9]:
# Convert the pretrained model to half precision, then wrap it in `AMPWrapper` and
# install the wrapper's forward on it.
# NOTE(review): presumably `apply_forward` makes the forward pass run under automatic
# mixed precision — confirm in the wrapper's implementation.
model_to_half(model_pretrained)
model_pretrained_wrapper = AMPWrapper(model_pretrained)
model_pretrained_wrapper.apply_forward()

Converted as Half.


In [10]:
def _test_model(model, data, device="cuda"):
    """
    Mean top-1 token probability of `model` over the samples in `data`.

    Each sample is fed through the model as a batch of one. The logits of that single
    batch entry are turned into probabilities with a softmax over the last axis, the
    largest probability along that axis is taken, and those maxima are averaged into one
    number per sample. The function returns the unweighted mean of the per-sample numbers.

    This is a confidence score, not a perplexity: the predictions are never compared with
    the tokens that actually follow.

    Parameters
    ----------
    model : object exposing `forward(input_ids, return_dict=True)` whose result can be
        indexed with `'logits'`.
    data : iterable of mappings, each with an `"input_ids"` list of token ids.
    device : device the input ids are moved to before the forward pass. Defaults to
        `"cuda"`, which matches the previous hard-coded behaviour.

    Returns
    -------
    float : average of the per-sample mean top-1 probabilities.

    Raises
    ------
    ValueError : if `data` yields no samples (there is nothing to average).
    """
    per_sample_scores = []
    with torch.no_grad():
        for sample in data:
            # Wrap the ids in an outer list so the tensor has a leading batch axis of size 1.
            input_ids = torch.LongTensor([sample["input_ids"]]).to(device)
            response = model.forward(input_ids, return_dict=True)
            logits = response['logits'][0]
            probas = torch.nn.functional.softmax(logits, dim=-1)
            per_sample_scores.append(probas.max(dim=-1).values.mean().item())
    if not per_sample_scores:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("`data` is empty: cannot average over zero samples")
    return sum(per_sample_scores) / len(per_sample_scores)

In [11]:
# Switch the pretrained model to eval mode, then score it on the training samples;
# the value of the last expression is displayed as the baseline.
model_pretrained.eval()
_test_model(model_pretrained, dataset.train_data)

0.7032277960526315

In [12]:
# Move the baseline model to the CPU and release PyTorch's cached CUDA memory.
# NOTE(review): presumably this makes room on the GPU for the second copy of the model
# that is loaded for finetuning — confirm.
model_pretrained.cpu()
torch.cuda.empty_cache()

### Finetune

In [13]:
# Load a fresh copy of the checkpoint (and its tokenizer); this is the copy that
# receives the LoRA adapters.
model, tokenizer = load_llama_model_4bit_low_ram(
    config_path="../vicuna-13b-GPTQ-4bit-128g/",
    model_path="../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors",
    groupsize=128,
    is_v1_model=False,
)
# NOTE(review): presumably the tokenizer has no usable pad token by default — confirm
# that id 0 is the intended padding id.
tokenizer.pad_token_id = 0

Loading Model ...


The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.
The safetensors archive passed at ../vicuna-13b-GPTQ-4bit-128g/vicuna-13b-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.


Loaded the model in 5.54 seconds.


In [14]:
# LoRA configuration: rank 8, alpha 16, no dropout, adapters attached to the `q_proj`
# and `v_proj` modules, for a causal language modelling task.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.0,
    bias="none",
    task_type="CAUSAL_LM",
)
# Wrap the base model with the adapters, then cast the quantized layers' scales
# (and zeros, for v1 models) to half precision with the function exported above.
lora_model = get_peft_model(model, lora_config)
lora_model = lora_model_zeros_and_scales_to_half(lora_model)

In [15]:
# Patch the model for gradient checkpointing; the trailing `;` hides the return value.
# NOTE(review): `checkpoint_ratio=1` presumably selects every block — confirm.
apply_gradient_checkpointing(lora_model, checkpoint_ratio=1);

Forward Patch Applied For Block 0
Forward Patch Applied For Block 1
Forward Patch Applied For Block 2
Forward Patch Applied For Block 3
Forward Patch Applied For Block 4
Forward Patch Applied For Block 5
Forward Patch Applied For Block 6
Forward Patch Applied For Block 7
Forward Patch Applied For Block 8
Forward Patch Applied For Block 9
Forward Patch Applied For Block 10
Forward Patch Applied For Block 11
Forward Patch Applied For Block 12
Forward Patch Applied For Block 13
Forward Patch Applied For Block 14
Forward Patch Applied For Block 15
Forward Patch Applied For Block 16
Forward Patch Applied For Block 17
Forward Patch Applied For Block 18
Forward Patch Applied For Block 19
Forward Patch Applied For Block 20
Forward Patch Applied For Block 21
Forward Patch Applied For Block 22
Forward Patch Applied For Block 23
Forward Patch Applied For Block 24
Forward Patch Applied For Block 25
Forward Patch Applied For Block 26
Forward Patch Applied For Block 27
Forward Patch Applied For Bloc

In [16]:
# Training setup: batch size 1 with no gradient accumulation, 10 epochs, torch AdamW
# at learning rate 3e-4 with 5 warmup steps, fp16 enabled.
# Evaluation is switched off. The loss is logged every 20 steps, a checkpoint is written
# to `lora-output-directory` every 50 steps (at most 3 are kept), and nothing is sent
# to an external reporting integration.
training_arguments = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    optim="adamw_torch",
    num_train_epochs=10,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=20,
    evaluation_strategy="no",
    save_strategy="steps",
    eval_steps=None,
    save_steps=50,
    output_dir="lora-output-directory",
    save_total_limit=3,
    load_best_model_at_end=False,
    ddp_find_unused_parameters=False,
    report_to="none",
)

In [17]:
# Build the trainer around the LoRA-wrapped model. `dataset.val_data` is passed as the
# eval set even though evaluation is disabled in the training arguments. The collator
# is configured with `mlm=False`, i.e. not for masked language modelling.
trainer = Trainer(
    lora_model,
    train_dataset=dataset.train_data,
    eval_dataset=dataset.val_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# NOTE(review): the cache is presumably disabled because it is not useful during
# training and can conflict with gradient checkpointing — confirm.
lora_model.config.use_cache = False

In [18]:
# Run the finetuning loop; the returned `TrainOutput` is displayed as the cell result.
trainer.train()

  0%|          | 0/570 [00:00<?, ?it/s]

                                                

{'loss': 2.5157, 'learning_rate': 0.0002925663716814159, 'epoch': 0.35}


                                                

{'loss': 2.9429, 'learning_rate': 0.0002830088495575221, 'epoch': 0.7}


                                                  

{'loss': 2.2259, 'learning_rate': 0.00027238938053097343, 'epoch': 1.05}


                                                

{'loss': 2.0958, 'learning_rate': 0.00026176991150442475, 'epoch': 1.4}


                                                 

{'loss': 1.4413, 'learning_rate': 0.00025115044247787607, 'epoch': 1.75}


                                                   

{'loss': 1.5469, 'learning_rate': 0.00024053097345132742, 'epoch': 2.11}


                                                 

{'loss': 1.2576, 'learning_rate': 0.00022991150442477874, 'epoch': 2.46}


                                                   

{'loss': 1.1367, 'learning_rate': 0.00021929203539823008, 'epoch': 2.81}


                                                 

{'loss': 0.9065, 'learning_rate': 0.00020867256637168138, 'epoch': 3.16}


                                                 

{'loss': 0.6604, 'learning_rate': 0.00019805309734513272, 'epoch': 3.51}


                                                   

{'loss': 0.7005, 'learning_rate': 0.00018743362831858404, 'epoch': 3.86}


                                                 

{'loss': 0.4831, 'learning_rate': 0.0001768141592920354, 'epoch': 4.21}


                                                   

{'loss': 0.3282, 'learning_rate': 0.00016619469026548668, 'epoch': 4.56}


                                                 

{'loss': 0.5085, 'learning_rate': 0.00015557522123893803, 'epoch': 4.91}


                                                 

{'loss': 0.2686, 'learning_rate': 0.00014495575221238938, 'epoch': 5.26}


                                                   

{'loss': 0.2448, 'learning_rate': 0.0001343362831858407, 'epoch': 5.61}


                                                 

{'loss': 0.2158, 'learning_rate': 0.00012371681415929202, 'epoch': 5.96}


                                                   

{'loss': 0.2495, 'learning_rate': 0.00011309734513274335, 'epoch': 6.32}


                                                 

{'loss': 0.2347, 'learning_rate': 0.00010247787610619469, 'epoch': 6.67}


                                                 

{'loss': 0.124, 'learning_rate': 9.185840707964601e-05, 'epoch': 7.02}


                                                   

{'loss': 0.1026, 'learning_rate': 8.123893805309734e-05, 'epoch': 7.37}


                                                 

{'loss': 0.0842, 'learning_rate': 7.061946902654867e-05, 'epoch': 7.72}


                                                   

{'loss': 0.3344, 'learning_rate': 5.9999999999999995e-05, 'epoch': 8.07}


                                                 

{'loss': 0.1178, 'learning_rate': 4.938053097345133e-05, 'epoch': 8.42}


                                                 

{'loss': 0.1294, 'learning_rate': 3.8761061946902655e-05, 'epoch': 8.77}


                                                 

{'loss': 0.1697, 'learning_rate': 2.814159292035398e-05, 'epoch': 9.12}


                                                 

{'loss': 0.1052, 'learning_rate': 1.752212389380531e-05, 'epoch': 9.47}


                                                 

{'loss': 0.2013, 'learning_rate': 6.9026548672566364e-06, 'epoch': 9.82}


                                                 

{'train_runtime': 1937.9553, 'train_samples_per_second': 0.294, 'train_steps_per_second': 0.294, 'train_loss': 0.7503459926237139, 'epoch': 10.0}


100%|██████████| 570/570 [32:17<00:00,  3.40s/it]


TrainOutput(global_step=570, training_loss=0.7503459926237139, metrics={'train_runtime': 1937.9553, 'train_samples_per_second': 0.294, 'train_steps_per_second': 0.294, 'train_loss': 0.7503459926237139, 'epoch': 10.0})

### Tuned model

In [19]:
# Switch the finetuned model to eval mode and score it on the same training samples
# that were used for the baseline, so the two numbers are directly comparable.
lora_model.eval()
_test_model(lora_model, dataset.train_data)



0.9679228182424578

In [20]:
# Run nbdev's export step, which writes the cells marked `#| export` into the library
# module named by `#| default_exp`.
import nbdev; nbdev.nbdev_export()