

# Train LoRA-LLM
This is a tutorial notebook on how to train a Qwen model with LoRA using our `LoRATrainer` class (imported from `src.Trainer`).



In [None]:
# %%

# Imports
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import gc
from tqdm import tqdm

import wandb

import h5py
import numpy as np
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), os.path.pardir)))
from src.get_flops import QwenFlopsCalculator
from src.get_data import LotkaVolterraDataset, DataMaster
from src.preprocessor import NumericalProcessor

import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), os.path.pardir)))

from src.Trainer import LoRATrainer

# models
def load_qwen():
    """Load Qwen2.5-0.5B-Instruct with a trainable bias on the LM head.

    Every pretrained parameter is frozen. A zero-initialised bias of length
    ``model.config.vocab_size`` is then attached to ``model.lm_head`` and is
    the only parameter left trainable.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

    # Freeze the whole pretrained network in one call.
    model.requires_grad_(False)

    # The LM head is expected to ship without a bias; attach a trainable one
    # that starts at zero so the initial logits are unchanged.
    assert model.lm_head.bias is None
    logit_bias = torch.zeros(model.config.vocab_size, device=model.device)
    model.lm_head.bias = torch.nn.Parameter(logit_bias, requires_grad=True)

    return model, tokenizer


# Helper for releasing cached GPU memory between training runs
def clear_memory():
    """Run the garbage collector and release cached CUDA memory.

    Python garbage is always collected. The CUDA clean-up (emptying the
    caching allocator, collecting IPC memory and resetting the peak-memory
    statistics) only runs when a CUDA device is available, so the helper is
    safe to call on CPU-only machines as well.
    """
    gc.collect()

    # The torch.cuda calls below initialise CUDA and raise without a GPU.
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

# device
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# random seed
# NumPy and PyTorch are seeded with the same value so runs are repeatable.
np.random.seed(42)
torch.manual_seed(42)

## Improved LoRA Training
Here we have the `LoRALinear` class with a slight modification that allows the LoRA update to be merged into the original model. A key benefit of LoRA compared to other parameter-efficient tuning methods is that its weights can be folded into the original model weights. After training, the model can therefore be used without the separate adapter computation, which can save memory and improve inference speed. For a frozen weight $W$ and LoRA factors $A \in \mathbb{R}^{r \times d_{\text{in}}}$ and $B \in \mathbb{R}^{d_{\text{out}} \times r}$, merging replaces $W$ with $W + \frac{\alpha}{r} B A$. This produces exactly the same outputs as the unmerged layer, so the knowledge learned during training is kept while the layer goes back to being a single linear map.

In [None]:


# LoRA implementation
class LoRALinear(nn.Module):
    """Wrap a frozen ``nn.Linear`` with a trainable low-rank (LoRA) update.

    The layer computes ``original_linear(x) + (alpha / r) * (x @ A.T) @ B.T``,
    which equals a linear layer with weight ``W + (alpha / r) * (B @ A)``.
    ``B`` starts at zero, so a freshly wrapped layer reproduces the original
    layer exactly.

    Args:
        original_linear: Layer to adapt. Its weight and bias are frozen.
        r: Rank of the low-rank update; must be positive.
        alpha: Scaling numerator. ``None`` (or ``0``) falls back to ``r``,
            i.e. a scale of 1.

    Attributes:
        A: Trainable factor of shape ``(r, in_features)``.
        B: Trainable factor of shape ``(out_features, r)``.
        merged_weight: ``None`` until :meth:`merge` is called, then the
            combined weight of shape ``(out_features, in_features)``.
        is_merged: Whether ``forward`` uses ``merged_weight``.
    """

    def __init__(self, original_linear: nn.Linear, r: int, alpha: int = None):
        super().__init__()
        assert isinstance(original_linear, nn.Linear), (
            f"original_linear must be nn.Linear, got {type(original_linear).__name__}"
        )
        if r <= 0:
            # r == 0 would otherwise surface later as a division by zero.
            raise ValueError(f"LoRA rank r must be positive, got {r}")

        self.original_linear = original_linear
        self.original_linear.weight.requires_grad = False
        if self.original_linear.bias is not None:
            self.original_linear.bias.requires_grad = False
        in_dim = original_linear.in_features
        out_dim = original_linear.out_features
        self.r = r
        self.alpha = alpha if alpha else r

        device = original_linear.weight.device
        self.A = nn.Parameter(torch.empty(r, in_dim, device=device))
        self.B = nn.Parameter(torch.zeros(out_dim, r, device=device))

        # Initialise A with He initialization
        nn.init.kaiming_normal_(self.A, nonlinearity="linear")

        # A non-persistent buffer instead of an alias of the original weight:
        # assigning the Parameter itself would register it a second time
        # (duplicating it in state_dict) and would make the later assignment
        # of a plain tensor in merge() raise a TypeError.
        self.register_buffer("merged_weight", None, persistent=False)
        self.is_merged = False

    def forward(self, x):
        """Apply the adapted layer to ``x`` (last dimension ``in_features``)."""
        if self.is_merged:
            return nn.functional.linear(x, self.merged_weight, self.original_linear.bias)

        base_out = self.original_linear(x)
        lora_out = (x @ self.A.T) @ self.B.T
        return base_out + lora_out * (self.alpha / self.r)

    def merge(self):
        """Fold the LoRA update into one weight matrix for inference.

        The merged weight is a snapshot taken without gradient tracking, so
        ``A`` and ``B`` receive no gradients while the layer is merged. Call
        :meth:`unmerge` before resuming training.
        """
        with torch.no_grad():
            # forward() applies x @ A.T @ B.T == x @ (B @ A).T, so the weight
            # update is B @ A with shape (out_features, in_features).
            delta = (self.B @ self.A) * (self.alpha / self.r)
            self.merged_weight = self.original_linear.weight + delta
        self.is_merged = True

    def unmerge(self):
        """Return to the separate base + LoRA computation."""
        self.is_merged = False
        # Drop the snapshot so it neither goes stale nor holds memory.
        self.merged_weight = None


# Fresh copy of the pretrained model together with its tokenizer.
model, tokenizer = load_qwen()

# LoRA trains only the adapter matrices, so every existing parameter is
# frozen first (this includes the LM-head bias created in load_qwen).
for param in model.parameters():
    param.requires_grad_(False)

lora_rank = 4

# Wrap the query and value projections of every decoder layer with LoRA.
for layer in model.model.layers:
    attention = layer.self_attn
    attention.q_proj = LoRALinear(attention.q_proj, r=lora_rank)
    attention.v_proj = LoRALinear(attention.v_proj, r=lora_rank)


# Sanity check: list the parameters that are still trainable to confirm that
# only the LoRA factors are left.
trainable_params = [
    name for name, weight in model.named_parameters() if weight.requires_grad
]
print(f"Here, we showcase few of the trainable parameters: {trainable_params[:5]}")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Here, we showcase few of the trainable parameters: ['model.layers.0.self_attn.q_proj.A', 'model.layers.0.self_attn.q_proj.B', 'model.layers.0.self_attn.v_proj.A', 'model.layers.0.self_attn.v_proj.B', 'model.layers.1.self_attn.q_proj.A']


Here we load in the data

In [None]:
# data folder:
# The data lives in ../data relative to the working directory.
# The previous os.path.dirname(__name__) always evaluated to "" because
# __name__ is a module name, not a path; the parent directory is now spelled
# out directly and yields the same location.
DATA_FOLDER = os.path.join(os.path.pardir, 'data')


# Read both datasets fully into memory so the file can be closed right away.
with h5py.File(os.path.join(DATA_FOLDER, 'lotka_volterra_data.h5'), "r") as f:
    # Access the full dataset
    trajectories = f["trajectories"][:]
    time_points = f["time"][:]

# Here we are only using a small fraction of the data for the experiment
data_master = DataMaster(
    tokenizer,
    trajectories,
    test_size=0.2,
    val_size=0.1,
    experiment_fraction=0.02,
)

## Training the Model

We now proceed to train the model. For demonstration purposes, we only run a few epochs, but increasing the number of epochs can lead to better performance. This example illustrates how to train a model with LoRA using the `Trainer` class. It also serves as an opportunity to test and validate the custom `Trainer` class implementation.

In [None]:
# Reload the trainer module so that edits to src/Trainer.py are picked up
# without restarting the kernel.
from importlib import reload
import src.Trainer
reload(src.Trainer)
# reload() refreshes the module object only. The name bound earlier by
# `from src.Trainer import LoRATrainer` would still refer to the old class,
# so it is re-imported here.
from src.Trainer import LoRATrainer

# tqdm clear process
from tqdm import tqdm
tqdm._instances.clear()

# Shared by the data loaders and the trainer so the two cannot drift apart.
context_length = 512

train_loader, val_loader, test_loader = data_master.get_data(
    experiment=True,
    batch_size=2,
    target_eval_pairs=3,
    context_length=context_length,
)


trainer = LoRATrainer(
    model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    tokenizer=tokenizer,
    processor=data_master.processor,
    # Reuse the rank that was used when wrapping the layers.
    lora_rank=lora_rank,
    context_length=context_length,
    eval_interval=10,
    save_interval=10,
    max_steps=20,
)

trainer.train()

                                        

Prepared 8 inference samples with max target length 490


                                        

Prepared 4 inference samples with max target length 478


Sanity check Train: 100%|██████████| 7/7 [00:00<00:00, 208.68it/s]
Sanity check Val: 100%|██████████| 4/4 [00:00<00:00, 4004.11it/s]
Sanity check Test: 100%|██████████| 2/2 [00:00<00:00, 1990.18it/s]
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mym429[0m ([33mym429-university-of-cambridge[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Total trainable parameters: 270,336



Validation:  25%|██▌       | 1/4 [00:05<00:17,  5.80s/it]
[A
Validation:  50%|█████     | 2/4 [00:11<00:11,  5.84s/it]
[A
Validation:  75%|███████▌  | 3/4 [00:18<00:06,  6.21s/it]
[A
                                                         
[A


Validation on 8 samples - MSE: 0.05341527611017227, MAE: 0.15549632906913757, Failure rate: 0.0%, Speed: 12.00 tokens/sec
Saved checkpoint to checkpoints\checkpoint_best.pth
New best model with val loss: 1.4418
Saved checkpoint to checkpoints\checkpoint_step_10.pth


Validation:   0%|          | 0/4 [00:00<?, ?it/s]7it/s, ce=0.5343, steps19, loss=0.5343]
Validation:  25%|██▌       | 1/4 [00:05<00:17,  5.81s/it]
[A
Validation:  50%|█████     | 2/4 [00:11<00:11,  5.97s/it]
[A
Validation:  75%|███████▌  | 3/4 [00:18<00:06,  6.18s/it]
[A
                                                         
[A


Validation on 8 samples - MSE: 0.07991088926792145, MAE: 0.1819341778755188, Failure rate: 0.0%, Speed: 12.18 tokens/sec
Saved checkpoint to checkpoints\checkpoint_step_20.pth


Validation:   0%|          | 0/4 [00:00<?, ?it/s]                                       
Validation:  25%|██▌       | 1/4 [00:07<00:21,  7.14s/it]
[A
Validation:  50%|█████     | 2/4 [00:14<00:14,  7.03s/it]
[A
Validation:  75%|███████▌  | 3/4 [00:21<00:07,  7.34s/it]
[A
                                                         
[A


Validation on 8 samples - MSE: 0.07991088926792145, MAE: 0.1819341778755188, Failure rate: 0.0%, Speed: 10.38 tokens/sec
Loading best checkpoint from checkpoints\checkpoint_best.pth for final testing
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy._core.multiarray.scalar was not an allowed global by default. Please use `torch.serialization.add_safe_globals([scalar])` or the `torch.serialization.safe_globals([scalar])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accep

Test:   0%|          | 0/2 [00:00<?, ?it/s]
Test:  50%|█████     | 1/2 [00:06<00:06,  6.82s/it]
[A
                                                   
[A


Test on 4 samples - MSE: 0.03215603902935982, MAE: 0.11131727695465088, Failure rate: 0.0%, Speed: 11.60 tokens/sec
Saved checkpoint to checkpoints\checkpoint_final.pth
Training completed in 10 steps
Best validation loss: 1.4418
Final validation loss: 1.4612
Test MSE: 0.0322, Test MAE: 0.1113


