

# Train LoRA-LLM
We will first apply LoRA (low-rank adaptation) to the model.

In [1]:
# %%

# Imports
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import gc
from tqdm import tqdm

import wandb

import h5py
import numpy as np
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), os.path.pardir)))
from src.get_flops import QwenFlopsCalculator
from src.get_data import LotkaVolterraDataset, DataMaster
from src.preprocessor import NumericalProcessor


# models
def load_qwen():
    """Load Qwen2.5-0.5B-Instruct with all weights frozen and a trainable logit bias.

    The pretrained parameters are frozen, then a zero-initialised bias vector
    of length ``model.config.vocab_size`` is attached to ``model.lm_head`` and
    left trainable.

    Returns:
        tuple: ``(model, tokenizer)``.

    Raises:
        AssertionError: if the loaded ``lm_head`` already has a bias.
    """
    checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

    # Freeze every pretrained weight; only the bias added below stays trainable.
    for weight in model.parameters():
        weight.requires_grad_(False)

    # The LM head must not have a bias yet, otherwise we would overwrite it.
    assert model.lm_head.bias is None
    logit_bias = torch.zeros(model.config.vocab_size, device=model.device)
    model.lm_head.bias = torch.nn.Parameter(logit_bias, requires_grad=True)

    return model, tokenizer


# Helper for releasing cached memory between GPU training runs
def clear_memory():
    """Run the garbage collector and, when CUDA is usable, release cached GPU memory.

    On a machine without CUDA only ``gc.collect()`` runs; the CUDA calls are
    skipped because they raise when no CUDA runtime/driver is available.

    Returns:
        None
    """
    gc.collect()
    if not torch.cuda.is_available():
        # The notebook falls back to CPU, so this helper must be safe there too.
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.reset_peak_memory_stats()

# Compute device: use the GPU when one is available, otherwise fall back to CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Fix the NumPy and PyTorch seeds so runs are repeatable
np.random.seed(42)
torch.manual_seed(42)

In [2]:
# %%  

# Here we have the LoRALinear class with a slight modification to allow for the merging of LoRA into the original model

# LoRA implementation
class LoRALinear(nn.Module):
    """Wrap a frozen ``nn.Linear`` with a trainable low-rank (LoRA) update.

    The layer computes ``original_linear(x) + (alpha / r) * x @ A.T @ B.T``,
    where ``A`` has shape ``(r, in_features)`` and ``B`` has shape
    ``(out_features, r)``. ``B`` starts at zero, so the wrapped layer initially
    reproduces the original one.

    Args:
        original_linear: the linear layer to adapt; its weight and bias are frozen.
        r: rank of the update; must be positive.
        alpha: scaling numerator; falls back to ``r`` when omitted (or falsy),
            which gives a scale of 1.
    """

    def __init__(self, original_linear: nn.Linear, r: int, alpha: int = None):
        super().__init__()
        assert isinstance(original_linear, nn.Linear)
        # r is used as a divisor in the scaling factor, so it cannot be zero.
        assert r > 0, "LoRA rank r must be a positive integer"
        self.original_linear = original_linear
        self.original_linear.weight.requires_grad = False
        if self.original_linear.bias is not None:
            self.original_linear.bias.requires_grad = False
        in_dim = original_linear.in_features
        out_dim = original_linear.out_features
        self.r = r
        self.alpha = alpha if alpha else r

        device = original_linear.weight.device
        self.A = nn.Parameter(torch.empty(r, in_dim, device=device))
        self.B = nn.Parameter(torch.zeros(out_dim, r, device=device))

        # Initialise A with He initialization
        nn.init.kaiming_normal_(self.A, nonlinearity="linear")

        # Hold the merged weight in a non-persistent buffer. Assigning the
        # original nn.Parameter here would register it as a second parameter
        # called "merged_weight", and merge() could then no longer store a
        # plain tensor under that name.
        self.register_buffer("merged_weight", None, persistent=False)
        self.is_merged = False

    def forward(self, x):
        """Apply the layer, using the merged weight when ``merge()`` has been called."""
        if self.is_merged:
            return nn.functional.linear(x, self.merged_weight, self.original_linear.bias)

        base_out = self.original_linear(x)
        lora_out = (x @ self.A.T) @ self.B.T
        return base_out + lora_out * (self.alpha / self.r)

    def merge(self):
        """Fold the current LoRA update into one weight matrix for inference.

        The merged weight is a snapshot computed without gradient tracking;
        call ``merge()`` again after further training to refresh it.
        """
        with torch.no_grad():
            # forward() computes x @ A.T @ B.T == x @ (B @ A).T, so the
            # weight-space update is B @ A with shape (out_features, in_features).
            delta = (self.B @ self.A) * (self.alpha / self.r)
            self.merged_weight = self.original_linear.weight + delta
        self.is_merged = True

    def unmerge(self):
        """Switch back to the separate base + LoRA forward path."""
        self.is_merged = False


model, tokenizer = load_qwen()

# Freeze every existing parameter before LoRA is attached.
# Note: this also freezes the lm_head bias that load_qwen() left trainable.
for param in model.parameters():
    param.requires_grad_(False)

lora_rank = 4

# Wrap the query and value projections of each decoder layer with LoRA adapters
for layer in model.model.layers:
    attention = layer.self_attn
    attention.q_proj = LoRALinear(attention.q_proj, r=lora_rank)
    attention.v_proj = LoRALinear(attention.v_proj, r=lora_rank)


# Sanity check: list the parameters that still require gradients
trainable_params = [
    name for name, tensor in model.named_parameters() if tensor.requires_grad
]
print(f"Here, we showcase few of the trainable parameters: {trainable_params[:5]}")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Here, we showcase few of the trainable parameters: ['model.layers.0.self_attn.q_proj.A', 'model.layers.0.self_attn.q_proj.B', 'model.layers.0.self_attn.v_proj.A', 'model.layers.0.self_attn.v_proj.B', 'model.layers.1.self_attn.q_proj.A']


# %% [markdown]

 Great! LoRA has been applied to the model, so we can now train the adapted model.

In [3]:
# %%

# Load the data


# Data folder. In a notebook kernel `__name__` is "__main__", which has no
# directory part, so this resolves to "../data" relative to the working directory.
DATA_FOLDER = os.path.join(os.path.dirname(__name__), '..', 'data')


# Read both arrays fully into memory before the HDF5 file is closed
with h5py.File(os.path.join(DATA_FOLDER, 'lotka_volterra_data.h5'), "r") as f:
    trajectories = f["trajectories"][:]
    time_points = f["time"][:]

# Build the dataset manager; experiment_fraction=0.1 keeps only a small
# fraction of the data for this experiment
data_master = DataMaster(
    tokenizer,
    trajectories,
    test_size=0.2,
    val_size=0.1,
    experiment_fraction=0.1,
)

# %% [markdown]

 %% [markdown]

In [4]:
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__name__), os.path.pardir)))

from src.Trainer import LoRATrainer

# %% [markdown]

 ### Training the model

In [5]:
# Reload the trainer module so edits to src/Trainer.py are picked up without
# restarting the kernel
from importlib import reload
import src.Trainer
reload(src.Trainer)
# reload() does not rebind names that were imported with `from ... import`,
# so re-import the class; otherwise the stale pre-reload LoRATrainer is used.
from src.Trainer import LoRATrainer

# tqdm clear process (presumably drops progress bars left over from an
# interrupted run; confirm against tqdm internals)
from tqdm import tqdm
tqdm._instances.clear()


train_loader, val_loader, test_loader = data_master.get_data(
    experiment=True, batch_size=2, target_eval_pairs=3
)


# Pass the same rank that was used when wrapping the model with LoRALinear,
# instead of repeating the literal, so the two cannot drift apart.
trainer = LoRATrainer(
    model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    tokenizer=tokenizer,
    processor=data_master.processor,
    lora_rank=lora_rank,
    context_length=128,
    eval_interval=10,
)

trainer.train()

                                        

Prepared 80 inference samples with max target length 874


                                        

Prepared 40 inference samples with max target length 862


Sanity check Train: 100%|██████████| 35/35 [00:00<00:00, 1038.68it/s]
Sanity check Val: 100%|██████████| 40/40 [00:00<00:00, 10000.13it/s]
Sanity check Test: 100%|██████████| 20/20 [00:00<00:00, 6652.87it/s]
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mym429[0m ([33mym429-university-of-cambridge[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Total trainable parameters: 270,336


Validation:   0%|          | 0/40 [00:00<?, ?it/s]it/s, ce=7.1802, loss=7.1802]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Validation:   2%|▎         | 1/40 [00:03<02:14,  3.46s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Validation:   5%|▌         | 2/40 [00:06<01:57,  3.09s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Validation:   8%|▊         | 3/40 [00:09<01:50,  2.99s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Validation:  10%|█         | 4/40 [00:12<01:46,  2.95s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Validation:  12%|█▎        | 5/40 [00:15<01:48,  3.09s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Validation:  15%|█▌  

KeyboardInterrupt: 