In [1]:
# Kill all processess on GPU
# !fuser -v /dev/nvidia* -k

# Libraries

In [2]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    %pip install --no-deps unsloth

In [3]:
%pip install trl==0.19.1 # Fix error: ImportError: cannot import name 'ConstantLengthDataset' from 'trl.trainer.utils'

Collecting trl==0.19.1
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.19.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.19.1)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.19.1)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.19.1)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate>=1.4.0->trl==0.19.1)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manyl

In [1]:
from unsloth import FastLanguageModel
import os
import math
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
# from transformers import TrainingArguments
from peft import LoraConfig
from huggingface_hub import snapshot_download
from safetensors.torch import load_file
from pprint import pprint

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
def download_hf_model(repo_id, checkpoint):
    local_dir = repo_id.split('/')[-1]
    ignore_checkpoints = [f'checkpoint-{i}/*' for i in range(0, 2000, 25) if i != checkpoint]

    snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        ignore_patterns=ignore_checkpoints,
    )

    if checkpoint:
        return os.path.join(local_dir, f'checkpoint-{checkpoint}')
    return local_dir

@torch.no_grad()
def check_lora_parameters(model):
    for n, p in model.named_parameters():
        if 'lora' in n:
            print(f"- {'Name':<8}:", n)
            print(f"- {'Mean':<8}:", p.mean().item())
            print(f"- {'Min':<8}:", p.min().item())
            print(f"- {'Max':<8}:", p.max().item())
            break

@torch.no_grad()
def generate_text(model, tokenizer, prompt, max_new_tokens=50, skip_special_tokens=True):
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors='pt')
    outputs = model.generate(input_ids=inputs['input_ids'].to(device), max_new_tokens=max_new_tokens)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=skip_special_tokens)[0])

# Config

In [3]:
# Project configuration
seed = 69
device = 'mps'

# Model configuration
max_seq_length = 1024
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model_configs = {
    'L1T1': {
        'lora_id': 'alxxtexxr/L3.1-8B-wikipedia-en-5K-LoRA-v20250630122650',
        'checkpoint': 650,
    },
    'L2T1': {
        'lora_id': 'alxxtexxr/L3.1-8B-wikipedia-ja-5K-LoRA-v20250728141629',
        'checkpoint': 650,
    },
}

for key, config in model_configs.items():
    model_configs[key]['lora_dir'] = download_hf_model(config['lora_id'], config['checkpoint'])

print("Model configurations:",)
pprint(model_configs)

Model configurations:
{'L1T1': {'checkpoint': 650,
          'lora_dir': 'L3.1-8B-wikipedia-en-5K-LoRA-v20250630122650/checkpoint-650',
          'lora_id': 'alxxtexxr/L3.1-8B-wikipedia-en-5K-LoRA-v20250630122650'},
 'L2T1': {'checkpoint': 650,
          'lora_dir': 'L3.1-8B-wikipedia-ja-5K-LoRA-v20250728141629/checkpoint-650',
          'lora_id': 'alxxtexxr/L3.1-8B-wikipedia-ja-5K-LoRA-v20250728141629'}}


# Model

## LoRA Model

### References
- https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/bnb.py
- https://github.com/huggingface/peft/blob/main/src/peft/tuners/lora/layer.py

In [20]:
class LoraLayer(nn.Module):
    def __init__(self, base_layer, rank, alpha, dropout, lora_bias, use_rslora, return_lora_output=False):
        super().__init__()
        self.base_layer = base_layer
        self.device = base_layer.weight.device
        self.alpha = alpha
        self.lora_bias = lora_bias
        self.scaling = alpha / math.sqrt(rank) if use_rslora else alpha / rank
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
        self.return_lora_output = return_lora_output

        # Extract input and output features from the base layer
        in_features = getattr(base_layer, 'in_features', None)
        out_features = getattr(base_layer, 'out_features', None)

        if in_features is None or out_features is None:
            raise ValueError(f"Cannot determine in_features or out_features from {base_layer}.")
        
        # LoRA decomposition: A (down-projection) and B (up-projection)
        self.lora_A = nn.Linear(in_features, rank, bias=lora_bias).to(self.device)  # Projects down
        self.lora_B = nn.Linear(rank, out_features, bias=lora_bias).to(self.device) # Projects up

        # Initialize LoRA matrices: A ~ N(0, 1/rank), B initialized to 0
        std = 1 / torch.sqrt(torch.tensor(rank).float())
        nn.init.normal_(self.lora_A.weight, mean=0.0, std=std)
        nn.init.zeros_(self.lora_B.weight)
        
    def forward(self, x):
        # Forward through base layer
        base_out = self.base_layer(x)

        # LoRA transformation
        requires_conversion = not torch.is_autocast_enabled()
        if requires_conversion:
            x = x.to(self.lora_A.weight.dtype)
        lora_out = self.lora_B(self.lora_A(self.dropout(x))) * self.scaling
        if requires_conversion:
            lora_out = lora_out.to(base_out.dtype)

        output = base_out + lora_out

        if self.return_lora_output:
            return output, lora_out
        
        return output

    def load_lora_weights(self, state_dict, prefix):
        self.lora_A.weight.data = state_dict[f'{prefix}.lora_A.weight'].to(self.device)
        self.lora_B.weight.data = state_dict[f'{prefix}.lora_B.weight'].to(self.device)
        if self.lora_bias:
            self.lora_A.bias.data = state_dict[f'{prefix}.lora_A.bias'].to(self.device)
            self.lora_B.bias.data = state_dict[f'{prefix}.lora_B.bias'].to(self.device)
    
class LoraModel(nn.Module):
    def __init__(self, base_model: nn.Module, lora_config: LoraConfig, return_lora_outputs=False):
        super().__init__()
        self.base_model = base_model
        self.lora_layers = nn.ModuleDict()
        self.return_lora_outputs = return_lora_outputs

        # Wrap target layers with NeroLayer
        self._wrap_target_layers(lora_config)
    
    def _wrap_target_layers(self, lora_config):
        for module_name, module in self.base_model.named_modules():
            if isinstance(module, LoraLayer):
                # Convert module name format and store reference
                module_name = module_name.rsplit('model.', 1)[-1]
                module_name = module_name.replace('.', '__DOT__')
                self.lora_layers[module_name] = module
                continue

            if any(module_name.endswith(target_module) for target_module in lora_config.target_modules) and isinstance(module, nn.Linear):    
                parent_module, child_name = self._get_parent_module(module_name)
                lora_layer = LoraLayer(
                    module, 
                    lora_config.r, 
                    lora_config.lora_alpha, 
                    lora_config.lora_dropout, 
                    lora_config.lora_bias, 
                    lora_config.use_rslora,
                    return_lora_output=self.return_lora_outputs,
                )
                setattr(parent_module, child_name, lora_layer)

                # Store LoRA layers for weight loading
                module_name = module_name.rsplit('model.', 1)[-1]
                module_name = module_name.replace('.', '__DOT__')
                self.lora_layers[module_name] = lora_layer
    
    def _get_parent_module(self, module_name):
        parts = module_name.split('.')
        parent_module = self.base_model
        for part in parts[:-1]:
            parent_module = getattr(parent_module, part)
        return parent_module, parts[-1]

    def freeze_all(self):
        for param in self.base_model.parameters():
            param.requires_grad = False
    
    def unfreeze_all(self):
        for param in self.base_model.parameters():
            param.requires_grad = True
        
        for lora_layer in self.lora_layers.values():
            for param in lora_layer.parameters():
                param.requires_grad = True
    
    def load_lora_weights(self, lora_path):
        state_dict = load_file(lora_path)
        prefix = list(state_dict.keys())[0].rsplit('model.', 1)[0] + 'model.'
        for lora_layer_name, lora_layer in self.lora_layers.items():
            lora_layer_name = lora_layer_name.replace('__DOT__', '.')
            lora_layer_name = prefix + lora_layer_name
            if f'{lora_layer_name}.lora_A.weight' in state_dict and f'{lora_layer_name}.lora_B.weight' in state_dict:
                lora_layer.load_lora_weights(state_dict, lora_layer_name)
            else:
                # TODO: Print warning message
                pass
        print("LoRA weights loaded successfully!")
    
    def forward(self, input_ids, attention_mask=None):
        if self.return_lora_outputs:
            lora_outs = {}
            
            def _hook_fn(layer_name, module, _in, _out):
                if isinstance(_out, tuple) and len(_out) == 2:
                    layer_out, lora_out = _out
                    lora_outs[layer_name] = lora_out # Store nero_out separately
                    return layer_out # Return only layer_out to avoid breaking model flow

            # Register hooks to extract nero_out during forward pass
            hooks = []
            for layer_name, layer in self.lora_layers.items():
                hook = layer.register_forward_hook(functools.partial(_hook_fn, layer_name))
                hooks.append(hook)
        
            try:
                output = self.base_model(input_ids, attention_mask=attention_mask)
            finally:
                # Remove hooks after forward pass, ensuring it's done even if an error occurs
                for hook in hooks:
                    hook.remove()

            return output, lora_outs
        
        return self.base_model(input_ids, attention_mask=attention_mask)
    
    def __getattr__(self, name):
        try:
            return super().__getattr__(name) # Try getting attribute from self
        except AttributeError:
            return getattr(self.base_model, name) # Fallback to base_model

base_model1, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Meta-Llama-3.1-8B',
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
lora_config = LoraConfig.from_pretrained(model_configs['L1T1']['lora_dir'])
lora_model = LoraModel(base_model1, lora_config, return_lora_outputs=True)
lora_model.freeze_all()

==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.53.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [21]:
print("Check LoRA parameters (unloaded):")
check_lora_parameters(lora_model)
print()

lora_path = os.path.join(model_configs['L1T1']['lora_dir'], 'adapter_model.safetensors')
lora_model.load_lora_weights(lora_path)
print()

print("Check LoRA parameters (loaded):")
check_lora_parameters(lora_model)

Check LoRA parameters (unloaded):
- Name    : base_model.model.layers.0.self_attn.q_proj.lora_A.weight
- Mean    : -0.00032052432652562857
- Min     : -1.5238006114959717
- Max     : 1.4642316102981567

LoRA weights loaded successfully!

Check LoRA parameters (loaded):
- Name    : base_model.model.layers.0.self_attn.q_proj.lora_A.weight
- Mean    : 6.287686119321734e-05
- Min     : -0.04176201671361923
- Max     : 0.04242725297808647


In [22]:
# generate_text(lora_model, tokenizer, prompt="Preheat the oven to 350 degrees and place the cookie dough", skip_special_tokens=False)

## Nero Layer

In [4]:
class NeroLayer(nn.Module):
    def __init__(self, base_layer, 
                 # LoRA parameters
                 rank, alpha, dropout, lora_bias, use_rslora, 
                 # Nero parameters
                 nero_bias=False, 
                 # return_nero_output=False,
                 ):
        super().__init__()
        self.base_layer = base_layer
        self.device = base_layer.weight.device
        self.alpha = alpha
        self.lora_bias = lora_bias
        self.scaling = alpha / math.sqrt(rank) if use_rslora else alpha / rank
        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
        # self.return_nero_output = return_nero_output

        # Extract input and output features from the base layer
        in_features = getattr(base_layer, 'in_features', None)
        out_features = getattr(base_layer, 'out_features', None)

        if in_features is None or out_features is None:
            raise ValueError(f"Cannot determine in_features or out_features from {base_layer}.")
        
        # LoRA decomposition: A (down-projection) and B (up-projection)
        self.lora_A = nn.Linear(in_features, rank, bias=lora_bias).to(self.device)  # Projects down
        self.lora_B = nn.Linear(rank, out_features, bias=lora_bias).to(self.device) # Projects up

        # Initialize LoRA matrices: A ~ N(0, 1/rank), B initialized to 0
        std = 1 / torch.sqrt(torch.tensor(rank).float())
        nn.init.normal_(self.lora_A.weight, mean=0.0, std=std)
        nn.init.zeros_(self.lora_B.weight)

        # Nero decomposition: additional transformation applied to LoRA output
        self.nero_A = nn.Linear(out_features, rank, bias=nero_bias).to(self.device)
        self.nero_B = nn.Linear(rank, out_features, bias=nero_bias).to(self.device)

        # Initialize Nero matrices similarly
        nn.init.normal_(self.nero_A.weight, mean=0.0, std=std)
        nn.init.zeros_(self.nero_B.weight)

        # This will store the last nero output for access in NeroModel
        self.last_nero_out = None
        
    def forward(self, x):
        print(f"[NeroLayer] Forward pass executed for {self}")

        # Forward through base layer
        base_out = self.base_layer(x)
        # As per Tim Dettmers, for 4bit, we need to defensively clone here.
        # The reason is that in some cases, an error can occur that backprop
        # does not work on a manipulated view. This issue may be solved with
        # newer PyTorch versions but this would need extensive testing to be
        # sure.
        base_out = base_out.clone()

        print("base_out.requires_grad:", base_out.requires_grad)
        print("base_out.grad_fn:", base_out.grad_fn)

        # LoRA transformation
        requires_conversion = not torch.is_autocast_enabled()
        if requires_conversion:
            x = x.to(self.lora_A.weight.dtype)
        lora_out = self.lora_B(self.lora_A(self.dropout(x))) * self.scaling
        # if requires_conversion:
        #     lora_out = lora_out.to(base_out.dtype)

        print("lora_out.requires_grad:", lora_out.requires_grad)
        print("lora_out.grad_fn:", lora_out.grad_fn)

        # Nero transformation (applied on top of LoRA output)
        print('self.nero_A.weight:', self.nero_A.weight)
        test_out = self.nero_A(torch.randn(lora_out.shape).to(self.device))
        print("test_out:", test_out)

        nero_out = F.relu(self.nero_B(self.nero_A(self.dropout(lora_out))) * self.scaling)
        # nero_in = self.dropout(lora_out)
        # print("nero_in:", nero_in)
        # nero_A_out = self.nero_A(nero_in)
        # print("nero_A_out:", nero_A_out)
        # nero_B_out = self.nero_B(nero_A_out)
        # print("nero_B_out:", nero_B_out)
        # nero_out = F.relu(nero_B_out * self.scaling)
        # print("nero_out:", nero_out)
        if requires_conversion:
            nero_out = nero_out.to(base_out.dtype)
        self.last_nero_out = nero_out

        print("nero_out.requires_grad:", nero_out.requires_grad)
        print("nero_out.grad_fn:", nero_out.grad_fn)

        output = base_out + nero_out

        # if self.return_nero_output:
            # self.last_nero_out = nero_out
            # return output, nero_out
        
        return output

    def load_lora_weights(self, state_dict, prefix):
        self.lora_A.weight.data = state_dict[f'{prefix}.lora_A.weight'].to(self.device)
        self.lora_B.weight.data = state_dict[f'{prefix}.lora_B.weight'].to(self.device)
        if self.lora_bias:
            self.lora_A.bias.data = state_dict[f'{prefix}.lora_A.bias'].to(self.device)
            self.lora_B.bias.data = state_dict[f'{prefix}.lora_B.bias'].to(self.device)
    
class NeroModel(nn.Module):
    def __init__(self, base_model: nn.Module, lora_config: LoraConfig, nero_bias: bool=False, 
                 # return_nero_outputs: bool=False,
                 ):
        super().__init__()
        self.base_model = base_model
        self.nero_bias = nero_bias
        self.nero_layers = nn.ModuleDict()
        # self.return_nero_outputs = return_nero_outputs

        # Wrap target layers with NeroLayer
        self._wrap_target_layers(lora_config)
        
    def _wrap_target_layers(self, lora_config):
        for module_name, module in self.base_model.named_modules():
            if isinstance(module, NeroLayer):
                # Convert module name format and store reference
                module_name = module_name.rsplit('model.', 1)[-1]
                module_name = module_name.replace('.', '__DOT__')
                self.nero_layers[module_name] = module
                continue

            if any(module_name.endswith(target_module) for target_module in lora_config.target_modules) and isinstance(module, nn.Linear):    
                parent_module, child_name = self._get_parent_module(module_name)
                nero_layer = NeroLayer(
                    module, 
                    lora_config.r, 
                    lora_config.lora_alpha, 
                    lora_config.lora_dropout, 
                    lora_config.lora_bias, 
                    lora_config.use_rslora,
                    nero_bias=self.nero_bias,
                    # return_nero_output=self.return_nero_outputs,
                )
                setattr(parent_module, child_name, nero_layer)

                # Store LoRA layers for weight loading
                module_name = module_name.rsplit('model.', 1)[-1]
                module_name = module_name.replace('.', '__DOT__')
                self.nero_layers[module_name] = nero_layer
    
    def _get_parent_module(self, module_name):
        parts = module_name.split('.')
        parent_module = self.base_model
        for part in parts[:-1]:
            parent_module = getattr(parent_module, part)
        return parent_module, parts[-1]

    def freeze_except_nero(self):
        for param in self.base_model.parameters():
            param.requires_grad = False
        
        for nero_layer in self.nero_layers.values():
            for param_name, param in nero_layer.named_parameters():
                if 'nero_A' in param_name or 'nero_B' in param_name:
                    param.requires_grad = True
                else:
                    param.requires_grad = False
    
    def unfreeze_all(self):
        for param in self.base_model.parameters():
            param.requires_grad = True
        
        for nero_layer in self.nero_layers.values():
            for param in nero_layer.parameters():
                param.requires_grad = True
    
    def load_lora_weights(self, lora_path):
        state_dict = load_file(lora_path)
        prefix = list(state_dict.keys())[0].rsplit('model.', 1)[0] + 'model.'
        for nero_layer_name, nero_layer in self.nero_layers.items():
            nero_layer_name = nero_layer_name.replace('__DOT__', '.')
            nero_layer_name = prefix + nero_layer_name
            if f'{nero_layer_name}.lora_A.weight' in state_dict and f'{nero_layer_name}.lora_B.weight' in state_dict:
                nero_layer.load_lora_weights(state_dict, nero_layer_name)
            else:
                # TODO: Print warning message
                pass
        print("LoRA weights loaded successfully!")
    
    # def forward(self, input_ids, attention_mask=None):
    #     if self.training:
    #         nero_outs = {}
            
    #         def _hook_fn(layer_name, module, _in, _out):
    #             if isinstance(_out, tuple) and len(_out) == 2:
    #                 layer_out, nero_out = _out
    #                 nero_outs[layer_name] = nero_out # Store nero_out separately
    #                 return layer_out # Return only layer_out to avoid breaking model flow

    #         # Register hooks to extract nero_out during forward pass
    #         hooks = []
    #         for layer_name, layer in self.nero_layers.items():
    #             hook = layer.register_forward_hook(functools.partial(_hook_fn, layer_name))
    #             hooks.append(hook)
        
    #         try:
    #             output = self.base_model(input_ids, attention_mask=attention_mask)
    #         finally:
    #             # Remove hooks after forward pass, ensuring it's done even if an error occurs
    #             for hook in hooks:
    #                 hook.remove()

    #         return output, nero_outs
        
    #     return self.base_model(input_ids, attention_mask=attention_mask)

    def forward(self, input_ids, attention_mask=None):
        self_outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        # Collect nero outputs directly from layers
        nero_outs = {}
        for name, nero_layer in self.nero_layers.items():
            if hasattr(nero_layer, 'last_nero_out'):
                nero_outs[name] = nero_layer.last_nero_out  # Store after forward pass

        return self_outputs, nero_outs
    
    def __getattr__(self, name):
        try:
            return super().__getattr__(name) # Try getting attribute from self
        except AttributeError:
            return getattr(self.base_model, name) # Fallback to base_model

base_model2, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Meta-Llama-3.1-8B',
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
lora_config = LoraConfig.from_pretrained(model_configs['L2T1']['lora_dir'])
nero_model = NeroModel(base_model2, lora_config, nero_bias=True, 
                       # return_nero_outputs=True
                       )
# nero_model.freeze_except_nero()
# print(nero_model)

==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.53.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
nero_model.train()
nero_model.gradient_checkpointing_enable() # Fix error: 'LlamaDecoderLayer' object has no attribute '_gradient_checkpointing_func'
device = next(nero_model.parameters()).device
inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors='pt')
nero_model_outs = nero_model(input_ids=inputs['input_ids'].to(device))

[NeroLayer] Forward pass executed for NeroLayer(
  (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
  (dropout): Identity()
  (lora_A): Linear(in_features=4096, out_features=8, bias=False)
  (lora_B): Linear(in_features=8, out_features=4096, bias=False)
  (nero_A): Linear(in_features=4096, out_features=8, bias=True)
  (nero_B): Linear(in_features=8, out_features=4096, bias=True)
)
base_out.requires_grad: False
base_out.grad_fn: None
lora_out.requires_grad: False
lora_out.grad_fn: None
self.nero_A.weight: Parameter containing:
tensor([[ 0.3625, -0.2265,  0.5827,  ...,  0.3783,  0.5715, -0.3697],
        [-0.1446,  0.0784,  0.2471,  ...,  0.0278,  0.0392,  0.1049],
        [-0.3740,  0.2509,  0.0829,  ..., -0.1775, -0.0762, -0.7239],
        ...,
        [-0.5322,  0.6272,  0.2937,  ...,  0.0616,  0.6675,  0.2158],
        [-0.1484,  0.2143, -0.1039,  ..., -0.2257,  0.0461,  0.0494],
        [ 0.2304, -0.4242, -0.3902,  ..., -0.7668, -0.2101,  0.4244]],
       de

In [26]:
nero_model.unfreeze_all()

RuntimeError: only Tensors of floating point and complex dtype can require gradients

In [70]:
[x for x in dir(nero_model.base_model.model.layers[0].self_attn.q_proj.lora_A.weight) if 'grad' in x]

['_grad',
 '_grad_fn',
 '_post_accumulate_grad_hooks',
 'grad',
 'grad_fn',
 'register_post_accumulate_grad_hook',
 'requires_grad',
 'requires_grad_',
 'retain_grad',
 'retains_grad']

In [59]:
[x for x in dir(nero_model) if 'grad' in x]

['requires_grad_', 'zero_grad']

In [68]:
nero_model.base_model.model.layers[0].self_attn.q_proj.lora_A.requires_grad_()

Linear(in_features=4096, out_features=8, bias=False)

In [29]:
for n, p in nero_model.named_parameters():
    if 'base_layer.weight' in n: 
        print()
    print(n, "-->", p.requires_grad, p.dtype)

base_model.model.embed_tokens.weight --> True torch.float16

base_model.model.layers.0.self_attn.q_proj.base_layer.weight --> False torch.uint8
base_model.model.layers.0.self_attn.q_proj.lora_A.weight --> True torch.float32
base_model.model.layers.0.self_attn.q_proj.lora_B.weight --> True torch.float32
base_model.model.layers.0.self_attn.q_proj.nero_A.weight --> True torch.float32
base_model.model.layers.0.self_attn.q_proj.nero_A.bias --> True torch.float32
base_model.model.layers.0.self_attn.q_proj.nero_B.weight --> True torch.float32
base_model.model.layers.0.self_attn.q_proj.nero_B.bias --> True torch.float32

base_model.model.layers.0.self_attn.k_proj.base_layer.weight --> False torch.uint8
base_model.model.layers.0.self_attn.k_proj.lora_A.weight --> True torch.float32
base_model.model.layers.0.self_attn.k_proj.lora_B.weight --> True torch.float32
base_model.model.layers.0.self_attn.k_proj.nero_A.weight --> True torch.float32
base_model.model.layers.0.self_attn.k_proj.nero_A.bias -

In [71]:
import gc
import torch

# base_model1.to('cpu')
# lora_model.to('cpu')
# del base_model1
# del nero_model

print(torch.cuda.memory_allocated())

if 'nero_model' in globals():
    nero_model.to('cpu')
    del nero_model

gc.collect()
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated())

6283761152


: 

: 

: 

In [19]:
x = torch.randn(8, 4096).to('cuda')
print(x.requires_grad)
print('nero_A:', nero_model.base_model.model.layers[0].self_attn.q_proj.nero_A.weight)
y = nero_model.base_model.model.layers[0].self_attn.q_proj.nero_A(x)
print(y.requires_grad)

False
nero_model.base_model.model.layers[0].self_attn.q_proj.nero_A: Parameter containing:
tensor([[-4.4971e-01,  3.1203e-01,  5.2366e-01,  ...,  2.6862e-01,
          2.4798e-01,  6.7461e-04],
        [-4.1525e-02,  4.5143e-01, -5.7140e-01,  ..., -6.3062e-02,
         -1.2641e-01,  2.3381e-01],
        [-6.1668e-02, -2.4167e-01, -8.9052e-02,  ..., -1.3829e-01,
         -1.7905e-01,  1.7789e-02],
        ...,
        [ 1.9156e-02, -1.1599e+00,  3.7303e-01,  ...,  4.3789e-01,
          7.5028e-02, -4.8528e-01],
        [ 1.6513e-01,  7.7233e-01,  5.8815e-01,  ...,  2.0706e-01,
         -2.5718e-01, -5.1134e-01],
        [ 2.2792e-01, -2.5864e-01, -2.1519e-01,  ...,  2.5236e-01,
         -1.1390e-01, -8.2920e-01]], device='cuda:0', requires_grad=True)
True


In [13]:
print("Check LoRA parameters (unloaded):")
check_lora_parameters(nero_model)
print()

lora_path = os.path.join(model_configs['L2T1']['lora_dir'], 'adapter_model.safetensors')
nero_model.load_lora_weights(lora_path)
print()

print("Check LoRA parameters (loaded):")
check_lora_parameters(nero_model)

Check LoRA parameters (unloaded):


NameError: name 'check_lora_parameters' is not defined

In [None]:
# generate_text(nero_model, tokenizer, prompt="Preheat the oven to 350 degrees and place the cookie dough", skip_special_tokens=False)

In [None]:
nero_model.base_model.model.layers[0].self_attn.q_proj.nero_A(torch.randn(8, 4096).to('cuda'))

tensor([[-40.7062,  -6.6492,  -3.7754,  13.8341,  20.6145,  53.7147, -21.1631,
         -40.2303],
        [ -6.0338,  12.2223,   1.5039,  18.9884,  22.6390, -32.5255,  -3.8959,
          38.7502],
        [ 17.4199,  18.8145, -19.4615, -15.9465, -22.6114, -15.0024,  29.2715,
         -41.9063],
        [-18.6150, -13.4174,  -6.4494,  22.2234,   8.2466, -11.7426,   2.5520,
         -19.9913],
        [ -9.3683,   8.1962,  16.1248, -23.4653,  25.8517, -39.0575, -30.0681,
           4.5514],
        [ 38.9216, -17.0592, -25.0972,  12.2915,  -6.2447,   2.4214, -18.8855,
          52.9850],
        [  2.4511,  -6.4056, -43.2899, -56.0964, -24.5220, -17.7764,  32.7902,
         -22.3956],
        [ 12.9756,   8.9490,   7.6858,  -6.9334,  -5.3790,  11.6487,  26.5811,
          -5.7308]], device='cuda:0', grad_fn=<AddmmBackward0>)

# Training

In [None]:
lora_model.eval()
# lora_model.gradient_checkpointing_enable() # Fix error: 'LlamaDecoderLayer' object has no attribute '_gradient_checkpointing_func'
device = next(lora_model.parameters()).device
inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors='pt')
lora_model_outs = lora_model(input_ids=inputs['input_ids'].to(device))

In [None]:
nero_model.train()
nero_model.gradient_checkpointing_enable() # Fix error: 'LlamaDecoderLayer' object has no attribute '_gradient_checkpointing_func'
device = next(nero_model.parameters()).device
inputs = tokenizer("Preheat the oven to 350 degrees and place the cookie dough", return_tensors='pt')
nero_model_outs = nero_model(input_ids=inputs['input_ids'].to(device))

[NeroLayer] Forward pass executed for NeroLayer(
  (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
  (dropout): Identity()
  (lora_A): Linear(in_features=4096, out_features=8, bias=False)
  (lora_B): Linear(in_features=8, out_features=4096, bias=False)
  (nero_A): Linear(in_features=4096, out_features=8, bias=True)
  (nero_B): Linear(in_features=8, out_features=4096, bias=True)
)
base_out.requires_grad: False
base_out.grad_fn: None
lora_out.requires_grad: False
lora_out.grad_fn: None
nero_out.requires_grad: False
nero_out.grad_fn: None
[NeroLayer] Forward pass executed for NeroLayer(
  (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
  (dropout): Identity()
  (lora_A): Linear(in_features=4096, out_features=8, bias=False)
  (lora_B): Linear(in_features=8, out_features=1024, bias=False)
  (nero_A): Linear(in_features=1024, out_features=8, bias=True)
  (nero_B): Linear(in_features=8, out_features=1024, bias=True)
)
base_out.requires_grad:

In [None]:
nero_model_outs[1]['layers__DOT__0__DOT__self_attn__DOT__q_proj']

tensor([[[0.4004, 0.0000, 0.1152,  ..., 0.0000, 0.0000, 0.4817],
         [0.4004, 0.0000, 0.1152,  ..., 0.0000, 0.0000, 0.4817],
         [0.4004, 0.0000, 0.1152,  ..., 0.0000, 0.0000, 0.4817],
         ...,
         [0.4004, 0.0000, 0.1152,  ..., 0.0000, 0.0000, 0.4817],
         [0.4004, 0.0000, 0.1152,  ..., 0.0000, 0.0000, 0.4817],
         [0.4004, 0.0000, 0.1152,  ..., 0.0000, 0.0000, 0.4817]]],
       device='cuda:0', dtype=torch.float16)

In [None]:
def loss_func_v1(nero_outs, lora_outs):
    assert nero_outs.keys() == lora_outs.keys() # TODO: Print warning message
    total_loss = 0.0

    for layer_name in lora_outs.keys():
        # Normalized MSE loss
        mse_loss = F.mse_loss(nero_outs[layer_name], lora_outs[layer_name], reduction='sum') / torch.sum(nero_outs[layer_name] ** 2)
        total_loss += mse_loss

    return total_loss / len(lora_outs)  # Averaging loss across layers

loss = loss_func_v1(nero_model_outs[1], lora_model_outs[1])
print(loss)

tensor(1.0049, device='cuda:0', dtype=torch.float16)


In [None]:
loss.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
def loss_func(pred_outs, gt_outs, lambda_reg, lora_A_list, lora_B_list):
    total_loss = 0.0
    num_layers = len(gt_outs)

    for i in range(num_layers):
        # Normalized MSE loss
        mse_loss = F.mse_loss(pred_outs[i], gt_outs[i], reduction='sum') / torch.sum(pred_outs[i] ** 2)
        
        # L2 regularization for LoRA matrices
        reg_loss = lambda_reg * (torch.norm(lora_A_list[i], p=2) ** 2 + torch.norm(lora_B_list[i], p=2) ** 2)

        total_loss += mse_loss + reg_loss

    return total_loss / num_layers  # Averaging loss across layers