In [7]:
# # Kill all processess on GPU
# !fuser -v /dev/nvidia* -k

: 

: 

: 

In [3]:
%%capture
import os
if 'COLAB_' not in ''.join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth

In [2]:
import torch  # type: ignore
# from datetime import datetime
# from datasets import load_dataset
from unsloth import FastLanguageModel # type: ignore
from trl import SFTTrainer  # type: ignore
from transformers import TrainingArguments  # type: ignore

# Project configs
seed = 69 # Nice.

# Model configs
max_seq_length = 1024
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-en-LoRA-v20250305134947'
lora_dir = hf_lora_id.split('/')[-1]

from huggingface_hub import snapshot_download # type: ignore
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    # ignore_patterns='checkpoint-*/*',
);

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


'/content/L3.1-8B-wikipedia-en-LoRA-v20250305134947'

In [3]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Meta-Llama-3.1-8B',
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
print(model)

==((====))==  Unsloth 2025.3.10: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRo

In [4]:
from peft import LoraConfig # type: ignore
from pprint import pprint

config = LoraConfig.from_pretrained(lora_dir)
pprint(config.__dict__)

{'_custom_modules': None,
 'alpha_pattern': {},
 'auto_mapping': None,
 'base_model_name_or_path': 'unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit',
 'bias': 'none',
 'eva_config': None,
 'exclude_modules': None,
 'fan_in_fan_out': False,
 'inference_mode': True,
 'init_lora_weights': True,
 'layer_replication': None,
 'layers_pattern': None,
 'layers_to_transform': None,
 'loftq_config': {},
 'lora_alpha': 16,
 'lora_bias': False,
 'lora_dropout': 0,
 'megatron_config': None,
 'megatron_core': 'megatron.core',
 'modules_to_save': None,
 'peft_type': <PeftType.LORA: 'LORA'>,
 'r': 8,
 'rank_pattern': {},
 'revision': None,
 'runtime_config': LoraRuntimeConfig(ephemeral_gpu_offload=False),
 'target_modules': {'down_proj',
                    'gate_proj',
                    'k_proj',
                    'o_proj',
                    'q_proj',
                    'up_proj',
                    'v_proj'},
 'task_type': 'CAUSAL_LM',
 'use_dora': False,
 'use_rslora': False}


In [8]:
from safetensors.torch import load_file # type: ignore

class LoraLayer(torch.nn.Module):
    def __init__(self, base_layer, rank=8, alpha=16):
        super().__init__()
        self.base_layer = base_layer
        self.rank = rank
        self.alpha = alpha

        in_features = getattr(base_layer, 'in_features', None)
        out_features = getattr(base_layer, 'out_features', None)

        if in_features is None or out_features is None:
            raise ValueError(f"Cannot determine in_features or out_features from {base_layer}")
        
        self.lora_A = torch.nn.Parameter(torch.randn(rank, in_features) * 0.01)
        self.lora_B = torch.nn.Parameter(torch.randn(out_features, rank) * 0.01)
        
        # Scaling factor
        self.scaling = alpha / rank

    def forward(self, x):
        return self.base_layer(x) + (self.lora_B @ (self.lora_A @ x)) * self.scaling

    def load_weights(self, state_dict, prefix):
        self.lora_A.data = state_dict[f'{prefix}.lora_A']
        self.lora_B.data = state_dict[f'{prefix}.lora_B']
    
class LoraModel(torch.nn.Module):
    def __init__(self, base_model: torch.nn.Module, config: LoraConfig):
        super().__init__()
        self.base_model = base_model
        self.lora_layers = torch.nn.ModuleDict()

        for module_name, module in self.base_model.named_modules():
            if any(module_name.endswith(target_module) for target_module in config.target_modules) and isinstance(module, torch.nn.Linear):
                # Check if the module is already a LoRA layer
                if isinstance(module, LoraLayer):
                    print(f"Skipping {module_name} as it is already a LoRA layer")
                    continue # Skip modification

                parent_module, child_name = self._get_parent_module(module_name)
                lora_layer = LoraLayer(module, config.r, config.lora_alpha)
                setattr(parent_module, child_name, lora_layer)
                self.lora_layers[module_name] = lora_layer
    
    def _get_parent_module(self, module_name):
        parts = module_name.split('.')
        parent_module = self.base_model
        for part in parts[:-1]:
            parent_module = getattr(parent_module, part)
        return parent_module, parts[-1]

    def forward(self, input_ids, attention_mask=None):
        return self.base_model(input_ids, attention_mask=attention_mask)

    def load_lora_weights(self, lora_path):
        state_dict = load_file(lora_path)
        for name, lora_layer in self.lora_layers.items():
            if f'{name}.lora_A' in state_dict and f'{name}.lora_B' in state_dict:
                lora_layer.load_weights(state_dict, name)
                print(f"Loaded LoRA weights for {name}")
        print("LoRA weights loaded successfully!")

lora_model = LoraModel(model, config)
print(lora_model)

LoraModel(
  (base_model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
      (layers): ModuleList(
        (0): LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): LoraLayer(
              (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            )
            (k_proj): LoraLayer(
              (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
            )
            (v_proj): LoraLayer(
              (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
            )
            (o_proj): LoraLayer(
              (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            )
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): LoraLayer(
              (base_layer): Linear4bit(in_features=4096, out_features=14336, bias=False)
            

In [9]:
for module_name, module in lora_model.named_parameters():
    print(module_name)

base_model.model.embed_tokens.weight
base_model.model.layers.0.self_attn.q_proj.lora_A
base_model.model.layers.0.self_attn.q_proj.lora_B
base_model.model.layers.0.self_attn.q_proj.base_layer.weight
base_model.model.layers.0.self_attn.k_proj.lora_A
base_model.model.layers.0.self_attn.k_proj.lora_B
base_model.model.layers.0.self_attn.k_proj.base_layer.weight
base_model.model.layers.0.self_attn.v_proj.lora_A
base_model.model.layers.0.self_attn.v_proj.lora_B
base_model.model.layers.0.self_attn.v_proj.base_layer.weight
base_model.model.layers.0.self_attn.o_proj.lora_A
base_model.model.layers.0.self_attn.o_proj.lora_B
base_model.model.layers.0.self_attn.o_proj.base_layer.weight
base_model.model.layers.0.mlp.gate_proj.lora_A
base_model.model.layers.0.mlp.gate_proj.lora_B
base_model.model.layers.0.mlp.gate_proj.base_layer.weight
base_model.model.layers.0.mlp.up_proj.lora_A
base_model.model.layers.0.mlp.up_proj.lora_B
base_model.model.layers.0.mlp.up_proj.base_layer.weight
base_model.model.laye

In [11]:
import os

lora_path = os.path.join(lora_dir, 'adapter_model.safetensors')
lora_model.load_lora_weights(lora_path)

LoRA weights loaded successfully!
