In [76]:
# # Kill all processess on GPU
# !fuser -v /dev/nvidia* -k

In [77]:
%%capture
import os
if 'COLAB_' not in ''.join(os.environ.keys()):
    %pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth

In [None]:
import torch  # type: ignore
# from datetime import datetime
# from datasets import load_dataset
from unsloth import FastLanguageModel, UnslothTrainer, UnslothTrainingArguments, is_bf16_supported # type: ignore
from trl import SFTTrainer  # type: ignore
from transformers import TrainingArguments  # type: ignore

# Project configs
seed = 69 # Nice.

# Model configs
max_seq_length = 1024
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-en-LoRA-v20250305134947'
lora_dir = hf_lora_id.split('/')[-1]

from huggingface_hub import snapshot_download # type: ignore
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    # ignore_patterns='checkpoint-*/*',
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

(…).tfevents.1741182714.5172c9540b89.8159.0:   0%|          | 0.00/50.1k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

(…).tfevents.1741219006.8da8ca61af90.1084.0:   0%|          | 0.00/29.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

(…).tfevents.1741232833.506a328f2d3f.1088.0:   0%|          | 0.00/74.1k [00:00<?, ?B/s]

(…).tfevents.1741192136.40cd2355fd32.1061.0:   0%|          | 0.00/29.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

'/content/L3.1-8B-wikipedia-en-LoRA-v20250305134947'

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name='unsloth/Meta-Llama-3.1-8B',
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
print(model)

==((====))==  Unsloth 2025.3.8: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRot

In [None]:
from peft import LoraConfig
from pprint import pprint

config = LoraConfig.from_pretrained(lora_dir)
pprint(config.__dict__)

{'_custom_modules': None,
 'alpha_pattern': {},
 'auto_mapping': None,
 'base_model_name_or_path': 'unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit',
 'bias': 'none',
 'eva_config': None,
 'exclude_modules': None,
 'fan_in_fan_out': False,
 'inference_mode': True,
 'init_lora_weights': True,
 'layer_replication': None,
 'layers_pattern': None,
 'layers_to_transform': None,
 'loftq_config': {},
 'lora_alpha': 16,
 'lora_bias': False,
 'lora_dropout': 0,
 'megatron_config': None,
 'megatron_core': 'megatron.core',
 'modules_to_save': None,
 'peft_type': <PeftType.LORA: 'LORA'>,
 'r': 8,
 'rank_pattern': {},
 'revision': None,
 'runtime_config': LoraRuntimeConfig(ephemeral_gpu_offload=False),
 'target_modules': {'down_proj',
                    'gate_proj',
                    'k_proj',
                    'o_proj',
                    'q_proj',
                    'up_proj',
                    'v_proj'},
 'task_type': 'CAUSAL_LM',
 'use_dora': False,
 'use_rslora': False}


In [79]:
from safetensors.torch import load_file  # Import for safetensors

class LoraLayer(torch.nn.Module):
    def __init__(self, base_layer, in_features, out_features, rank=8, alpha=16):
        super().__init__()
        self.base_layer = base_layer
        self.rank = rank
        self.alpha = alpha

        # Extract in_features and out_features safely
        in_features = getattr(base_layer, "in_features", None)
        out_features = getattr(base_layer, "out_features", None)

        if in_features is None or out_features is None:
            raise ValueError(f"Cannot determine in/out features from {base_layer}.")
        
        self.lora_A = torch.nn.Parameter(torch.randn(rank, self.in_features) * 0.01)
        self.lora_B = torch.nn.Parameter(torch.randn(self.out_features, rank) * 0.01)
        
        # Scaling factor
        self.scaling = alpha / rank

    def forward(self, x):
        return x + (self.lora_B @ (self.lora_A @ x)) * self.scaling

    def load_lora_weights(self, state_dict, prefix):
        """Loads LoRA weights from a state dictionary."""
        self.lora_A.data = state_dict[f"{prefix}.lora_A"]
        self.lora_B.data = state_dict[f"{prefix}.lora_B"]
    
    def __repr__(self):
        return f"LoraLayer(in_features={self.in_features}, out_features={self.out_features}, rank={self.rank}, alpha={self.alpha})"

class LoraModel(torch.nn.Module):
    def __init__(self, model: torch.nn.Module, config: LoraConfig):
        super().__init__()
        self.model = model  # Use preloaded model
        self.lora_layers = torch.nn.ModuleDict()  # Use ModuleDict to register submodules

        # Apply LoRA to specific target modules
        for name, module in self.model.named_modules():
            if any(target in name for target in config.target_modules) and isinstance(module, torch.nn.Linear):
                # Replace the original module with a LoRA-wrapped version
                parent_module, child_name = self._get_parent_module(name)
                setattr(parent_module, child_name, LoraLayer(module, config.r, config.lora_alpha))
    
    def _get_parent_module(self, module_name):
        """Helper function to get the parent module and attribute name."""
        parts = module_name.split('.')
        parent = self.model
        for part in parts[:-1]:
            parent = getattr(parent, part)
        return parent, parts[-1]

    def forward(self, input_ids, attention_mask=None):
        return self.model(input_ids, attention_mask=attention_mask)

    def load_lora_weights(self, lora_path):
        """Loads LoRA weights from a file."""
        state_dict = load_file(lora_path)
        for name, lora_layer in self.lora_layers.items():
            if f"{name}.lora_A" in state_dict and f"{name}.lora_B" in state_dict:
                lora_layer.load_lora_weights(state_dict, name)
        print("LoRA weights loaded successfully!")

lora_model = LoraModel(model, config)
print(lora_model)

LoraModel(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
      (layers): ModuleList(
        (0-31): 32 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): LoraLayer()
            (k_proj): LoraLayer()
            (v_proj): LoraLayer()
            (o_proj): LoraLayer()
            (rotary_emb): LlamaRotaryEmbedding()
          )
          (mlp): LlamaMLP(
            (gate_proj): LoraLayer()
            (up_proj): LoraLayer()
            (down_proj): LoraLayer()
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        )
      )
      (norm): LlamaRMSNorm((4096,), eps=1e-05)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (lm_head): Linear(in_features=4096, out_features=128256, bias=False)
  )
  (lora_layers): ModuleDict()
)
