In [1]:
# Kill all processes on GPU
# !fuser -v /dev/nvidia* -k

# Libraries

In [2]:
%%capture
# Install required libraries (optimized for Colab/Kaggle notebooks)
# The %%capture cell magic on the first line silences this cell's output,
# so the pip logs are not rendered in the notebook.
import os
# Environment detection: all environment-variable names are concatenated into
# one string and searched for the substring 'COLAB_'.
if 'COLAB_' not in ''.join(os.environ.keys()):
    # No 'COLAB_' variable present: plain install, pip resolves unsloth's dependencies.
    %pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    # NOTE(review): the condition above only looks for 'COLAB_'; a Kaggle kernel
    # reaches this branch only if it also exposes such a variable - confirm.
    # The packages below are installed with --no-deps (pip does not pull in their
    # dependencies); xformers is the only one pinned to an exact version.
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    # These are installed normally, dependencies included.
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    # unsloth itself goes last, again without dependency resolution.
    %pip install --no-deps unsloth

In [3]:
import torch
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
from huggingface_hub import snapshot_download
from pprint import pprint

# Config

In [4]:
# --- Project-level settings ---
seed = 69
device = 'cuda'

# --- Model settings ---
max_seq_length = 1024
# dtype: None lets the loader auto-detect; use Float16 on Tesla T4 / V100 and
# Bfloat16 on Ampere or newer.
dtype = None
# load_in_4bit: 4bit quantization lowers memory usage; may be set to False.
load_in_4bit = True

# --- LoRA adapter ---
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-en-LoRA-v20250305134947'
# The local directory name is whatever follows the last '/' of the repo id.
lora_dir = hf_lora_id.rpartition('/')[2]

# Model

### References
- [PEFT Quicktour - Inference](https://huggingface.co/docs/peft/main/en/quicktour#inference)

In [6]:
# Download the trained LoRA adapter to the local directory
# Fetches the Hub repository `hf_lora_id` into `lora_dir`. The call is the last
# expression of the cell, so its return value (the local path) is displayed.
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    # Disabled filter: if enabled it would skip files matching 'checkpoint-*/*'
    # (presumably intermediate training checkpoints - confirm before enabling).
    # ignore_patterns='checkpoint-*/*',
)

Fetching 99 files:   0%|          | 0/99 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/812 [00:00<?, ?B/s]

'/content/L3.1-8B-wikipedia-en-LoRA-v20250305134947'

In [7]:
# Build the LoRA-adapted model from the downloaded adapter directory and move it
# to the configured device in one step.
model = AutoPeftModelForCausalLM.from_pretrained(lora_dir).to(device)

# The tokenizer is taken from the base model repository, not from the adapter directory.
tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-8B')

# Switch to evaluation mode; left as the final expression so the cell also
# displays the module structure.
model.eval()

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

In [21]:
import inspect

# Walk down to the query projection of the first decoder layer and print the
# source code of the class that implements it.
q_proj = model.base_model.model.model.layers[0].self_attn.q_proj
print(inspect.getsource(q_proj.__class__))

    class Linear4bit(torch.nn.Module, LoraLayer):
        # Lora implemented in a dense layer
        def __init__(
            self,
            base_layer: torch.nn.Module,
            adapter_name: str,
            r: int = 0,
            lora_alpha: int = 1,
            lora_dropout: float = 0.0,
            init_lora_weights: bool = True,
            use_rslora: bool = False,
            use_dora: bool = False,
            lora_bias: bool = False,
            **kwargs,
        ) -> None:
            super().__init__()
            LoraLayer.__init__(self, base_layer)
            self.fan_in_fan_out = False

            self._active_adapter = adapter_name
            self.update_layer(
                adapter_name,
                r,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
                init_lora_weights=init_lora_weights,
                use_rslora=use_rslora,
                use_dora=use_dora,
                lora_bias=lora_bias,
       

In [25]:
# List every named parameter of the model together with its dtype.
for name, param in model.named_parameters():
    print(f"{name} --> {param.dtype}")

base_model.model.model.embed_tokens.weight --> torch.float16
base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight --> torch.uint8
base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight --> torch.float32
base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight --> torch.float32
base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight --> torch.uint8
base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight --> torch.float32
base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight --> torch.float32
base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight --> torch.uint8
base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight --> torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight --> torch.float32
base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight --> torch.uint8
base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight --

: 

In [12]:
def generate_text(model, prompt, max_new_tokens=50, skip_special_tokens=True):
    """Generate a continuation of `prompt` with `model` and print it.

    Relies on the notebook-level `tokenizer` and `device` variables.

    Args:
        model: Object exposing `generate(...)`; it is expected to already live on `device`.
        prompt: Text to tokenize and continue.
        max_new_tokens: Forwarded to `model.generate`.
        skip_special_tokens: Forwarded to `tokenizer.batch_decode`; pass False to
            keep special tokens in the printed text.

    Returns:
        None. The first decoded sequence is printed.
    """
    encoded = tokenizer(prompt, return_tensors='pt')
    # Forward the attention mask together with the ids: without it `generate`
    # has to guess which positions are padding.
    outputs = model.generate(
        input_ids=encoded['input_ids'].to(device),
        attention_mask=encoded['attention_mask'].to(device),
        max_new_tokens=max_new_tokens,
    )
    # batch_decode accepts the generated tensor directly, so no manual
    # detach/cpu/numpy conversion is needed.
    print(tokenizer.batch_decode(outputs, skip_special_tokens=skip_special_tokens)[0])

# Seed PyTorch's random number generators with the configured `seed` so the
# output can be reproduced when generation samples tokens (it has no effect on
# purely greedy decoding).
torch.manual_seed(seed)

# Keep special tokens in the printed text so the leading begin-of-text marker is visible.
generate_text(
    model,
    prompt="Preheat the oven to 350 degrees and place the cookie dough",
    skip_special_tokens=False,
)

<|begin_of_text|>Preheat the oven to 350 degrees and place the cookie dough on a cookie sheet.
Bake for 12 minutes and then remove from the oven.
While the cookies are baking, combine the butter and marshmallows in a medium saucepan.
Cook over medium heat, stirring frequently, until the butter is melted


: 