In [1]:
# Kill all processess on GPU
# !fuser -v /dev/nvidia* -k

# Libraries

In [2]:
%%capture
import os
if 'COLAB_' not in ''.join(os.environ.keys()):
    %pip install unsloth==2025.3.4
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    %pip install transformers==4.48.3
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth==2025.3.4

In [3]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM
from tqdm import tqdm

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1+cu121 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


# Config

In [15]:
# Project configs
seed = 69
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lang = 'id' # 'en' | 'id'
task = 'wikipedia' # 'wikipedia' | 'gsm8k'

# Data Configs
max_seq_length = 1024
hf_data_id = 'wikimedia/wikipedia' # 'wikimedia/wikipedia' | 'openai/gsm8k'
hf_data_dir = '20231101.id' # 'wikipedia': '20231101.en' | '20231101.id' || 'gsm8k': 'main'
test_size = 100
hf_data_split = f'train[-{test_size}:]'

# Model configs
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA config
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-id-LoRA-v20250403044132'
lora_dir = hf_lora_id.split('/')[-1]

# Download the trained LoRA adapter to the local directory
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    # ignore_patterns='checkpoint-*/*',
)

print("Hugging Face LoRA ID:", hf_lora_id)

Fetching 176 files:   0%|          | 0/176 [00:00<?, ?it/s]

HF LoRA ID: alxxtexxr/L3.1-8B-wikipedia-id-LoRA-v20250403044132


# Model

In [5]:
# Load the LoRA-adapted model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(lora_dir)
tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-8B')

model = model.to(device)
model.eval()

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

# Data

In [17]:
dataset = load_dataset(hf_data_id, data_dir=hf_data_dir, split=hf_data_split)
encodings = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
print(dataset)

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 100
})


# Evaluation

### References
- https://huggingface.co/docs/transformers/perplexity

In [19]:
stride = 512
seq_len = encodings.input_ids.size(1)

nll_sum = 0.0
n_tokens = 0
prev_end_loc = 0
for begin_loc in tqdm(range(0, seq_len, stride)):
    end_loc = min(begin_loc + max_seq_length, seq_len)
    trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
    target_ids = input_ids.clone()
    target_ids[:, :-trg_len] = -100

    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids)

        # loss is calculated using CrossEntropyLoss which averages over valid labels
        # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
        # to the left by 1.
        neg_log_likelihood = outputs.loss

    # Accumulate the total negative log-likelihood and the total number of tokens
    num_valid_tokens = (target_ids != -100).sum().item()  # number of valid tokens in target_ids
    batch_size = target_ids.size(0)
    num_loss_tokens = num_valid_tokens - batch_size  # subtract batch_size due to internal label shift
    nll_sum += neg_log_likelihood * num_loss_tokens
    n_tokens += num_loss_tokens

    prev_end_loc = end_loc
    if end_loc == seq_len:
        break

avg_nll = nll_sum / n_tokens  # average negative log-likelihood per token
ppl = torch.exp(avg_nll)

 98%|█████████▊| 122/124 [12:26<00:12,  6.12s/it]


In [23]:
print("Avgerage Negative Log-Likelihood (NLL):", avg_nll.item())
print("Perplexity (PPL):", ppl.item())

Avgerage Negative Log-Likelihood (NLL): 2.6521875858306885
Perplexity (PPL): 14.185035705566406
