In [1]:
# Kill all processess on GPU
# !fuser -v /dev/nvidia* -k

# Libraries

In [11]:
%%capture
import os
if 'COLAB_' not in ''.join(os.environ.keys()):
    %pip install unsloth==2025.3.4
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    %pip install transformers==4.48.3
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth==2025.3.4

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
# To fix ValueError: Invalid pattern: ‘**’ can only be an entire path component
%pip install -qU datasets huggingface_hub fsspec

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth-zoo 2025.7.10 requires datasets<4.0.0,>=3.4.1, but you have datasets 4.0.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, AutoPeftModelForCausalLM
from tqdm import tqdm
from pprint import pprint

# Configurations

In [3]:
# Project configuration
seed = 69
device = 'cuda'
lang = 'es' # 'en' | 'id' | 'es'
task = 'wikipedia' # 'wikipedia' | 'gsm8k'

# Data configuration
test_size = 1000
max_seq_length = 1024

hf_data_id_map = {
    'wikipedia': 'wikimedia/wikipedia',
    'gsm8k': 'openai/gsm8k',
}
hf_data_id = hf_data_id_map[task]
hf_data_dir = f'20231101.{lang}' if task == 'wikipedia' else 'main'
hf_data_split = f'train[-{test_size}:]'

# Model configuration
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA configuration
# Hugging Face LoRA IDs:
# - wikipedia-es    : 'alxxtexxr/L3.1-8B-wikipedia-es-5K-LoRA-v20250722140838'
# - wikipedia-en    : 'alxxtexxr/L3.1-8B-wikipedia-en-5K-LoRA-v20250630122650'
# - gsm8k           : 'alxxtexxr/L3.1-8B-gsm8k-en-LoRA-v20250330164709'
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-es-5K-LoRA-v20250722140838'
lora_dir = hf_lora_id.split('/')[-1]

# Download the trained LoRA adapter to the local directory
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    ignore_patterns=[f'checkpoint-{i}/*' for i in range(0, 1870, 50) if i not in [650, 1250, 1875]],
)

print("Hugging Face LoRA ID:", hf_lora_id)

Fetching 42 files:   0%|          | 0/42 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

training_args.bin:   0%|          | 0.00/5.88k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

training_args.bin:   0%|          | 0.00/5.88k [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

adapter_config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

rng_state.pth:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/83.9M [00:00<?, ?B/s]

optimizer.pt:   0%|          | 0.00/43.1M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

scheduler.pt:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.88k [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

(…).tfevents.1753193392.b8cf4694f9d9.4370.0:   0%|          | 0.00/402k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

trainer_state.json: 0.00B [00:00, ?B/s]

training_args.bin:   0%|          | 0.00/5.88k [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Hugging Face LoRA ID: alxxtexxr/L3.1-8B-wikipedia-es-5K-LoRA-v20250722140838


In [4]:
lora_config = LoraConfig.from_pretrained(lora_dir)
pprint(lora_config)

LoraConfig(task_type='CAUSAL_LM',
           peft_type=<PeftType.LORA: 'LORA'>,
           auto_mapping=None,
           base_model_name_or_path='unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit',
           revision=None,
           inference_mode=True,
           r=8,
           target_modules={'down_proj',
                           'gate_proj',
                           'k_proj',
                           'o_proj',
                           'q_proj',
                           'up_proj',
                           'v_proj'},
           exclude_modules=None,
           lora_alpha=16,
           lora_dropout=0,
           fan_in_fan_out=False,
           bias='none',
           use_rslora=False,
           modules_to_save=None,
           init_lora_weights=True,
           layers_to_transform=None,
           layers_pattern=None,
           rank_pattern={},
           alpha_pattern={},
           megatron_config=None,
           megatron_core='megatron.core',
           trainable_token_

Reference: https://huggingface.co/docs/transformers/perplexity

In [5]:
def compute_ppl(model, encodings, max_seq_length, stride=512, return_avg_nll=False):
    seq_length = encodings.input_ids.size(1)

    nll_sum = 0.0
    n_tokens = 0
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_length, stride)):
        end_loc = min(begin_loc + max_seq_length, seq_length)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over valid labels
            # N.B. the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels
            # to the left by 1.
            neg_log_likelihood = outputs.loss

        # Accumulate the total negative log-likelihood and the total number of tokens
        num_valid_tokens = (target_ids != -100).sum().item()  # number of valid tokens in target_ids
        batch_size = target_ids.size(0)
        num_loss_tokens = num_valid_tokens - batch_size  # subtract batch_size due to internal label shift
        nll_sum += neg_log_likelihood * num_loss_tokens
        n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_length:
            break

    avg_nll = nll_sum / n_tokens  # average negative log-likelihood per token
    ppl = torch.exp(avg_nll)
    if return_avg_nll:
        return ppl, avg_nll
    return ppl

# Data

In [6]:
tokenizer = AutoTokenizer.from_pretrained(lora_dir)
dataset = load_dataset(hf_data_id, data_dir=hf_data_dir, split=hf_data_split)
eos_token = tokenizer.eos_token

def format_gsm8k_prompts(examples):
    gsm8k_prompt = """### Instruction:
Solve the following math problem step by step.

### Question: 
{question}

### Answer: 
{answer}""" + eos_token
    
    return {'text': [gsm8k_prompt.format(question=question, answer=answer) for question, answer in zip(examples['question'], examples['answer'])]}

def format_prompts(examples):
    return {'text': [example + eos_token for example in examples['text']]}

if task == 'gsm8k':
    dataset = dataset.map(format_gsm8k_prompts, batched=True)
else:
    dataset = dataset.map(format_prompts, batched=True)

encodings = tokenizer("\n\n".join(dataset['text']), return_tensors='pt')
print(dataset)

Token indices sequence length is longer than the specified maximum sequence length for this model (882192 > 131072). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 1000
})


In [7]:
# Count total tokens
def count_total_tokens(dataset, tokenizer):
    total_tokens = 0
    for text in dataset['text']:
        tokens = tokenizer.encode(text, add_special_tokens=False)
        total_tokens += len(tokens)
    return total_tokens

total_token_count = count_total_tokens(dataset, tokenizer)
print(f"Total tokens in dataset: {total_token_count}")

Total tokens in dataset: 881197


# Base Model

In [8]:
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(lora_config.base_model_name_or_path, device_map='auto')
base_model = base_model.to(device)
base_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
      (1): LlamaDecoder

In [9]:
ppl, avg_nll = compute_ppl(base_model, encodings, max_seq_length, return_avg_nll=True)
print("Avgerage Negative Log-Likelihood (NLL):", avg_nll.item())
print("Perplexity (PPL):", ppl.item())

100%|█████████▉| 1722/1724 [13:16<00:00,  2.16it/s]

Avgerage Negative Log-Likelihood (NLL): 1.645264744758606
Perplexity (PPL): 5.182381629943848





# LoRA Model

In [12]:
# Load the LoRA-adapted model
lora_model = AutoPeftModelForCausalLM.from_pretrained(os.path.join(lora_dir, 'checkpoint-650'), device_map='auto')
lora_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

In [13]:
ppl, avg_nll = compute_ppl(lora_model, encodings, max_seq_length, return_avg_nll=True)
print("Avgerage Negative Log-Likelihood (NLL):", avg_nll.item())
print("Perplexity (PPL):", ppl.item())

100%|█████████▉| 1722/1724 [16:58<00:01,  1.69it/s]

Avgerage Negative Log-Likelihood (NLL): 1.6238372325897217
Perplexity (PPL): 5.0725178718566895



