In [None]:
# Kill all processes currently using the GPU (uncomment the line below to run)
# !fuser -v /dev/nvidia* -k

# Libraries

In [None]:
%%capture
# Install the notebook's dependencies; the %%capture magic above hides pip's output.
import os
# Branch on whether any environment variable name contains 'COLAB_'.
if 'COLAB_' not in ''.join(os.environ.keys()):
    # No COLAB_* variable found: a single pinned unsloth install.
    %pip install unsloth==2025.3.4
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    # Several packages below are installed with --no-deps, i.e. without pulling their dependencies.
    %pip install transformers==4.48.3
    %pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    %pip install --no-deps cut_cross_entropy unsloth_zoo
    %pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    %pip install --no-deps unsloth==2025.3.4

In [None]:
# Upgrade datasets, huggingface_hub and fsspec together to fix:
#   ValueError: Invalid pattern: '**' can only be an entire path component
# NOTE(review): this upgrade is unpinned, so the installed versions depend on when the cell is run.
%pip install -qU datasets huggingface_hub fsspec

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, AutoPeftModelForCausalLM
from tqdm import tqdm
from pprint import pprint

# Config

In [None]:
# Project configs
seed = 69
device = 'cuda'
lang = 'es' # 'en' | 'id' | 'es'
task = 'wikipedia' # 'wikipedia' | 'gsm8k'

# Data configs
max_seq_length = 1024 # Context window (in tokens) passed to compute_ppl
# Dataset repo and sub-directory are both derived from `task` so they can never disagree
hf_data_id = 'wikimedia/wikipedia' if task == 'wikipedia' else 'openai/gsm8k'
hf_data_dir = f'20231101.{lang}' if task == 'wikipedia' else 'main' # wikipedia: '20231101.en' | '20231101.id' || gsm8k: 'main'
test_size = 1000
hf_data_split = f'train[-{test_size}:]' # Evaluate on the last `test_size` rows of the train split

# Model configs
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA configs
# Hugging Face LoRA IDs:
# - wikipedia-es    : 'alxxtexxr/L3.1-8B-wikipedia-es-5K-LoRA-v20250722140838'
# - wikipedia-en    : 'alxxtexxr/L3.1-8B-wikipedia-en-5K-LoRA-v20250630122650'
# - gsm8k           : 'alxxtexxr/L3.1-8B-gsm8k-en-LoRA-v20250330164709'
# NOTE(review): `lang` is 'es' while the adapter below is the wikipedia-en one --
# confirm that this cross-lingual evaluation is intended.
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-en-5K-LoRA-v20250630122650'
lora_dir = hf_lora_id.split('/')[-1] # Local directory named after the repo (without the owner prefix)

# Download the trained LoRA adapter to the local directory
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=hf_lora_id,
    local_dir=lora_dir,
    # Patterns are fnmatch-ed against file paths inside the repo, so a bare directory
    # name such as 'checkpoint-50' never matches 'checkpoint-50/<file>'. The wildcard
    # form skips the contents of every intermediate checkpoint directory.
    ignore_patterns=['checkpoint-*/*'],
)

print("Hugging Face LoRA ID:", hf_lora_id)

In [None]:
# Read the adapter configuration saved in the downloaded LoRA directory and show it.
# Its `base_model_name_or_path` field is used further down to load the base model.
lora_config = LoraConfig.from_pretrained(lora_dir)
pprint(lora_config)

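The sliding-window perplexity computation below follows the Hugging Face guide "Perplexity of fixed-length models": https://huggingface.co/docs/transformers/perplexity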

In [None]:
def compute_ppl(model, encodings, max_seq_length, stride=512, return_avg_nll=False):
    """Compute sliding-window perplexity of a causal LM over one long token sequence.

    The sequence is scored in overlapping windows of at most `max_seq_length`
    tokens that advance by `stride` tokens. In each window only the tokens not
    already scored by the previous window contribute to the loss; the earlier
    tokens are masked with -100 and serve purely as context.

    Args:
        model: Callable as `model(input_ids, labels=...)` returning an object
            whose `.loss` is the mean cross-entropy over the unmasked labels,
            computed after shifting the labels left by one position.
        encodings: Object with an `input_ids` tensor of shape (batch, seq_length).
        max_seq_length: Maximum number of tokens per window.
        stride: Step between window starts. Should not exceed `max_seq_length`,
            otherwise the tokens between two windows are never scored.
        return_avg_nll: If True, also return the average negative log-likelihood.

    Returns:
        `ppl`, or `(ppl, avg_nll)` when `return_avg_nll` is True, as scalar tensors.

    Raises:
        ValueError: If no token could be scored (fewer than two tokens in the input).
    """
    seq_length = encodings.input_ids.size(1)

    # Put the inputs where the model's weights live rather than relying on a
    # notebook-level `device` global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    nll_sum = 0.0
    n_tokens = 0
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_length, stride)):
        end_loc = min(begin_loc + max_seq_length, seq_length)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(model_device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100  # context-only tokens do not contribute to the loss

        # The model shifts labels left by one, so position 0 is never a prediction
        # target. Counting on the shifted labels gives the exact number of tokens
        # behind `outputs.loss`: subtracting one per row is only right for the
        # first window, because in later windows the dropped label is already masked.
        num_loss_tokens = (target_ids[:, 1:] != -100).sum().item()

        # A window with nothing to score would yield a NaN mean loss; skip it.
        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)

            # `outputs.loss` is a mean, so weight it by the token count to get a sum
            nll_sum += outputs.loss * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_length:
            break

    if n_tokens == 0:
        raise ValueError("compute_ppl needs at least two tokens to score a prediction")

    avg_nll = nll_sum / n_tokens  # average negative log-likelihood per token
    ppl = torch.exp(avg_nll)
    if return_avg_nll:
        return ppl, avg_nll
    return ppl

# Data

In [None]:
# The tokenizer (and its end-of-sequence marker) is loaded from the adapter directory
tokenizer = AutoTokenizer.from_pretrained(lora_dir)
eos_token = tokenizer.eos_token

# Evaluation slice selected by the data settings in the config cell
dataset = load_dataset(
    hf_data_id,
    data_dir=hf_data_dir,
    split=hf_data_split,
)

def format_gsm8k_prompts(examples):
    """Render a batch of GSM8K rows as instruction-style prompts ending in the EOS token.

    `examples` is a batched mapping with parallel 'question' and 'answer' lists.
    Returns a mapping with a single 'text' list, one rendered prompt per row.
    """
    # Same template as a multi-line literal; the spaces after "Question:" and
    # "Answer:" are part of the prompt and must be kept.
    template = (
        "### Instruction:\n"
        "Solve the following math problem step by step.\n"
        "\n"
        "### Question: \n"
        "{question}\n"
        "\n"
        "### Answer: \n"
        "{answer}"
    ) + eos_token

    texts = []
    for question, answer in zip(examples['question'], examples['answer']):
        texts.append(template.format(question=question, answer=answer))
    return {'text': texts}

def format_prompts(examples):
    """Append the EOS token to every entry of the batched 'text' column."""
    texts = []
    for text in examples['text']:
        texts.append(text + eos_token)
    return {'text': texts}

# Pick the formatter that matches the configured task, then rebuild the 'text' column
formatter = format_gsm8k_prompts if task == 'gsm8k' else format_prompts
dataset = dataset.map(formatter, batched=True)

# Join every example into one long string (blank line between examples) and
# tokenize it in a single call
encodings = tokenizer(
    "\n\n".join(dataset['text']),
    return_tensors='pt',
)
print(dataset)

# Base Model

In [None]:
# Load the base checkpoint named in the adapter config, move it to the
# configured device and switch it to evaluation mode
base_model = AutoModelForCausalLM.from_pretrained(
    lora_config.base_model_name_or_path,
    device_map='auto',
).to(device)
base_model.eval()

In [None]:
# Score the evaluation text with the base model (no adapter) and report both metrics
ppl, avg_nll = compute_ppl(base_model, encodings, max_seq_length, return_avg_nll=True)
print("Average Negative Log-Likelihood (NLL):", avg_nll.item())
print("Perplexity (PPL):", ppl.item())

# LoRA Model

In [None]:
# Load the model together with the LoRA adapter from the local adapter directory,
# move it to the configured device and switch it to evaluation mode
lora_model = AutoPeftModelForCausalLM.from_pretrained(lora_dir).to(device)
lora_model.eval()

In [None]:
# Score the same evaluation text with the LoRA-adapted model and report both metrics
ppl, avg_nll = compute_ppl(lora_model, encodings, max_seq_length, return_avg_nll=True)
print("Average Negative Log-Likelihood (NLL):", avg_nll.item())
print("Perplexity (PPL):", ppl.item())