In [1]:
# Kill all processess on GPU
# !fuser -v /dev/nvidia* -k

# Libraries

In [40]:
import time
import json
import torch
import evaluate
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import LoraConfig, AutoPeftModelForCausalLM
from tqdm import tqdm
from pprint import pprint

# Configurations

In [28]:
# Project configuration
seed = 69
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lang = 'id' # 'en' | 'id'
task = 'wikipedia' # 'wikipedia' | 'gsm8k'

# Data configuration
test_size = 100

# Model configuration
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA configuration
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-id-5K-LoRA-v20250628061607'
lora_dir = hf_lora_id.split('/')[-1]

# Download the trained LoRA adapter to the local directory
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id=hf_lora_id, 
    local_dir=lora_dir, 
    # ignore_patterns='checkpoint-*/*',
)

print("Hugging Face LoRA ID:", hf_lora_id)

Fetching 427 files:   0%|          | 0/427 [00:00<?, ?it/s]

Hugging Face LoRA ID: alxxtexxr/L3.1-8B-wikipedia-id-5K-LoRA-v20250628061607


In [4]:
lora_config = LoraConfig.from_pretrained(lora_dir)
pprint(lora_config)

LoraConfig(task_type='CAUSAL_LM',
           peft_type=<PeftType.LORA: 'LORA'>,
           auto_mapping=None,
           base_model_name_or_path='unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit',
           revision=None,
           inference_mode=True,
           r=8,
           target_modules={'down_proj',
                           'gate_proj',
                           'k_proj',
                           'o_proj',
                           'q_proj',
                           'up_proj',
                           'v_proj'},
           exclude_modules=None,
           lora_alpha=16,
           lora_dropout=0,
           fan_in_fan_out=False,
           bias='none',
           use_rslora=False,
           modules_to_save=None,
           init_lora_weights=True,
           layers_to_transform=None,
           layers_pattern=None,
           rank_pattern={},
           alpha_pattern={},
           megatron_config=None,
           megatron_core='megatron.core',
           trainable_token_

# Data

In [5]:
# Download the Indonesian SQuAD dataset
!mkdir data
!cd data && wget https://raw.githubusercontent.com/Wikidepia/SQuAD-id/refs/heads/master/data/train-SQuAD-id.json

--2025-06-29 09:54:32--  https://raw.githubusercontent.com/Wikidepia/SQuAD-id/refs/heads/master/data/train-SQuAD-id.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37852227 (36M) [text/plain]
Saving to: ‘train-SQuAD-id.json’


2025-06-29 09:54:33 (76.4 MB/s) - ‘train-SQuAD-id.json’ saved [37852227/37852227]



In [26]:
# Load and convert data to Huggingface format
# Source: https://github.com/Wikidepia/indonesian_datasets/blob/master/question-answering/squad/convert_huggingface.py
with open('data/train-SQuAD-id.json', 'r') as f:
    content = json.load(f)

hf_data = []
for data in content["data"]:
    title = data["title"]
    for paragraph in data["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            fill = {
                "id":  qa["id"],
                "title": title,
                "context": context,
                "question": qa["question"],
                "answers": {"answer_start": [], "text": []}
            }
            if qa["is_impossible"]:
                answers = qa["plausible_answers"]
            else:
                answers = qa["answers"]
            for answer in answers:
                fill["answers"]["answer_start"].append(answer["answer_start"])
                fill["answers"]["text"].append(answer["text"])
            hf_data.append(fill)

In [27]:
# Create Huggingface dataset
data = Dataset.from_list(hf_data)
print(data)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 114825
})


# Base Model

In [8]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(lora_dir)

In [10]:
# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(lora_config.base_model_name_or_path, device_map='auto')
base_model = base_model.to(device)
base_model.eval()

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
      (1): LlamaDecoder

# LoRA Model

In [30]:
# Load the LoRA-adapted model
lora_model = AutoPeftModelForCausalLM.from_pretrained(lora_dir, device_map='auto')
lora_model = lora_model.to(device)
lora_model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0): LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4b

# Evaluation

In [43]:
metric = evaluate.load('squad')

def generate_answer(example, model, tokenizer, device, max_new_tokens=50):
    prompt = f"Context: {example['context']}\nQuestion: {example['question']}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).split("Answer:")[-1].strip()
    return {'id': example['id'], 'prediction_text': answer}

def eval(metric, data, model, tokenizer, device, test_size=100):
    start_time = time.time()
    
    subset = data.select(range(test_size))
    predictions = []
    for example in tqdm(subset, desc="Evaluating", unit="sample"):
        prediction = generate_answer(example, model, tokenizer, device)
        predictions.append(prediction)
    references = [{'id': example['id'], 'answers': example['answers']} for example in subset]

    results = metric.compute(predictions=predictions, references=references)
    results['duration'] = time.time() - start_time
    return results

In [44]:
# Evaluate the base model
results = eval(metric, data, base_model, tokenizer, device, test_size=10)
pprint(results)

Evaluating: 100%|██████████| 10/10 [00:49<00:00,  4.91s/sample]

{'duration': 49.16969347000122, 'exact_match': 0.0, 'f1': 8.460798460798461}





In [45]:
# Evaluate the LoRA-adapted  model
results = eval(metric, data, lora_model, tokenizer, device, test_size)
pprint(results)

Evaluating: 100%|██████████| 100/100 [08:04<00:00,  4.84s/sample]

{'duration': 484.4948801994324, 'exact_match': 0.0, 'f1': 5.713625387686947}



