In [1]:
# Kill all processes on GPU
# !fuser -v /dev/nvidia* -k

# Libraries

In [1]:
import json
from pprint import pprint

import torch
from datasets import Dataset
from evaluate import evaluator
from huggingface_hub import snapshot_download
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, LlamaForQuestionAnswering, AutoTokenizer, pipeline

# Configurations

In [2]:
# Project configs
seed = 69
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
lang = 'id' # 'en' | 'id'
task = 'wikipedia' # 'wikipedia' | 'gsm8k'

# Data Configs
max_seq_length = 1024
# hf_data_id = 'wikimedia/wikipedia' # 'wikimedia/wikipedia' | 'openai/gsm8k'
# hf_data_dir = f'20231101.{lang}' if task == 'wikipedia' else 'main' # wikipedia: '20231101.en' | '20231101.id' || gsm8k: 'main'
test_size = 1000 # Number of converted records kept for the evaluation dataset
# hf_data_split = f'train[-{test_size}:]'

# Model configs
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# LoRA configs
hf_lora_id = 'alxxtexxr/L3.1-8B-wikipedia-id-5K-LoRA-v20250628061607'
lora_dir = hf_lora_id.split('/')[-1] # Local directory named after the repo (without the owner prefix)

# Download the trained LoRA adapter to the local directory
# (`snapshot_download` is imported with the other libraries in the imports cell)
snapshot_download(
    repo_id=hf_lora_id,
    local_dir=lora_dir,
    # ignore_patterns='checkpoint-*/*',
)

print("Hugging Face LoRA ID:", hf_lora_id)

Fetching 427 files:   0%|          | 0/427 [00:00<?, ?it/s]

Hugging Face LoRA ID: alxxtexxr/L3.1-8B-wikipedia-id-5K-LoRA-v20250628061607


In [3]:
# Read the adapter's LoRA configuration from the downloaded local directory and pretty-print it
lora_config = LoraConfig.from_pretrained(lora_dir)
pprint(lora_config)

LoraConfig(task_type='CAUSAL_LM',
           peft_type=<PeftType.LORA: 'LORA'>,
           auto_mapping=None,
           base_model_name_or_path='unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit',
           revision=None,
           inference_mode=True,
           r=8,
           target_modules={'down_proj',
                           'gate_proj',
                           'k_proj',
                           'o_proj',
                           'q_proj',
                           'up_proj',
                           'v_proj'},
           exclude_modules=None,
           lora_alpha=16,
           lora_dropout=0,
           fan_in_fan_out=False,
           bias='none',
           use_rslora=False,
           modules_to_save=None,
           init_lora_weights=True,
           layers_to_transform=None,
           layers_pattern=None,
           rank_pattern={},
           alpha_pattern={},
           megatron_config=None,
           megatron_core='megatron.core',
           trainable_token_

# Data

In [4]:
# Download the Indonesian SQuAD dataset
!mkdir data
!cd data && wget https://raw.githubusercontent.com/Wikidepia/SQuAD-id/refs/heads/master/data/train-SQuAD-id.json

mkdir: cannot create directory ‘data’: File exists
--2025-06-28 18:24:41--  https://raw.githubusercontent.com/Wikidepia/SQuAD-id/refs/heads/master/data/train-SQuAD-id.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 37852227 (36M) [text/plain]
Saving to: ‘train-SQuAD-id.json.14’


2025-06-28 18:24:42 (83.4 MB/s) - ‘train-SQuAD-id.json.14’ saved [37852227/37852227]



In [5]:
# Load and convert data to Huggingface format
# Source: https://github.com/Wikidepia/indonesian_datasets/blob/master/question-answering/squad/convert_huggingface.py

def _squad_to_hf_records(squad):
    """Flatten a SQuAD-style JSON dict into one record per question.

    Args:
        squad: Parsed SQuAD JSON, i.e. a dict whose "data" entry is a list of
            articles, each with a "title" and a list of "paragraphs"; every
            paragraph has a "context" and a list of "qas".

    Returns:
        A list of dicts with keys "id", "title", "context", "question" and
        "answers", where "answers" is {"answer_start": [...], "text": [...]}.
    """
    records = []
    for article in squad["data"]:
        title = article["title"]
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                # Questions flagged as impossible keep their candidates under
                # "plausible_answers"; files without the flag (SQuAD v1 style)
                # are treated as always answerable.
                if qa.get("is_impossible", False):
                    answers = qa.get("plausible_answers", [])
                else:
                    answers = qa["answers"]
                records.append({
                    "id": qa["id"],
                    "title": title,
                    "context": context,
                    "question": qa["question"],
                    "answers": {
                        "answer_start": [answer["answer_start"] for answer in answers],
                        "text": [answer["text"] for answer in answers],
                    },
                })
    return records

# Explicit UTF-8: the default text encoding is platform-dependent, and JSON
# files may contain non-ASCII characters.
with open('data/train-SQuAD-id.json', 'r', encoding='utf-8') as f:
    content = json.load(f)

hf_data = _squad_to_hf_records(content)

In [6]:
# Build the Hugging Face evaluation dataset from the first `test_size` converted records
eval_records = hf_data[:test_size]
data = Dataset.from_list(eval_records)
print(data)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 1000
})


# Base Model

In [7]:
# Load the tokenizer
# It is read from the local adapter directory (`lora_dir`) that the snapshot download populated.
tokenizer = AutoTokenizer.from_pretrained(lora_dir)

In [8]:
# Load the base model
# The checkpoint name comes from the adapter's LoRA config (`base_model_name_or_path`).
# NOTE(review): the warning emitted by this call reports `qa_outputs.*` and
# `transformer.*` weights as "not initialized from the model checkpoint ... newly
# initialized". Confirm the pretrained weights are actually loaded (and that a
# QA head has been trained) before trusting any scores computed with this model.
base_model = LlamaForQuestionAnswering.from_pretrained(lora_config.base_model_name_or_path, device_map='auto')
# base_model = base_model.to(device)
# Switch to inference mode; as the cell's last expression this also displays the module tree.
base_model.eval()

Some weights of LlamaForQuestionAnswering were not initialized from the model checkpoint at unsloth/meta-llama-3.1-8b-unsloth-bnb-4bit and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight', 'transformer.embed_tokens.weight', 'transformer.layers.0.input_layernorm.weight', 'transformer.layers.0.mlp.down_proj.weight', 'transformer.layers.0.mlp.gate_proj.weight', 'transformer.layers.0.mlp.up_proj.weight', 'transformer.layers.0.post_attention_layernorm.weight', 'transformer.layers.0.self_attn.k_proj.weight', 'transformer.layers.0.self_attn.o_proj.weight', 'transformer.layers.0.self_attn.q_proj.weight', 'transformer.layers.0.self_attn.v_proj.weight', 'transformer.layers.1.input_layernorm.weight', 'transformer.layers.1.mlp.down_proj.weight', 'transformer.layers.1.mlp.gate_proj.weight', 'transformer.layers.1.mlp.up_proj.weight', 'transformer.layers.1.post_attention_layernorm.weight', 'transformer.layers.1.self_attn.k_proj.weight', 'transformer.layers.1.self_attn.o_proj.weight', 't

LlamaForQuestionAnswering(
  (transformer): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )


In [None]:
# Wrap the base model and tokenizer in a question-answering pipeline for the evaluator
pipe = pipeline('question-answering', model=base_model, tokenizer=tokenizer)

In [None]:
# Smoke-test the pipeline on a single example before running the full evaluation.
# (Replaces a stray bare name that raised NameError on Restart & Run All.)
sample = data[0]
pipe(question=sample['question'], context=sample['context'])

# Evaluation

In [None]:
# Score the pipeline on the evaluation dataset with the SQuAD metric.
# The bootstrap strategy resamples the predictions to estimate confidence
# intervals; `random_state` pins that resampling to the project seed so the
# reported intervals are reproducible across runs.
task_evaluator = evaluator('question-answering')
eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    metric='squad',
    strategy='bootstrap',
    n_resamples=30,
    random_state=seed,
    squad_v2_format=False,  # Whether the dataset follows the format of squad_v2 dataset,
                            # where a question may have no answer in the context
)

In [None]:
# Display the metrics returned by the evaluator
eval_results