# Libraries

In [None]:
import json
import torch
from datasets import Dataset
from evaluate import evaluator

# Configuration

In [None]:
# Number of converted QA records (taken from the start of the list) used to build the evaluation dataset
test_size = 1000
# Model name handed to the evaluator as `model_or_pipeline`
model = 'unsloth/Meta-Llama-3.1-8B'

# Data

In [None]:
# Download the Indonesian SQuAD dataset into data/
# `mkdir -p` does not fail when data/ already exists, so the cell can be re-run.
# `wget -nc` skips the download when the file is already present instead of
# saving a duplicate copy (train-SQuAD-id.json.1) that is never read.
!mkdir -p data
!wget -nc -P data https://raw.githubusercontent.com/Wikidepia/SQuAD-id/refs/heads/master/data/train-SQuAD-id.json

In [None]:
# Load the raw SQuAD-id JSON and flatten it into one Huggingface-style record per question
# Source: https://github.com/Wikidepia/indonesian_datasets/blob/master/question-answering/squad/convert_huggingface.py

# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default encoding.
with open('data/train-SQuAD-id.json', 'r', encoding='utf-8') as f:
    content = json.load(f)

hf_data = []
# The loop variable is `article` (not `data`) so it does not shadow the name
# used for the dataset later in the notebook.
for article in content["data"]:
    title = article["title"]
    for paragraph in article["paragraphs"]:
        context = paragraph["context"]
        for qa in paragraph["qas"]:
            # Questions flagged as impossible keep their candidate spans under
            # "plausible_answers"; entries without the flag are treated as
            # answerable instead of raising KeyError.
            if qa.get("is_impossible", False):
                answers = qa["plausible_answers"]
            else:
                answers = qa["answers"]
            hf_data.append({
                "id": qa["id"],
                "title": title,
                "context": context,
                "question": qa["question"],
                # Parallel lists: answer_start[i] is the offset recorded for text[i].
                "answers": {
                    "answer_start": [answer["answer_start"] for answer in answers],
                    "text": [answer["text"] for answer in answers],
                },
            })

In [None]:
# Build the Huggingface dataset from the first `test_size` converted records
eval_records = hf_data[:test_size]
data = Dataset.from_list(eval_records)
print(data)

# Evaluation

In [None]:
# Score the configured model on the question-answering task.
task_evaluator = evaluator('question-answering')
eval_results = task_evaluator.compute(
    # What to evaluate, and on which examples
    data=data,
    model_or_pipeline=model,
    # Scoring: SQuAD metric on plain SQuAD-format data. squad_v2_format says
    # whether the dataset follows the squad_v2 layout, where a question may
    # have no answer in the context.
    metric='squad',
    squad_v2_format=False,
    # Evaluation strategy and its resample count
    strategy='bootstrap',
    n_resamples=30,
)