In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules that
# were edited on disk are reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [13]:
from atria_logger import get_logger
from atria_transforms.core import DataTransform
from atria_types import AnnotationType, DocumentInstance, QuestionAnsweringAnnotation

from atria_datasets import load_dataset_config

# Notebook-wide logger, requested from atria_logger under this module's name.
logger = get_logger(__name__)

# Look up the "due_benchmark/DocVQA" dataset configuration and build the
# dataset object from it.
# NOTE(review): max_cache_image_size=1024 presumably bounds the size of cached
# page images - confirm the unit/semantics against atria_datasets.
dataset_config = load_dataset_config("due_benchmark/DocVQA")
dataset = dataset_config.build(max_cache_image_size=1024)
# Log the dataset's repr so its splits and transforms can be checked in the output.
logger.info(f"Loaded dataset:\n{dataset}")



[2025-12-20 15:13:02][atria.atria_datasets.core.dataset._dataset_builders][INFO] Downloaded files {'DocVQA': PosixPath('/mnt/zani/home/ataraxia/.atria/datasets/due_benchmark/pdfs/DocVQA')}
[2025-12-20 15:13:02][atria.__main__][INFO] Loaded dataset:
DueBenchmark(
    data_model=<class 'atria_types._data_instance._document_instance.DocumentInstance'>,
    split_iterators={
        <DatasetSplitType.train: 'train'>: SplitIterator(
            base_iterator=<atria_datasets.registry.vqa.due.SplitIterator object at 0x75447c129f10>,
            input_transform=<function DueBenchmark._input_transform at 0x754571b98c20>,
            output_transform=<atria_datasets.core.dataset._dataset_builders.DefaultOutputTransformer object at 0x75447d84cf10>,
            subset_indices=None,
            num_rows='unknown'
        ),
        <DatasetSplitType.test: 'test'>: SplitIterator(
            base_iterator=<atria_datasets.registry.vqa.due.SplitIterator object at 0x75447c124b50>,
            input_tra

In [14]:
# Get the first sample by pulling a single item from an iterator over the
# train split.
sample = next(iter(dataset.train))

# Log the sample's repr to inspect its fields (annotations, pdf, image, ...).
logger.info(f"First sample in train split:\n{sample}")

[2025-12-20 15:13:04][atria.__main__][INFO] First sample in train split:
DocumentInstance(
    index=0,
    sample_id='xnbl0037_1-7978',
    annotations=[
        QuestionAnsweringAnnotation(
            type='question_answering',
            qa_pairs=[
                QAPair(
                    id=0,
                    question_text='what is the date mentioned in this letter?',
                    answer_spans=[AnswerSpan(start=-1, end=-1, text='1/8/93')]
                ),
                QAPair(
                    id=1,
                    question_text='what is the contact person name mentioned in letter?',
                    answer_spans=[
                        AnswerSpan(start=16, end=17, text='p. carter')
                    ]
                )
            ]
        )
    ],
    page_id=0,
    pdf=PDF(file_path='pdfs/DocVQA/DocVQA/xnbl0037_1.pdf', num_pages=1),
    image=Image(
        file_path=None,
        content=<PIL.PpmImagePlugin.PpmImageFile image mode=RGB size=254

DEFAULT /mnt/zani/home/ataraxia/.atria/datasets/due_benchmark DocumentInstance(
    index=0,
    sample_id='xnbl0037_1-7978',
    annotations=[
        QuestionAnsweringAnnotation(
            type='question_answering',
            qa_pairs=[
                QAPair(
                    id=0,
                    question_text='what is the date mentioned in this letter?',
                    answer_spans=[AnswerSpan(start=-1, end=-1, text='1/8/93')]
                ),
                QAPair(
                    id=1,
                    question_text='what is the contact person name mentioned in letter?',
                    answer_spans=[
                        AnswerSpan(start=16, end=17, text='p. carter')
                    ]
                )
            ]
        )
    ],
    page_id=0,
    pdf=PDF(
        file_path='/mnt/zani/home/ataraxia/.atria/datasets/due_benchmark/pdfs/DocVQA/DocVQA/xnbl0037_1.pdf',
        num_pages=1
    ),
    image=Image(
        file_path=None,
       

In [15]:
import re
from atria_types import QAPair


def _has_located_answer(qa_pair: QAPair) -> bool:
    """Return True if at least one answer span of ``qa_pair`` is located.

    A span counts as located only when neither its ``start`` nor its ``end``
    equals -1.
    """
    return any(
        span.start != -1 and span.end != -1 for span in qa_pair.answer_spans
    )


class UnrollQAPairsTransform(DataTransform[list[DocumentInstance]]):
    """Split a document instance into one document instance per QA pair.

    Each returned instance is a copy of the input whose ``sample_id`` is
    suffixed with ``_qa_<qa_pair.id>`` and whose ``annotations`` list holds a
    single ``QuestionAnsweringAnnotation`` wrapping exactly one QA pair.
    """

    # When True, QA pairs without any located answer span (all spans have
    # start == -1 or end == -1) are dropped instead of being emitted.
    remove_no_answer_samples: bool = False

    def __call__(self, document_instance: DocumentInstance) -> list[DocumentInstance]:
        """Unroll the question-answering annotation of ``document_instance``.

        Args:
            document_instance: Instance whose question-answering annotation
                provides the QA pairs to unroll.

        Returns:
            One copied instance per kept QA pair, in the original QA-pair
            order. The list is empty if every pair was filtered out.

        Raises:
            AssertionError: If a QA pair has ``answer_spans`` set to None.
        """
        qa_annotations = document_instance.get_annotation_by_type(
            annotation_type=AnnotationType.question_answering
        )

        document_instance_per_qa_pair: list[DocumentInstance] = []
        for qa_pair in qa_annotations.qa_pairs:
            assert qa_pair.answer_spans is not None
            if self.remove_no_answer_samples and not _has_located_answer(qa_pair):
                logger.debug(
                    f"Skipping QA Pair with id {qa_pair.id} due to no answer spans."
                )
                continue

            # Build the single-pair annotation only for pairs that are kept.
            annotation = QuestionAnsweringAnnotation(qa_pairs=[qa_pair])
            # The copy's annotation list is replaced entirely, so it carries
            # only this one question-answering annotation.
            document_instance_per_qa_pair.append(
                document_instance.model_copy(
                    update={
                        "sample_id": f"{document_instance.sample_id}_qa_{qa_pair.id}",
                        "annotations": [annotation],
                    }
                )
            )

        return document_instance_per_qa_pair


# Try the transform on the single sample fetched earlier, with filtering of
# QA pairs that have no located answer span switched on.
transform = UnrollQAPairsTransform(remove_no_answer_samples=True)
processed_sample = transform(sample)

# Report every unrolled instance together with the first QA pair it carries.
for position, unrolled_instance in enumerate(processed_sample):
    qa_annotation = unrolled_instance.get_annotation_by_type(
        annotation_type=AnnotationType.question_answering
    )
    summary = "Index: {}, Sample Id: {}, QA Pair: {}".format(
        position, unrolled_instance.sample_id, qa_annotation.qa_pairs[0]
    )
    logger.info(summary)

# Log the complete list returned by the transform.
logger.info(f"First sample after processing:\n{processed_sample}")

[2025-12-20 15:13:07][atria.__main__][INFO] Index: 0, Sample Id: xnbl0037_1-7978_qa_1, QA Pair: QAPair(
    id=1,
    question_text='what is the contact person name mentioned in letter?',
    answer_spans=[AnswerSpan(start=16, end=17, text='p. carter')]
)
[2025-12-20 15:13:07][atria.__main__][INFO] First sample after processing:
[DocumentInstance(
    index=0,
    sample_id='xnbl0037_1-7978_qa_1',
    annotations=[
        QuestionAnsweringAnnotation(
            type='question_answering',
            qa_pairs=[
                QAPair(
                    id=1,
                    question_text='what is the contact person name mentioned in letter?',
                    answer_spans=[
                        AnswerSpan(start=16, end=17, text='p. carter')
                    ]
                )
            ]
        )
    ],
    page_id=0,
    pdf=PDF(file_path='pdfs/DocVQA/DocVQA/xnbl0037_1.pdf', num_pages=1),
    image=Image(
        file_path=None,
        content=<PIL.PpmImagePlugin.

In [17]:
from atria_datasets.core.dataset._dataset_builders import DefaultOutputTransformer
from atria_datasets.core.dataset._dataset_processor import DatasetProcessor

# Apply the unrolling transform to the whole dataset. The transform is created
# with its defaults here, so remove_no_answer_samples is False and QA pairs
# without a located answer span are kept (unlike the single-sample run above).
# NOTE(review): num_processes=0 presumably means "run in the current process"
# - confirm against DatasetProcessor.
# NOTE(review): the captured output of this cell ends in
# "RuntimeError: Error while writing dataset split train to storage"; the root
# cause is not visible in the output and still needs investigating.
processed_dataset = dataset.process_dataset(transform=UnrollQAPairsTransform(), num_processes=0)


[2025-12-20 15:13:27][atria.atria_datasets.core.dataset._dataset_processor][INFO] Caching split [train] to /mnt/zani/home/ataraxia/.atria/datasets/due_benchmark/storage with max_len=None
[2025-12-20 15:13:27][atria.atria_datasets.core.storage.msgpack_storage_manager][INFO] Preprocessing dataset split train to cached msgpack storage /mnt/zani/home/ataraxia/.atria/datasets/due_benchmark/storage/DocVQA-8d69bc02/a1f24409/msgpack/train





[A[A[A



Writing split train: 0it [00:00, ?it/s]
[2025-12-20 15:13:28][atria.atria_datasets.core.storage.msgpack_storage_manager][INFO] Purging dataset split train from storage /mnt/zani/home/ataraxia/.atria/datasets/due_benchmark/storage/DocVQA-8d69bc02/a1f24409/msgpack/train.


DEFAULT /mnt/zani/home/ataraxia/.atria/datasets/due_benchmark DocumentInstance(
    index=0,
    sample_id='xnbl0037_1-4238',
    annotations=[
        QuestionAnsweringAnnotation(
            type='question_answering',
            qa_pairs=[
                QAPair(
                    id=0,
                    question_text='what is the date mentioned in this letter?',
                    answer_spans=[AnswerSpan(start=-1, end=-1, text='1/8/93')]
                ),
                QAPair(
                    id=1,
                    question_text='what is the contact person name mentioned in letter?',
                    answer_spans=[
                        AnswerSpan(start=16, end=17, text='p. carter')
                    ]
                )
            ]
        )
    ],
    page_id=0,
    pdf=PDF(
        file_path='/mnt/zani/home/ataraxia/.atria/datasets/due_benchmark/pdfs/DocVQA/DocVQA/xnbl0037_1.pdf',
        num_pages=1
    ),
    image=Image(
        file_path=None,
       

RuntimeError: Error while writing dataset split train to storage. Cleaning up...