In [19]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import torch
import time
from tqdm import tqdm

import argparse
import json
import base64
from io import BytesIO
from pathlib import Path
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import time

In [22]:
class PM4BenchVQA(Dataset):
    """Multilingual multiple-choice VQA entries read from a JSONL file.

    Each JSONL record is expanded into one entry per requested language that
    has both a question and an option list. The first image of a record is
    kept as its base64 string and only decoded when the entry is accessed.

    Args:
        data_path: Path to the JSONL file. Every non-blank line must be a JSON
            object with ``id``, ``images`` (a sequence whose first element is
            a base64-encoded image), ``questions`` and ``options`` (both
            mappings keyed by language code).
        langs: Language codes to expand each record into.
        max_samples: Maximum number of records (not per-language entries) to
            read. ``None`` or ``0`` reads the whole file.
    """

    def __init__(self, data_path, langs, max_samples=None):
        self.data_path = data_path
        self.langs = langs
        self.samples = self._load_samples(max_samples)

    def _load_samples(self, max_samples):
        """Parse ``self.data_path`` into a flat list of per-language entries.

        Blank lines are ignored and do not count towards ``max_samples``.
        A language is skipped for a record when its question is missing or
        empty, or when the record has no option list for it.
        """
        samples = []
        records_read = 0
        with open(self.data_path, "r", encoding="utf-8") as f:
            for line in f:
                if max_samples and records_read >= max_samples:
                    break
                # Tolerate blank/trailing lines instead of failing in json.loads.
                if not line.strip():
                    continue
                records_read += 1

                sample_info = json.loads(line)
                questions = sample_info["questions"]
                options_by_lang = sample_info.get("options") or {}
                for lang in self.langs:
                    question = questions.get(lang)
                    options = options_by_lang.get(lang)
                    # A question without options cannot be rendered as a
                    # multiple-choice prompt, so skip it rather than raise.
                    if not question or options is None:
                        continue
                    samples.append({
                        "id": sample_info["id"],
                        # Only the first image is used; it stays base64-encoded.
                        "images": sample_info["images"][0],
                        "questions": question,
                        "options": options,
                        "lang": lang,
                    })
        return samples

    def __len__(self):
        """Return the number of per-language entries."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Decode the image and build the lettered prompt for entry ``idx``.

        Returns:
            A dict with ``qid``, ``lang``, ``image`` (RGB ``PIL.Image``) and
            ``text`` (question followed by options labelled ``A.``, ``B.``,
            ...). If decoding or formatting fails, the error is printed and
            ``None`` is returned instead; callers must filter those out, for
            example in a custom ``collate_fn`` when batching with a
            ``DataLoader``.
        """
        sample = self.samples[idx]
        try:
            raw_bytes = base64.b64decode(sample["images"])
            image = Image.open(BytesIO(raw_bytes)).convert("RGB")
            option_lines = [
                f"{chr(65 + i)}. {opt}" for i, opt in enumerate(sample["options"])
            ]
            text = f"{sample['questions']}\n" + "\n".join(option_lines)
        except Exception as e:
            # Best-effort by design: report the broken entry and keep going.
            print(f"Error loading sample {sample['id']}: {e}")
            return None
        return {
            "qid": sample["id"],
            "lang": sample["lang"],
            "image": image,
            "text": text,
        }

In [21]:
# Evaluation inputs: the JSONL file to read and the language codes to expand.
data_path = "/root/personal/datasets/PM4Bench/MDUR/traditional/test1.jsonl"
LANGS = "EN,KO,SR,HU,AR,CS,TH,ZH,RU,VI"
langs = LANGS.split(",")  # comma-separated string -> list of language codes

In [23]:
# Build the dataset over the whole file (no record limit).
dataset = PM4BenchVQA(data_path, langs, max_samples=None)

In [24]:
# NOTE(review): despite the name, this fetches index 1, not 0 — per the
# displayed output it is the KO variant of record 669.
sample0 = dataset[1]

In [25]:
sample0

{'qid': 669,
 'lang': 'KO',
 'image': <PIL.Image.Image image mode=RGB size=276x269>,
 'text': '다음 중 <image 1>에 나타난 전반적인 경향을 가장 잘 설명하는 것은 무엇입니까?\nA. 정치적 불안정으로 인한 인구 감소\nB. 실크로드를 통한 병원체의 확산\nC. 새로운 무역로의 개발\nD.  실크로드에 영향을 미치는 기후 변화\nE. 재정착을 위한 중앙아시아 지역으로의 이주\nF. 운송 기술의 발전\nG. 몽골 부족의 침략\nH. 작물 실패로 인한 대규모 기근\nI. 경제적 번영과 인구 증가\nJ. 실크로드를 따라 발생한 종교적 갈등의 증가'}

In [27]:
# NOTE(review): despite the name, this fetches index 2, not 1 — per the
# displayed output it is the SR variant of record 669.
sample1 = dataset[2]

In [28]:
sample1

{'qid': 669,
 'lang': 'SR',
 'image': <PIL.Image.Image image mode=RGB size=276x269>,
 'text': 'Шта од следећег најбоље објашњава укупан тренд приказан на <image 1>?\nA. Политичка нестабилност која доводи до опадања популације\nB. Ширење патогена дуж Пута свиле\nC. Развој нових трговачких путева\nD. Климатске промене које утичу на Пут свиле\nE. Миграције у области Централне Азије ради пресељења\nF. Технолошки напредак у транспорту\nG. Инвазије од стране монголских племена\nH. Велика глад због неуспеха усева\nI. Економски просперитет и раст популације\nJ. Појава верских сукоба дуж Пута свиле'}

In [33]:
class PM4BenchVQA(Dataset):
    """Multilingual multiple-choice VQA entries, fully materialised in memory.

    Each JSONL record is expanded into one entry per requested language that
    has both a question and an option list. The record's first image is
    decoded once at construction time and the same ``PIL.Image`` object is
    shared by all language variants of that record, so ``__getitem__`` is a
    plain list lookup. Every decoded image stays in memory for the lifetime
    of the dataset.

    Args:
        data_path: Path to the JSONL file. Every non-blank line must be a JSON
            object with ``id``, ``images`` (a sequence whose first element is
            a base64-encoded image), ``questions`` and ``options`` (both
            mappings keyed by language code).
        langs: Language codes to expand each record into.
        max_samples: Maximum number of records (not per-language entries) to
            read. ``None`` or ``0`` reads the whole file.
    """

    def __init__(self, data_path, langs, max_samples=None):
        self.data_path = data_path
        self.langs = langs
        self.samples = self._load_samples(data_path, max_samples)

    def _load_samples(self, data_path, max_samples):
        """Read ``data_path`` and return the flat list of ready-to-use entries.

        Blank lines are ignored and do not count towards ``max_samples``.
        Records for which none of ``self.langs`` has both a question and
        options contribute no entries and their image is never decoded.
        """
        samples = []
        print("Pre-loading sample metadata...")

        records_read = 0
        with open(data_path, "r", encoding="utf-8") as f:
            for line in f:
                if max_samples and records_read >= max_samples:
                    break
                # Tolerate blank/trailing lines instead of failing in json.loads.
                if not line.strip():
                    continue
                records_read += 1

                s = json.loads(line)

                # Build the per-language prompts first so the image is only
                # decoded when at least one entry will actually use it.
                prompts = []
                for lang in self.langs:
                    q = s["questions"].get(lang)
                    opts = s["options"].get(lang)
                    if q is None or opts is None:
                        continue
                    text = q + "\n" + "\n".join(
                        f"{chr(65 + i)}. {o}" for i, o in enumerate(opts)
                    )
                    prompts.append((lang, text))

                if not prompts:
                    continue

                # Only the first image is used as the visual input, so decode
                # just that one (once per record) rather than every image.
                main_img = Image.open(
                    BytesIO(base64.b64decode(s["images"][0]))
                ).convert("RGB")

                for lang, text in prompts:
                    samples.append({
                        "qid": s["id"],
                        "lang": lang,
                        # Same image object for all languages: do not mutate
                        # it in place downstream.
                        "image": main_img,
                        "text": text,
                    })

        return samples

    def __len__(self):
        """Return the number of per-language entries."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the prebuilt entry dict (``qid``, ``lang``, ``image``, ``text``)."""
        return self.samples[idx]


In [34]:
# Evaluation inputs: the JSONL file to read and the language codes to expand.
data_path = "/root/personal/datasets/PM4Bench/MDUR/traditional/test1.jsonl"
LANGS = "EN,KO,SR,HU,AR,CS,TH,ZH,RU,VI"
langs = LANGS.split(",")  # comma-separated string -> list of language codes

In [35]:
# Build the dataset over the whole file (no record limit).
dataset = PM4BenchVQA(data_path, langs, max_samples=None)

Pre-loading sample metadata...


In [38]:
# Spot-check the first flattened entry; per the displayed output it is the EN
# variant of record 669.
dataset[0]

{'qid': 669,
 'lang': 'EN',
 'image': <PIL.Image.Image image mode=RGB size=276x269>,
 'text': 'Which of the following best explains the overall trend shown in the <image 1>?\nA. Political instability leading to population decline\nB. The spread of pathogens across the Silk Road\nC. Development of new trade routes\nD. Climate change affecting the Silk Road\nE. Migrations to areas of Central Asia for resettlement\nF. Technological advancements in transportation\nG. Invasions by Mongol tribes\nH. Large-scale famine due to crop failures\nI. Economic prosperity and population growth\nJ. Rise of religious conflicts along the Silk Road'}