In [None]:

import requests

# Dataset repository and the multiple-choice (MCQ) question file to fetch from it.
repo = "PrismaX/PhysUniBench"
filename = "PhysUnivBench_en_MCQ.json"
# Raw-file URL for that file on the repository's "main" revision.
url = f"https://huggingface.co/datasets/{repo}/raw/main/{filename}"

# Download with a 30 s timeout; raise_for_status() turns HTTP error statuses into exceptions.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.json()  # decoded JSON payload; its len() is what gets reported as the question count
print(f"{filename} -> {len(data)} questions")


PhysUnivBench_en_MCQ.json -> 393 questions


In [None]:
import requests

# Same fetch as the previous cell, but for the "OE" question file
# (presumably the open-ended questions -- confirm against the dataset card).
repo = "PrismaX/PhysUniBench"
filename = "PhysUnivBench_en_OE.json"
# Raw-file URL for that file on the repository's "main" revision.
url = f"https://huggingface.co/datasets/{repo}/raw/main/{filename}"

# Download with a 30 s timeout; raise_for_status() turns HTTP error statuses into exceptions.
r = requests.get(url, timeout=30)
r.raise_for_status()
data = r.json()   # the file's top level is a JSON array, so len(data) is the number of questions
print(f"{filename} -> {len(data)} questions")


PhysUnivBench_en_OE.json -> 629 questions


In [None]:

import argparse
import json
import random
import sys
from io import StringIO
from typing import Any, Dict, List, Optional

# URL pattern for one raw file of a Hugging Face dataset repo; filled in by load_text_from_remote().
HF_RAW_TEMPLATE = "https://huggingface.co/datasets/{repo}/raw/{revision}/{filename}"
# Defaults used by load_text_from_remote() when the caller passes nothing.
REPO = "PrismaX/PhysUniBench"
REVISION = "main"
FILENAME = "PhysUnivBench_en_MCQ.json"  # the file name is spelled "PhysUniv..." (with a 'v'), unlike the repo name

def load_text_from_local(path: str) -> str:
    """Return the entire contents of the file at ``path``, decoded as UTF-8.

    Errors from ``open``/``read`` (missing file, invalid UTF-8, ...) are not
    caught here and propagate to the caller.
    """
    with open(path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents

def load_text_from_remote(repo: str = REPO, filename: str = FILENAME, revision: str = REVISION, timeout: int = 30) -> str:
    """Download one raw file from a Hugging Face dataset repo and return its body as text.

    The URL is built from HF_RAW_TEMPLATE using ``repo``, ``revision`` and
    ``filename``. ``timeout`` is passed straight to ``requests.get``.
    An HTTP error status raises via ``raise_for_status()``.
    """
    # Imported at call time, so defining this function does not itself require requests.
    import requests

    response = requests.get(
        HF_RAW_TEMPLATE.format(repo=repo, filename=filename, revision=revision),
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text

def parse_json_or_jsonl(text: str) -> List[Dict[str, Any]]:
    """Parse ``text`` as one JSON document or as JSON Lines and return a list of records.

    Whole-document handling (tried first):
      * a JSON array is returned as-is;
      * a JSON object yields the list stored under "data", "questions" or
        "items" (first of those holding a list), otherwise its longest
        list-valued field, otherwise the object itself in a one-element list.

    Anything else (invalid JSON, or a bare scalar/null document) is re-read
    line by line: blank and unparsable lines are skipped, an object
    contributes its wrapped list (same three keys) or itself, an array
    contributes its elements, and scalar lines are ignored.
    """
    wrapper_keys = ("data", "questions", "items")

    try:
        document = json.loads(text)
    except json.JSONDecodeError:
        document = None  # not a single JSON document; fall through to line mode

    if isinstance(document, list):
        return document
    if isinstance(document, dict):
        for key in wrapper_keys:
            wrapped = document.get(key)
            if isinstance(wrapped, list):
                return wrapped
        list_values = [value for value in document.values() if isinstance(value, list)]
        if list_values:
            # max() keeps the first of equally long lists, in dict order
            return max(list_values, key=len)
        return [document]

    records = []
    for raw_line in StringIO(text):
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            records.extend(parsed)
        elif isinstance(parsed, dict):
            wrapped = next(
                (parsed[key] for key in wrapper_keys if isinstance(parsed.get(key), list)),
                None,
            )
            if wrapped is None:
                records.append(parsed)
            else:
                records.extend(wrapped)
    return records

def extract_field(item: Dict[str, Any], candidates: List[str]) -> Optional[Any]:
    """Return the value of the first key in ``candidates`` that exists in ``item``.

    Only presence matters: a key mapped to a falsy value (or ``None``) still
    wins over later candidates. Returns ``None`` when no candidate is present.
    """
    return next((item[name] for name in candidates if name in item), None)

def get_question_text(item: Dict[str, Any]) -> str:
    """Return printable question text for ``item``.

    The first present key among "question", "question_text", "stem",
    "content", "prompt", "body", "q" supplies the text:
      * a dict value yields its "text", else its "value", else its JSON dump;
      * any other non-None value is passed through ``str()``.
    If none of those keys gives a value, "title"/"name" and
    "body"/"description" are joined with a newline when both are truthy;
    otherwise the first 200 characters of the item's JSON dump plus "..."
    are returned.
    """
    txt = extract_field(item, ["question", "question_text", "stem", "content", "prompt", "body", "q"])

    if txt is None:
        title = item.get("title") or item.get("name")
        body = item.get("body") or item.get("description")
        if not (title and body):
            preview = json.dumps(item, ensure_ascii=False)
            return preview[:200] + "..."
        return f"{title}\n{body}"

    if not isinstance(txt, dict):
        return str(txt)

    return txt.get("text") or txt.get("value") or json.dumps(txt, ensure_ascii=False)

def _split_labeled_options(text: str) -> List[str]:
    """Split text of the form "A. ...\\nB. ...\\nC. ..." into one string per option.

    Only labels that start a line (optionally indented), are followed by "."
    or ")", and appear in alphabetical order beginning with "A" count as
    option markers. If fewer than two such markers are found, or there is
    non-blank text before the "A" marker, ``text`` is returned unchanged as a
    single-element list.
    """
    import re  # local import: the cell's import block does not bring in re

    expected = ord("A")
    markers = []
    for match in re.finditer(r"(?m)^[ \t]*([A-Z])[.)][ \t]*", text):
        # accept labels strictly in sequence, so a stray "C." line inside
        # option A is not mistaken for the third option
        if ord(match.group(1)) == expected:
            markers.append(match)
            expected += 1
    if len(markers) < 2 or text[:markers[0].start()].strip():
        return [text]

    pieces = []
    for pos, match in enumerate(markers):
        stop = markers[pos + 1].start() if pos + 1 < len(markers) else len(text)
        pieces.append(text[match.end():stop].strip())
    return pieces


def get_choices(item: Dict[str, Any]) -> List[str]:
    """Return the answer choices of ``item`` as a list, in display order.

    The first present key among "choices", "options", "alternatives",
    "answers", "options_list" supplies the choices:
      * a string is split on line-leading "A.", "B.", ... labels (labels are
        removed); a string without such labels becomes a single choice;
      * a dict yields its values ordered by sorted key (``None`` -> "");
      * a list yields one entry per element (strings as-is; dicts via their
        "text"/"choice"/"label" or JSON dump; anything else via ``str()``);
      * any other value becomes ``[str(value)]``.
    When none of the keys is present, values stored directly under keys
    "A".."F" (any case) are returned in sorted-key order, or ``[]``.
    """
    choices = extract_field(item, ["choices", "options", "alternatives", "answers", "options_list"])
    if choices is None:
        letters = [k for k in sorted(item.keys()) if k.upper() in ("A","B","C","D","E","F")]
        return [item[k] for k in letters]
    if isinstance(choices, str):
        # Previously the whole block of text was returned as one choice and
        # printed as "A. A. ... B. ... C. ..."; split it into real options.
        return _split_labeled_options(choices)
    if isinstance(choices, dict):
        return ["" if choices[k] is None else choices[k] for k in sorted(choices)]
    if isinstance(choices, list):
        out = []
        for ch in choices:
            if isinstance(ch, dict):
                out.append(ch.get("text") or ch.get("choice") or ch.get("label") or json.dumps(ch, ensure_ascii=False))
            else:
                out.append(ch if isinstance(ch, str) else str(ch))
        return out
    return [str(choices)]

def find_correct_index(item: Dict[str, Any], choices: List[str]) -> Optional[int]:
    """Return the 0-based position in ``choices`` of the correct answer, or ``None``.

    The raw answer is the value of the first present key among "answer",
    "answer_key", "correct_answer", "correct", "label", "answerKey", "ans".
    See ``_answer_to_index`` for how each answer shape is interpreted.
    """
    ans = extract_field(item, ["answer", "answer_key", "correct_answer", "correct", "label", "answerKey", "ans"])
    return _answer_to_index(ans, choices)


def _answer_to_index(ans: Any, choices: List[str]) -> Optional[int]:
    """Translate one raw answer value into a 0-based index into ``choices``.

    * int: used as a 0-based index when in range, otherwise as a 1-based one;
    * str: a single letter "A".."Z" (any case), then a decimal number
      (0-based if in range, else 1-based), then a case-insensitive match
      against the choice texts;
    * dict: its "label" is resolved recursively, then its integer "index"
      (accepted only when in range).
    Returns ``None`` for anything that cannot be resolved, including bools.
    """
    # bool is a subclass of int; True/False is not a meaningful choice position
    if ans is None or isinstance(ans, bool):
        return None
    if isinstance(ans, int):
        if 0 <= ans < len(choices):
            return ans
        if 1 <= ans <= len(choices):
            return ans - 1
        return None
    if isinstance(ans, str):
        s = ans.strip()
        letter = s.upper()
        # len(letter) guard: upper() of one character can be two characters
        # (e.g. the German sharp s), which would make ord() raise TypeError.
        if len(s) == 1 and len(letter) == 1 and "A" <= letter <= "Z":
            pos = ord(letter) - ord("A")
            if pos < len(choices):
                return pos
        if s.isdecimal():  # isdecimal() guarantees int() can parse the text
            idx = int(s)
            if 0 <= idx < len(choices):
                return idx
            if 1 <= idx <= len(choices):
                return idx - 1
        for i, ch in enumerate(choices):
            if ch and s.lower() == str(ch).strip().lower():
                return i
        return None
    if isinstance(ans, dict):
        if "label" in ans:
            # Resolve the label value directly. The earlier code wrapped it
            # under a key that is never looked up, so it always gave None.
            found = _answer_to_index(ans["label"], choices)
            if found is not None:
                return found
        idx = ans.get("index")
        if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < len(choices):
            return idx
    return None

def print_mcq(item: Dict[str, Any], idx: int, reveal_answer: bool = False) -> None:
    """Print one question, its lettered choices and (optionally) the correct answer.

    ``idx`` is only used as the displayed question number. When no choices
    are found, a placeholder line is printed and nothing else. With
    ``reveal_answer`` the resolved choice is shown; if it cannot be resolved
    the raw answer field is printed, or "(unknown)" when there is none.
    """
    qtext = get_question_text(item)
    choices = get_choices(item)
    correct_idx = find_correct_index(item, choices)

    print(f"Q{idx}. {qtext}")
    if not choices:
        print("   (no choices found)\n")
        return

    # label the choices A, B, C, ... in order
    for code, ch in enumerate(choices, start=ord("A")):
        letter = chr(code)
        print(f"   {letter}. {ch}")

    if not reveal_answer:
        print()
        return

    resolved = correct_idx is not None and 0 <= correct_idx < len(choices)
    if resolved:
        print(f"   >> Correct: {chr(ord('A')+correct_idx)}. {choices[correct_idx]}")
    else:
        raw_ans = extract_field(item, ["answer", "answer_key", "correct_answer", "correct", "label","answerKey","ans"])
        if raw_ans is None:
            print("   >> Correct: (unknown)")
        else:
            print(f"   >> Correct (raw): {raw_ans}")
    print()

def print_some_mcqs(local_path: Optional[str] = None, n: int = 5, first: bool = False, seed: Optional[int] = None, reveal: bool = False):
    """Load the question file and print up to ``n`` questions.

    Args:
        local_path: file to read; when falsy the file is downloaded with
            ``load_text_from_remote()`` and its defaults.
        n: how many questions to print; values below 0 are treated as 0 and
            values above the number of loaded items are capped.
        first: print the first ``n`` items instead of a random sample.
        seed: seed for the random sample. The same seed gives the same
            sample; the process-wide ``random`` state is left untouched.
        reveal: also print the correct answer of each question.
    """
    if local_path:
        text = load_text_from_local(local_path)
    else:
        text = load_text_from_remote()
    items = parse_json_or_jsonl(text)
    if not items:
        print("No items parsed from file.")
        return
    total = len(items)
    # Clamp to [0, total]: a negative n used to crash random.sample() and to
    # silently drop trailing items in --first mode.
    count = max(0, min(n, total))
    print(f"Loaded {total} items. Printing {count} MCQs.\n")
    if first:
        chosen = range(count)
    else:
        # A private generator gives the same sample as random.seed(seed) would,
        # without reseeding the module-level RNG shared by the whole kernel.
        rng = random.Random(seed) if seed is not None else random
        chosen = rng.sample(range(total), count)
    for i, idx in enumerate(chosen, start=1):
        print_mcq(items[idx], i, reveal_answer=reveal)

def main(argv=None):
    """Parse command-line style options and print a selection of questions.

    With ``argv=None`` the process arguments are parsed leniently
    (unrecognised arguments are ignored rather than rejected); an explicit
    ``argv`` list is parsed strictly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--local", "-l", help="Path to local JSON / JSONL file (if omitted, fetches from HuggingFace raw URL)")
    parser.add_argument("--n", "-n", type=int, default=5, help="Number of MCQs to print")
    parser.add_argument("--first", action="store_true", help="Print the first N items instead of random sample")
    parser.add_argument("--seed", type=int, default=None, help="Random seed for sampling")
    parser.add_argument("--reveal", action="store_true", help="Reveal correct answers")

    if argv is not None:
        args = parser.parse_args(argv)
    else:
        # parse_known_args() returns unrecognised arguments instead of exiting on them
        args, _ignored = parser.parse_known_args()

    print_some_mcqs(
        local_path=args.local,
        n=args.n,
        first=args.first,
        seed=args.seed,
        reveal=args.reveal,
    )

if __name__ == "__main__":
    main()


Loaded 393 items. Printing 5 MCQs.

Q1. An air - filled toroidal solenoid has a mean radius $r = 14.5\ cm=0.145\ m$ and a cross - sectional area $A = 5.05\ cm^{2}=5.05\times10^{- 4}\ m^{2}$. The current flowing through it is $I = 11.7\ A$, and it is desired that the energy stored within the solenoid be at least $U = 0.385\ J$.
### Part A
What is the least number of turns that the winding must have? Express your answer numerically, as a whole number, to three significant figures.
View Available Hint(s)
Submit
turns
<image>
   A. A. N = 2353

B. N = 159

C. N = 2830

D. N = 130.32

Q2. Calculate the magnitude and direction of the Coulomb force on each of the three charges shown in the figure below.
1.50μC charge magnitude
1.50μC charge direction
-2.00μC charge magnitude
-2.00μC charge direction
6.00 μC charge magnitude  N
6.00 μC charge direction  ---Select---
1.50 μC charge magnitude  N
1.50 μC charge direction  ---Select---
-2.00 μC charge magnitude  N
-2.00 μC charge direction  ---Sel

In [None]:

import argparse
import json
import random
import sys
from io import StringIO
from typing import Any, Dict, List, Optional

# URL pattern for one raw file of a Hugging Face dataset repo; filled in by load_text_from_remote().
HF_RAW_TEMPLATE = "https://huggingface.co/datasets/{repo}/raw/{revision}/{filename}"
# Defaults used by load_text_from_remote() when the caller passes nothing.
REPO = "PrismaX/PhysUniBench"
REVISION = "main"
FILENAME = "PhysUnivBench_en_OE.json"  # the file name is spelled "PhysUniv..." (with a 'v'), unlike the repo name

def load_text_from_local(path: str) -> str:
    """Return the entire contents of the file at ``path``, decoded as UTF-8.

    Errors from ``open``/``read`` (missing file, invalid UTF-8, ...) are not
    caught here and propagate to the caller.
    """
    with open(path, mode="r", encoding="utf-8") as source:
        contents = source.read()
    return contents

def load_text_from_remote(repo: str = REPO, filename: str = FILENAME, revision: str = REVISION, timeout: int = 30) -> str:
    """Download one raw file from a Hugging Face dataset repo and return its body as text.

    The URL is built from HF_RAW_TEMPLATE using ``repo``, ``revision`` and
    ``filename``. ``timeout`` is passed straight to ``requests.get``.
    An HTTP error status raises via ``raise_for_status()``.
    """
    # Imported at call time, so defining this function does not itself require requests.
    import requests

    response = requests.get(
        HF_RAW_TEMPLATE.format(repo=repo, filename=filename, revision=revision),
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text

def parse_json_or_jsonl(text: str) -> List[Dict[str, Any]]:
    """Parse ``text`` as one JSON document or as JSON Lines and return a list of records.

    Whole-document handling (tried first):
      * a JSON array is returned as-is;
      * a JSON object yields the list stored under "data", "questions" or
        "items" (first of those holding a list), otherwise its longest
        list-valued field, otherwise the object itself in a one-element list.

    Anything else (invalid JSON, or a bare scalar/null document) is re-read
    line by line: blank and unparsable lines are skipped, an object
    contributes its wrapped list (same three keys) or itself, an array
    contributes its elements, and scalar lines are ignored.
    """
    wrapper_keys = ("data", "questions", "items")

    try:
        document = json.loads(text)
    except json.JSONDecodeError:
        document = None  # not a single JSON document; fall through to line mode

    if isinstance(document, list):
        return document
    if isinstance(document, dict):
        for key in wrapper_keys:
            wrapped = document.get(key)
            if isinstance(wrapped, list):
                return wrapped
        list_values = [value for value in document.values() if isinstance(value, list)]
        if list_values:
            # max() keeps the first of equally long lists, in dict order
            return max(list_values, key=len)
        return [document]

    records = []
    for raw_line in StringIO(text):
        stripped = raw_line.strip()
        if not stripped:
            continue
        try:
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, list):
            records.extend(parsed)
        elif isinstance(parsed, dict):
            wrapped = next(
                (parsed[key] for key in wrapper_keys if isinstance(parsed.get(key), list)),
                None,
            )
            if wrapped is None:
                records.append(parsed)
            else:
                records.extend(wrapped)
    return records

def extract_field(item: Dict[str, Any], candidates: List[str]) -> Optional[Any]:
    """Return the value of the first key in ``candidates`` that exists in ``item``.

    Only presence matters: a key mapped to a falsy value (or ``None``) still
    wins over later candidates. Returns ``None`` when no candidate is present.
    """
    return next((item[name] for name in candidates if name in item), None)

def get_question_text(item: Dict[str, Any]) -> str:
    """Return printable question text for ``item``.

    The first present key among "question", "question_text", "stem",
    "content", "prompt", "body", "q" supplies the text:
      * a dict value yields its "text", else its "value", else its JSON dump;
      * any other non-None value is passed through ``str()``.
    If none of those keys gives a value, "title"/"name" and
    "body"/"description" are joined with a newline when both are truthy;
    otherwise the first 200 characters of the item's JSON dump plus "..."
    are returned.
    """
    txt = extract_field(item, ["question", "question_text", "stem", "content", "prompt", "body", "q"])

    if txt is None:
        title = item.get("title") or item.get("name")
        body = item.get("body") or item.get("description")
        if not (title and body):
            preview = json.dumps(item, ensure_ascii=False)
            return preview[:200] + "..."
        return f"{title}\n{body}"

    if not isinstance(txt, dict):
        return str(txt)

    return txt.get("text") or txt.get("value") or json.dumps(txt, ensure_ascii=False)

def _split_labeled_options(text: str) -> List[str]:
    """Split text of the form "A. ...\\nB. ...\\nC. ..." into one string per option.

    Only labels that start a line (optionally indented), are followed by "."
    or ")", and appear in alphabetical order beginning with "A" count as
    option markers. If fewer than two such markers are found, or there is
    non-blank text before the "A" marker, ``text`` is returned unchanged as a
    single-element list.
    """
    import re  # local import: the cell's import block does not bring in re

    expected = ord("A")
    markers = []
    for match in re.finditer(r"(?m)^[ \t]*([A-Z])[.)][ \t]*", text):
        # accept labels strictly in sequence, so a stray "C." line inside
        # option A is not mistaken for the third option
        if ord(match.group(1)) == expected:
            markers.append(match)
            expected += 1
    if len(markers) < 2 or text[:markers[0].start()].strip():
        return [text]

    pieces = []
    for pos, match in enumerate(markers):
        stop = markers[pos + 1].start() if pos + 1 < len(markers) else len(text)
        pieces.append(text[match.end():stop].strip())
    return pieces


def get_choices(item: Dict[str, Any]) -> List[str]:
    """Return the answer choices of ``item`` as a list, in display order.

    The first present key among "choices", "options", "alternatives",
    "answers", "options_list" supplies the choices:
      * a string is split on line-leading "A.", "B.", ... labels (labels are
        removed); a string without such labels becomes a single choice;
      * a dict yields its values ordered by sorted key (``None`` -> "");
      * a list yields one entry per element (strings as-is; dicts via their
        "text"/"choice"/"label" or JSON dump; anything else via ``str()``);
      * any other value becomes ``[str(value)]``.
    When none of the keys is present, values stored directly under keys
    "A".."F" (any case) are returned in sorted-key order, or ``[]``.
    """
    choices = extract_field(item, ["choices", "options", "alternatives", "answers", "options_list"])
    if choices is None:
        letters = [k for k in sorted(item.keys()) if k.upper() in ("A","B","C","D","E","F")]
        return [item[k] for k in letters]
    if isinstance(choices, str):
        # Previously the whole block of text was returned as one choice;
        # split it into real options instead.
        return _split_labeled_options(choices)
    if isinstance(choices, dict):
        return ["" if choices[k] is None else choices[k] for k in sorted(choices)]
    if isinstance(choices, list):
        out = []
        for ch in choices:
            if isinstance(ch, dict):
                out.append(ch.get("text") or ch.get("choice") or ch.get("label") or json.dumps(ch, ensure_ascii=False))
            else:
                out.append(ch if isinstance(ch, str) else str(ch))
        return out
    return [str(choices)]

def find_correct_index(item: Dict[str, Any], choices: List[str]) -> Optional[int]:
    """Return the 0-based position in ``choices`` of the correct answer, or ``None``.

    The raw answer is the value of the first present key among "answer",
    "answer_key", "correct_answer", "correct", "label", "answerKey", "ans".
    See ``_answer_to_index`` for how each answer shape is interpreted.
    """
    ans = extract_field(item, ["answer", "answer_key", "correct_answer", "correct", "label", "answerKey", "ans"])
    return _answer_to_index(ans, choices)


def _answer_to_index(ans: Any, choices: List[str]) -> Optional[int]:
    """Translate one raw answer value into a 0-based index into ``choices``.

    * int: used as a 0-based index when in range, otherwise as a 1-based one;
    * str: a single letter "A".."Z" (any case), then a decimal number
      (0-based if in range, else 1-based), then a case-insensitive match
      against the choice texts;
    * dict: its "label" is resolved recursively, then its integer "index"
      (accepted only when in range).
    Returns ``None`` for anything that cannot be resolved, including bools.
    """
    # bool is a subclass of int; True/False is not a meaningful choice position
    if ans is None or isinstance(ans, bool):
        return None
    if isinstance(ans, int):
        if 0 <= ans < len(choices):
            return ans
        if 1 <= ans <= len(choices):
            return ans - 1
        return None
    if isinstance(ans, str):
        s = ans.strip()
        letter = s.upper()
        # len(letter) guard: upper() of one character can be two characters
        # (e.g. the German sharp s), which would make ord() raise TypeError.
        if len(s) == 1 and len(letter) == 1 and "A" <= letter <= "Z":
            pos = ord(letter) - ord("A")
            if pos < len(choices):
                return pos
        if s.isdecimal():  # isdecimal() guarantees int() can parse the text
            idx = int(s)
            if 0 <= idx < len(choices):
                return idx
            if 1 <= idx <= len(choices):
                return idx - 1
        for i, ch in enumerate(choices):
            if ch and s.lower() == str(ch).strip().lower():
                return i
        return None
    if isinstance(ans, dict):
        if "label" in ans:
            # Resolve the label value directly. The earlier code wrapped it
            # under a key that is never looked up, so it always gave None.
            found = _answer_to_index(ans["label"], choices)
            if found is not None:
                return found
        idx = ans.get("index")
        if isinstance(idx, int) and not isinstance(idx, bool) and 0 <= idx < len(choices):
            return idx
    return None

def print_mcq(item: Dict[str, Any], idx: int, reveal_answer: bool = False) -> None:
    """Print one question, its lettered choices and (optionally) the correct answer.

    ``idx`` is only used as the displayed question number. When no choices
    are found, a placeholder line is printed and nothing else. With
    ``reveal_answer`` the resolved choice is shown; if it cannot be resolved
    the raw answer field is printed, or "(unknown)" when there is none.
    """
    qtext = get_question_text(item)
    choices = get_choices(item)
    correct_idx = find_correct_index(item, choices)

    print(f"Q{idx}. {qtext}")
    if not choices:
        print("   (no choices found)\n")
        return

    # label the choices A, B, C, ... in order
    for code, ch in enumerate(choices, start=ord("A")):
        letter = chr(code)
        print(f"   {letter}. {ch}")

    if not reveal_answer:
        print()
        return

    resolved = correct_idx is not None and 0 <= correct_idx < len(choices)
    if resolved:
        print(f"   >> Correct: {chr(ord('A')+correct_idx)}. {choices[correct_idx]}")
    else:
        raw_ans = extract_field(item, ["answer", "answer_key", "correct_answer", "correct", "label","answerKey","ans"])
        if raw_ans is None:
            print("   >> Correct: (unknown)")
        else:
            print(f"   >> Correct (raw): {raw_ans}")
    print()

def print_some_mcqs(local_path: Optional[str] = None, n: int = 5, first: bool = False, seed: Optional[int] = None, reveal: bool = False):
    """Load the question file and print up to ``n`` questions.

    Args:
        local_path: file to read; when falsy the file is downloaded with
            ``load_text_from_remote()`` and its defaults.
        n: how many questions to print; values below 0 are treated as 0 and
            values above the number of loaded items are capped.
        first: print the first ``n`` items instead of a random sample.
        seed: seed for the random sample. The same seed gives the same
            sample; the process-wide ``random`` state is left untouched.
        reveal: also print the correct answer of each question.
    """
    if local_path:
        text = load_text_from_local(local_path)
    else:
        text = load_text_from_remote()
    items = parse_json_or_jsonl(text)
    if not items:
        print("No items parsed from file.")
        return
    total = len(items)
    # Clamp to [0, total]: a negative n used to crash random.sample() and to
    # silently drop trailing items in --first mode.
    count = max(0, min(n, total))
    print(f"Loaded {total} items. Printing {count} MCQs.\n")
    if first:
        chosen = range(count)
    else:
        # A private generator gives the same sample as random.seed(seed) would,
        # without reseeding the module-level RNG shared by the whole kernel.
        rng = random.Random(seed) if seed is not None else random
        chosen = rng.sample(range(total), count)
    for i, idx in enumerate(chosen, start=1):
        print_mcq(items[idx], i, reveal_answer=reveal)

def main(argv=None):
    """Parse command-line style options and print a selection of questions.

    With ``argv=None`` the process arguments are parsed leniently
    (unrecognised arguments are ignored rather than rejected); an explicit
    ``argv`` list is parsed strictly.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--local", "-l", help="Path to local JSON / JSONL file (if omitted, fetches from HuggingFace raw URL)")
    parser.add_argument("--n", "-n", type=int, default=5, help="Number of MCQs to print")
    parser.add_argument("--first", action="store_true", help="Print the first N items instead of random sample")
    parser.add_argument("--seed", type=int, default=None, help="Random seed for sampling")
    parser.add_argument("--reveal", action="store_true", help="Reveal correct answers")

    if argv is not None:
        args = parser.parse_args(argv)
    else:
        # Lenient parse: arguments injected by Jupyter/Colab (like -f) are
        # returned as "unknown" and ignored instead of aborting.
        args, _ignored = parser.parse_known_args()

    print_some_mcqs(
        local_path=args.local,
        n=args.n,
        first=args.first,
        seed=args.seed,
        reveal=args.reveal,
    )

if __name__ == "__main__":
    main()


Loaded 629 items. Printing 5 MCQs.

Q1. how to determine the x - component and y - component of vector $\vec{A}$, denoted $A_x$ and $A_y$.
What is the magnitude of the component vector $\vec{A}_x$ shown in (Figure 1)? Express your answer in meters to one significant figure.
Previous Answers: Incorrect; Try Again
Part B
What is the sign of the y - component $A_y$ of vector $\vec{A}$ shown in (Figure 1)?
positive
negative
Previous Answers: Correct
<image>
   (no choices found)

Q2. VP 21.4.1
Charge $q_1 = + 5.60nC$ is on the $x$-axis at $x = 0$ and charge $q_2=-2.00nC$ is on the $x$-axis at $x = 6.00cm$. Find the $x$-component of total electric force exerted by $q_1$ and $q_2$ on a third charge $q_3=-1.50nC$ on the $x$-axis at $x = 4.00cm$.
Express your answer in newtons.
$F_x=$
Incorrect; Try Again; 5 attempts remaining
Charge $q_1 = +5.60\ nC$ is on the $x$-axis at $x = 0$ and charge $q_2=-2.00\ nC$ is on the $x$-axis at $x = 6.00\ cm$. Find the $x$-component of total electric force ex

In [None]:


import requests
import json
import re
import io
import os
import zipfile
from PIL import Image

try:
    from tqdm import tqdm
except Exception:
    # tqdm only provides the progress bar. Without it, use a pass-through that
    # hands back the iterable unchanged and ignores any keyword options.
    def tqdm(x, **k):
        return x

# ==== CONFIG ====
# Source of the MCQ records: one raw JSON file of the dataset repo.
RAW_JSON_URL = "https://huggingface.co/datasets/PrismaX/PhysUniBench/raw/main/PhysUnivBench_en_MCQ.json"
# Prefix to which a row's image file name is appended when downloading.
# NOTE(review): a later cell of this notebook uses ".../resolve/main/images/"
# for the same purpose -- confirm that the "/raw/" form returns the image bytes.
IMAGE_BASE_URL = "https://huggingface.co/datasets/PrismaX/PhysUniBench/raw/main/images/"

# Output file names (relative paths) for the converted questions and the image archive.
OUTPUT_JSON = "phys_unibench_mcqs.json"
OUTPUT_IMAGES_ZIP = "phys_unibench_images.zip"
# Id given to the first exported question; each following row gets the next integer.
START_ID = 174
# =================

def safe_get_json(url):
    """GET ``url`` with a 60 s timeout and return the decoded JSON body.

    An HTTP error status raises via ``raise_for_status()``; request and
    JSON-decoding errors are not caught here.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    payload = response.json()
    return payload

def parse_options(options_text):
    """
    Best-effort parse of an options value into {'a':..., 'b':..., 'c':..., 'd':...}.

    Accepted inputs:
      * a string such as "A. ... B. ... C. ... D. ..." or the same with one
        option per line (upper-case labels are tried first, then lower-case);
      * a list/tuple of option strings (joined with newlines, then parsed as
        a string);
      * a dict keyed by label (keys are matched case-insensitively).
    Missing options map to "". A falsy input yields four empty strings.
    A string without any recognisable label is split on non-blank lines and
    the first four lines become a, b, c, d.

    Labels at the start of a line are preferred whenever every label found
    in the text also occurs at a line start; this keeps a unit like "5 C."
    inside option A from being mistaken for the "C." label.
    """
    keys = ['a', 'b', 'c', 'd']
    if not options_text:
        return {k: "" for k in keys}

    if isinstance(options_text, dict):
        by_label = {str(k).strip().lower(): v for k, v in options_text.items()}
        return {k: "" if by_label.get(k) is None else str(by_label[k]).strip() for k in keys}
    if isinstance(options_text, (list, tuple)):
        options_text = "\n".join("" if v is None else str(v) for v in options_text)
    s = str(options_text).strip()

    def locate(letters, line_start_only):
        """Return [(key, index_of_label)] for the first hit of each letter followed by '.'."""
        prefix = r'(?m)^[ \t]*' if line_start_only else r'(?<!\w)'
        hits = []
        for letter in letters:
            m = re.search(prefix + '(' + re.escape(letter) + r')\.', s)
            if m:
                hits.append((letter.lower(), m.start(1)))
        return hits

    positions = []
    # capital labels are expected; lower-case ones are only tried if none are found
    for letters in (['A', 'B', 'C', 'D'], keys):
        anchored = locate(letters, True)
        anywhere = locate(letters, False)
        # Every line-start hit is also an "anywhere" hit, so equal counts mean
        # all labels present are available at line starts.
        positions = anchored if len(anchored) >= len(anywhere) else anywhere
        if positions:
            break

    # If still none, fallback: split by newlines and pick first 4 lines
    if not positions:
        parts = [line.strip() for line in s.splitlines() if line.strip()]
        return {k: (parts[i] if i < len(parts) else "") for i, k in enumerate(keys)}

    # each option runs from just after its label to the start of the next label
    positions.sort(key=lambda hit: hit[1])
    out = {}
    for idx, (key, pos) in enumerate(positions):
        start = pos + 2  # skip e.g. "A."
        end = positions[idx+1][1] if idx+1 < len(positions) else len(s)
        piece = s[start:end].strip()
        # strip leading punctuation/colons/hyphens
        piece = re.sub(r'^[\s:–—\-)]*', '', piece)
        out[key] = piece

    # ensure keys exist
    for k in keys:
        out.setdefault(k, "")

    return out

def download_and_convert_image(image_filename, target_id):
    """
    Download IMAGE_BASE_URL + image_filename (e.g. '1.jpg') and return it re-encoded as PNG bytes.

    If the first request does not answer with status 200, the same name with
    a ".png" extension is tried once. Returns None when image_filename is
    falsy, when neither request returns 200, or when any exception occurs
    while downloading or converting. ``target_id`` is accepted but not used.
    """
    if not image_filename:
        return None
    primary_url = IMAGE_BASE_URL + image_filename
    try:
        response = requests.get(primary_url, timeout=60)
        if response.status_code != 200:
            # best-effort retry under the .png spelling of the same name
            png_name = os.path.splitext(image_filename)[0] + ".png"
            response = requests.get(IMAGE_BASE_URL + png_name, timeout=60)
        if response.status_code != 200:
            return None
        # decode with PIL, normalise to RGB, and re-encode as PNG in memory
        picture = Image.open(io.BytesIO(response.content)).convert("RGB")
        png_buffer = io.BytesIO()
        picture.save(png_buffer, format="PNG")
        return png_buffer.getvalue()
    except Exception:
        # deliberate best effort: any failure is reported to the caller as None
        return None

def main():
    """Convert the MCQ dataset into OUTPUT_JSON plus a zip of PNG images.

    Steps:
      1. download the records from RAW_JSON_URL;
      2. for each record, numbered consecutively from START_ID, parse its
         options, download and convert its image (if any) and store it in
         OUTPUT_IMAGES_ZIP as images/image{id}.png;
      3. write all converted records to OUTPUT_JSON.

    A record whose image is missing or cannot be downloaded still gets the
    images/image{id}.png path in the JSON; only a warning is printed.
    """
    print("Downloading JSON from dataset...")
    data = safe_get_json(RAW_JSON_URL)
    print(f"Loaded {len(data)} entries from the MCQ file.")

    output_list = []
    # The with-block closes (and thereby finalises) the archive even if a row raises.
    with zipfile.ZipFile(OUTPUT_IMAGES_ZIP, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        # ids are assigned sequentially from START_ID; they are not taken from the dataset
        for qid, row in enumerate(tqdm(data), start=START_ID):
            # question text (tolerates a missing, null or non-string value)
            text = str(row.get("question") or "").strip()

            # parse options from the dataset's 'options' field (often a printable string)
            options_parsed = parse_options(row.get("options", "") or "")

            # map correct answer (dataset uses 'answer' like "A"); null becomes ""
            correct = row.get("answer", "")
            correct = "" if correct is None else str(correct).strip()

            # image file in dataset (e.g. "1.jpg" or "0.jpg")
            src_image_name = str(row.get("image") or "").strip()

            # target image path inside JSON output (and inside zip)
            image_field_path = f"images/image{qid}.png"

            # attempt to download and convert to PNG bytes
            png_bytes = None
            if src_image_name:
                png_bytes = download_and_convert_image(src_image_name, qid)
            if png_bytes:
                # write into zip under images/image{qid}.png
                zf.writestr(image_field_path, png_bytes)
            else:
                # If missing, we still include the JSON path but no image is placed in zip.
                # Option: you can choose to set image_field_path="" instead.
                print(f"Warning: image for id={qid} ({src_image_name}) not downloaded.")

            # build output object
            out_obj = {
                "id": qid,
                "image": image_field_path,
                "text": text,
                "options": {
                    "a": options_parsed.get("a", ""),
                    "b": options_parsed.get("b", ""),
                    "c": options_parsed.get("c", ""),
                    "d": options_parsed.get("d", "")
                },
                "type": "MCQs with One Correct Answer",
                "correct_answer": correct
            }
            output_list.append(out_obj)

    # save output JSON
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(output_list, f, ensure_ascii=False, indent=2)

    print(f"Done. {len(output_list)} MCQs written to {OUTPUT_JSON}")
    print(f"Images written (when available) into {OUTPUT_IMAGES_ZIP}")

if __name__ == "__main__":
    main()


Downloading JSON from dataset...
Loaded 393 entries from the MCQ file.


  1%|          | 2/393 [00:00<00:31, 12.27it/s]



  1%|          | 4/393 [00:00<00:31, 12.31it/s]



  2%|▏         | 6/393 [00:00<00:32, 11.96it/s]



  2%|▏         | 8/393 [00:00<00:31, 12.18it/s]



  3%|▎         | 10/393 [00:00<00:34, 11.10it/s]



  4%|▎         | 14/393 [00:01<00:31, 11.94it/s]



  4%|▍         | 16/393 [00:01<00:31, 12.15it/s]



  5%|▌         | 20/393 [00:01<00:32, 11.53it/s]



  6%|▌         | 22/393 [00:01<00:32, 11.53it/s]



  7%|▋         | 26/393 [00:02<00:32, 11.37it/s]



  7%|▋         | 28/393 [00:02<00:31, 11.64it/s]



  8%|▊         | 32/393 [00:02<00:31, 11.36it/s]



  9%|▊         | 34/393 [00:02<00:30, 11.75it/s]



 10%|▉         | 38/393 [00:03<00:30, 11.68it/s]



 10%|█         | 40/393 [00:03<00:32, 10.81it/s]



 11%|█         | 42/393 [00:03<00:32, 10.73it/s]



 12%|█▏        | 46/393 [00:04<00:32, 10.81it/s]



 12%|█▏        | 48/393 [00:04<00:31, 11.08it/s]



 13%|█▎        | 52/393 [00:04<00:30, 11.26it/s]



 14%|█▎        | 54/393 [00:04<00:29, 11.51it/s]



 14%|█▍        | 56/393 [00:04<00:31, 10.75it/s]



 15%|█▌        | 60/393 [00:05<00:29, 11.13it/s]



 16%|█▌        | 62/393 [00:05<00:30, 10.71it/s]



 17%|█▋        | 66/393 [00:05<00:28, 11.62it/s]



 17%|█▋        | 68/393 [00:05<00:27, 11.95it/s]



 18%|█▊        | 72/393 [00:06<00:26, 11.97it/s]



 19%|█▉        | 74/393 [00:06<00:26, 12.13it/s]



 20%|█▉        | 78/393 [00:06<00:25, 12.34it/s]



 20%|██        | 80/393 [00:06<00:25, 12.28it/s]



 21%|██▏       | 84/393 [00:07<00:24, 12.38it/s]



 22%|██▏       | 86/393 [00:07<00:28, 10.82it/s]



 23%|██▎       | 90/393 [00:07<00:26, 11.56it/s]



 23%|██▎       | 92/393 [00:07<00:25, 11.82it/s]



 24%|██▍       | 96/393 [00:08<00:26, 11.32it/s]



 25%|██▍       | 98/393 [00:08<00:25, 11.46it/s]



 25%|██▌       | 100/393 [00:08<00:24, 11.74it/s]



 26%|██▋       | 104/393 [00:09<00:25, 11.28it/s]



 27%|██▋       | 106/393 [00:09<00:24, 11.63it/s]



 28%|██▊       | 110/393 [00:09<00:23, 12.11it/s]



 28%|██▊       | 112/393 [00:09<00:23, 12.06it/s]



 30%|██▉       | 116/393 [00:10<00:22, 12.27it/s]



 30%|███       | 118/393 [00:10<00:22, 12.29it/s]



 31%|███       | 122/393 [00:10<00:21, 12.38it/s]



 32%|███▏      | 124/393 [00:10<00:24, 10.82it/s]



 32%|███▏      | 126/393 [00:10<00:24, 10.88it/s]



 33%|███▎      | 128/393 [00:11<00:26, 10.19it/s]



 33%|███▎      | 131/393 [00:11<00:41,  6.36it/s]



 34%|███▎      | 132/393 [00:12<00:43,  5.94it/s]



 34%|███▍      | 133/393 [00:12<01:07,  3.87it/s]



 35%|███▍      | 137/393 [00:13<00:41,  6.16it/s]



 35%|███▌      | 139/393 [00:13<00:33,  7.48it/s]



 36%|███▋      | 143/393 [00:13<00:26,  9.27it/s]



 37%|███▋      | 145/393 [00:13<00:24, 10.04it/s]



 38%|███▊      | 149/393 [00:14<00:21, 11.25it/s]



 38%|███▊      | 151/393 [00:14<00:22, 10.76it/s]



 39%|███▉      | 155/393 [00:14<00:20, 11.47it/s]



 40%|███▉      | 157/393 [00:14<00:20, 11.77it/s]



 41%|████      | 161/393 [00:15<00:19, 11.93it/s]



 41%|████▏     | 163/393 [00:15<00:19, 12.07it/s]



 42%|████▏     | 167/393 [00:15<00:18, 11.99it/s]



 43%|████▎     | 169/393 [00:15<00:18, 11.89it/s]



 44%|████▍     | 173/393 [00:16<00:18, 11.99it/s]



 45%|████▍     | 175/393 [00:16<00:18, 11.54it/s]



 46%|████▌     | 179/393 [00:16<00:17, 11.93it/s]



 46%|████▌     | 181/393 [00:16<00:17, 12.04it/s]



 47%|████▋     | 183/393 [00:17<00:20, 10.04it/s]



 48%|████▊     | 187/393 [00:17<00:18, 11.15it/s]



 48%|████▊     | 189/393 [00:17<00:18, 11.15it/s]



 49%|████▉     | 193/393 [00:17<00:17, 11.54it/s]



 50%|████▉     | 195/393 [00:18<00:16, 11.83it/s]



 51%|█████     | 199/393 [00:18<00:15, 12.17it/s]



 51%|█████     | 201/393 [00:18<00:15, 12.36it/s]



 52%|█████▏    | 205/393 [00:18<00:16, 11.73it/s]



 53%|█████▎    | 207/393 [00:19<00:17, 10.91it/s]



 53%|█████▎    | 209/393 [00:19<00:18, 10.21it/s]



 54%|█████▍    | 213/393 [00:19<00:16, 11.09it/s]



 55%|█████▍    | 215/393 [00:19<00:17, 10.46it/s]



 56%|█████▌    | 219/393 [00:20<00:15, 11.35it/s]



 56%|█████▌    | 221/393 [00:20<00:14, 11.64it/s]



 57%|█████▋    | 225/393 [00:20<00:14, 11.82it/s]



 58%|█████▊    | 227/393 [00:20<00:13, 11.99it/s]



 59%|█████▉    | 231/393 [00:21<00:13, 12.19it/s]



 59%|█████▉    | 233/393 [00:21<00:13, 12.31it/s]



 60%|██████    | 237/393 [00:21<00:12, 12.22it/s]



 61%|██████    | 239/393 [00:21<00:12, 12.36it/s]



 62%|██████▏   | 243/393 [00:22<00:11, 12.52it/s]



 62%|██████▏   | 245/393 [00:22<00:11, 12.57it/s]



 63%|██████▎   | 247/393 [00:22<00:11, 12.68it/s]



 64%|██████▍   | 251/393 [00:22<00:12, 11.68it/s]



 64%|██████▍   | 253/393 [00:23<00:12, 11.14it/s]



 65%|██████▍   | 255/393 [00:23<00:11, 11.51it/s]



 66%|██████▌   | 259/393 [00:23<00:11, 11.42it/s]



 66%|██████▋   | 261/393 [00:23<00:11, 11.73it/s]



 67%|██████▋   | 263/393 [00:23<00:10, 11.96it/s]



 68%|██████▊   | 267/393 [00:24<00:10, 11.64it/s]



 68%|██████▊   | 269/393 [00:24<00:10, 11.88it/s]



 69%|██████▉   | 273/393 [00:24<00:10, 11.20it/s]



 70%|██████▉   | 275/393 [00:24<00:10, 11.63it/s]



 70%|███████   | 277/393 [00:25<00:10, 11.22it/s]



 72%|███████▏  | 281/393 [00:25<00:10, 10.81it/s]



 72%|███████▏  | 283/393 [00:25<00:09, 11.38it/s]



 73%|███████▎  | 287/393 [00:26<00:09, 11.59it/s]



 74%|███████▎  | 289/393 [00:26<00:08, 11.87it/s]



 75%|███████▍  | 293/393 [00:26<00:08, 11.64it/s]



 75%|███████▌  | 295/393 [00:26<00:08, 11.92it/s]



 76%|███████▌  | 299/393 [00:26<00:07, 12.32it/s]



 77%|███████▋  | 301/393 [00:27<00:07, 12.43it/s]



 78%|███████▊  | 305/393 [00:27<00:07, 12.36it/s]



 78%|███████▊  | 307/393 [00:27<00:07, 10.93it/s]



 79%|███████▉  | 311/393 [00:28<00:07, 11.27it/s]



 80%|███████▉  | 313/393 [00:28<00:06, 11.64it/s]



 81%|████████  | 317/393 [00:28<00:08,  9.41it/s]



 82%|████████▏ | 321/393 [00:30<00:19,  3.70it/s]



 82%|████████▏ | 323/393 [00:30<00:14,  4.70it/s]



 83%|████████▎ | 327/393 [00:31<00:09,  6.73it/s]



 84%|████████▎ | 329/393 [00:31<00:08,  7.75it/s]



 85%|████████▍ | 333/393 [00:31<00:06,  9.48it/s]



 85%|████████▌ | 335/393 [00:31<00:05, 10.22it/s]



 86%|████████▋ | 339/393 [00:32<00:04, 11.11it/s]



 87%|████████▋ | 341/393 [00:32<00:04, 11.45it/s]



 88%|████████▊ | 345/393 [00:32<00:04, 11.88it/s]



 88%|████████▊ | 347/393 [00:32<00:03, 12.08it/s]



 89%|████████▉ | 351/393 [00:33<00:03, 11.55it/s]



 90%|████████▉ | 353/393 [00:33<00:03, 11.87it/s]



 91%|█████████ | 357/393 [00:33<00:02, 12.18it/s]



 91%|█████████▏| 359/393 [00:33<00:02, 12.31it/s]



 92%|█████████▏| 363/393 [00:34<00:02, 12.02it/s]



 93%|█████████▎| 365/393 [00:34<00:02, 11.92it/s]



 94%|█████████▍| 369/393 [00:34<00:01, 12.12it/s]



 94%|█████████▍| 371/393 [00:34<00:01, 12.09it/s]



 95%|█████████▌| 375/393 [00:35<00:01, 12.31it/s]



 96%|█████████▌| 377/393 [00:35<00:01, 12.36it/s]



 97%|█████████▋| 381/393 [00:35<00:00, 12.09it/s]



 97%|█████████▋| 383/393 [00:35<00:00, 12.30it/s]



 98%|█████████▊| 387/393 [00:36<00:00, 12.51it/s]



 99%|█████████▉| 389/393 [00:36<00:00, 11.52it/s]



 99%|█████████▉| 391/393 [00:36<00:00, 11.74it/s]



100%|██████████| 393/393 [00:36<00:00, 10.70it/s]

Done. 393 MCQs written to phys_unibench_mcqs.json
Images written (when available) into phys_unibench_images.zip





In [None]:
!pip install --quiet requests pillow tqdm


In [None]:
import os, io, sys, json, zipfile
from typing import Optional, List
import requests
from PIL import Image

# tqdm is optional: if importing it fails for any reason, substitute a
# pass-through that returns the iterable unchanged and ignores keyword options.
try:
    from tqdm import tqdm
except Exception:
    def tqdm(x, **kw):
        return x

# ---- CONFIG ----
# Default JSON source used by load_json() when no path/URL is given.
HF_JSON_URL = "https://huggingface.co/datasets/PrismaX/PhysUniBench/raw/main/PhysUnivBench_en_MCQ.json"
# Prefix to which an image file name is appended by download_and_convert().
IMAGE_BASE_URL = "https://huggingface.co/datasets/PrismaX/PhysUniBench/resolve/main/images/"
# Defaults for extract_images_to_zip(): first id to assign and the archive to write.
DEFAULT_START_ID = 174
DEFAULT_OUTPUT_ZIP = "phys_unibench_images.zip"
# Passed as timeout= to the HTTP requests made by the functions below.
REQUEST_TIMEOUT = 30
# ----------------

def load_json(path_or_url: Optional[str]) -> List[dict]:
    """Load and return decoded JSON from a local file or over HTTP.

    * ``path_or_url`` names an existing local path: the file is read as UTF-8.
    * ``path_or_url`` is given but no such path exists: it is fetched as a URL.
    * ``path_or_url`` is falsy: HF_JSON_URL is fetched.

    HTTP fetches use REQUEST_TIMEOUT and raise on an error status.
    """
    if path_or_url and os.path.exists(path_or_url):
        with open(path_or_url, "r", encoding="utf-8") as handle:
            return json.load(handle)

    # not an existing local path: treat the argument as a URL, or use the default
    response = requests.get(path_or_url or HF_JSON_URL, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.json()

def candidate_filenames(original: str) -> List[str]:
    """List the filenames worth trying for ``original``, in the order to try them.

    The given name comes first, followed by the same stem with ``.png`` and ``.jpg``.
    When the stem consists only of digits, the stem-plus-one ``.png`` / ``.jpg`` names
    follow (covers 0-based vs 1-based numbering). Repeated names are dropped, keeping
    the earliest occurrence. An empty ``original`` yields ``[]``.
    """
    if not original:
        return []

    stem, _ext = os.path.splitext(original)
    names = [original] + [stem + suffix for suffix in (".png", ".jpg")]

    if stem.isdigit():
        try:
            bumped = str(int(stem) + 1)
        except Exception:
            # isdigit() accepts some characters that int() rejects; skip the shifted names then.
            bumped = None
        if bumped is not None:
            names += [bumped + ".png", bumped + ".jpg"]

    # dict keys keep insertion order, so this removes repeats without reordering
    return list(dict.fromkeys(name for name in names if name))

def download_and_convert(session: requests.Session, fname: str) -> Optional[bytes]:
    """Fetch ``IMAGE_BASE_URL + fname`` and return the image re-encoded as PNG bytes.

    Returns ``None`` when the request raises, the status is not 200, or the body is
    empty. If the body cannot be decoded/re-encoded by Pillow, the downloaded bytes
    are returned unchanged instead.
    """
    target = IMAGE_BASE_URL + fname

    # Phase 1: download. Any failure here means "no image".
    try:
        reply = session.get(target, timeout=REQUEST_TIMEOUT)
        if reply.status_code != 200:
            return None
        payload = reply.content
        if not payload:
            return None
    except Exception:
        return None

    # Phase 2: normalise to RGB PNG; on failure fall back to the raw download.
    try:
        picture = Image.open(io.BytesIO(payload)).convert("RGB")
        encoded = io.BytesIO()
        picture.save(encoded, format="PNG")
        return encoded.getvalue()
    except Exception:
        return payload

def extract_images_to_zip(json_path: Optional[str]=None, start_id: int=DEFAULT_START_ID,
                          output_zip: str=DEFAULT_OUTPUT_ZIP, debug: bool=False):
    """Download the image of every JSON entry into a ZIP as ``images/image{ID}.png``.

    Entries are numbered consecutively from ``start_id`` in JSON order; an entry
    consumes its ID whether or not an image could be stored for it. For each entry the
    filename is read from the ``image``, ``img`` or ``filename`` key (first truthy one)
    and every name from ``candidate_filenames`` is tried until a download succeeds.

    Args:
        json_path: Local path or URL of the JSON, passed to ``load_json``
            (``None`` makes ``load_json`` use its default source).
        start_id: ID assigned to the first entry.
        output_zip: Path of the archive to create (opened in ``"w"`` mode).
        debug: When true, print one line per entry with the outcome, plus the
            URLs tried for entries that failed.

    Returns:
        None. Progress and a final summary are printed. If the JSON cannot be
        loaded or its root is not a list, a message is printed and no archive
        is created.
    """
    try:
        data = load_json(json_path)
    except Exception as e:
        print("Failed to load JSON:", e)
        return

    if not isinstance(data, list):
        print("JSON root is not a list; aborting.")
        return

    total = len(data)
    print(f"Loaded {total} entries. Starting ID = {start_id}. Output zip = {output_zip}")

    success = 0
    failed_entries = []

    # Both the HTTP session and the archive are context-managed: the session's pooled
    # connections are released and the zip is finalised even if the loop raises.
    with requests.Session() as session, \
            zipfile.ZipFile(output_zip, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        session.headers.update({"User-Agent":"physunibench-image-extractor/1.0"})

        # a small README so the zip is never empty
        readme_text = ("PhysUniBench images ZIP\n"
                       "Images stored as images/image{ID}.png\n")
        zf.writestr("README.txt", readme_text)

        for idx, row in enumerate(tqdm(data, desc="Downloading images")):
            # IDs advance with the entry position, including entries without an image.
            qid = start_id + idx

            # typical image key names in the dataset
            src_image_name = ""
            if isinstance(row, dict):
                src_image_name = (row.get("image") or row.get("img") or row.get("filename") or "")

            image_zip_path = f"images/image{qid}.png"

            if not src_image_name:
                failed_entries.append((qid, src_image_name, ["(no filename in JSON)"]))
                if debug:
                    print(f"[{qid}] no image filename in JSON for entry index {idx}")
                continue

            if not isinstance(src_image_name, str):
                # A list/number here would raise inside candidate_filenames and abort the
                # whole run; record it as a failed entry and keep going instead.
                failed_entries.append((qid, src_image_name, ["(image field is not a string)"]))
                if debug:
                    print(f"[{qid}] image field is not a string for entry index {idx}: {src_image_name!r}")
                continue

            png_bytes = None
            source_name = None  # candidate that actually produced the bytes
            tried = []
            for cand in candidate_filenames(src_image_name):
                tried.append(IMAGE_BASE_URL + cand)
                png_bytes = download_and_convert(session, cand)
                if png_bytes:
                    source_name = cand
                    break

            if png_bytes:
                zf.writestr(image_zip_path, png_bytes)
                success += 1
                if debug:
                    print(f"[{qid}] downloaded -> {image_zip_path} (source: {source_name})")
            else:
                failed_entries.append((qid, src_image_name, tried))
                if debug:
                    print(f"[{qid}] FAILED. Tried URLs:")
                    for u in tried:
                        print("   ", u)

    abs_path = os.path.abspath(output_zip)
    print("\nFinished.")
    print(f"ZIP file written to: {abs_path}")
    print(f"Images downloaded: {success} / {total}")
    if failed_entries:
        print(f"Failed images: {len(failed_entries)}. Use debug=True to inspect tried URLs.")
    else:
        print("All images downloaded successfully (or there were no image fields).")

# ---- RUN it here with defaults ----
# Change the arguments below if you want (e.g., json_path='/content/PhysUnivBench_en_MCQ.json').
# The start ID and zip name come from the CONFIG constants so they are defined in one place only.
extract_images_to_zip(
    json_path=None,
    start_id=DEFAULT_START_ID,
    output_zip=DEFAULT_OUTPUT_ZIP,
    debug=True,
)


Loaded 393 entries. Starting ID = 174. Output zip = phys_unibench_images.zip


Downloading images:   0%|          | 1/393 [00:00<05:06,  1.28it/s]

[174] downloaded -> images/image174.png (source: 0.jpg)


Downloading images:   1%|          | 2/393 [00:01<04:29,  1.45it/s]

[175] downloaded -> images/image175.png (source: 1.jpg)


Downloading images:   1%|          | 3/393 [00:01<03:26,  1.88it/s]

[176] downloaded -> images/image176.png (source: 2.jpg)


Downloading images:   1%|          | 4/393 [00:01<02:40,  2.42it/s]

[177] downloaded -> images/image177.png (source: 3.jpg)


Downloading images:   1%|▏         | 5/393 [00:02<02:14,  2.89it/s]

[178] downloaded -> images/image178.png (source: 4.jpg)


Downloading images:   2%|▏         | 6/393 [00:02<02:03,  3.14it/s]

[179] downloaded -> images/image179.png (source: 5.jpg)


Downloading images:   2%|▏         | 7/393 [00:02<01:50,  3.49it/s]

[180] downloaded -> images/image180.png (source: 6.jpg)


Downloading images:   2%|▏         | 8/393 [00:03<02:14,  2.87it/s]

[181] downloaded -> images/image181.png (source: 7.jpg)


Downloading images:   2%|▏         | 9/393 [00:03<01:59,  3.22it/s]

[182] downloaded -> images/image182.png (source: 8.jpg)


Downloading images:   3%|▎         | 10/393 [00:03<01:56,  3.28it/s]

[183] downloaded -> images/image183.png (source: 9.jpg)


Downloading images:   3%|▎         | 12/393 [00:04<01:45,  3.61it/s]

[184] downloaded -> images/image184.png (source: 10.jpg)
[185] downloaded -> images/image185.png (source: 11.jpg)


Downloading images:   4%|▎         | 14/393 [00:04<01:40,  3.76it/s]

[186] downloaded -> images/image186.png (source: 12.jpg)
[187] downloaded -> images/image187.png (source: 13.jpg)


Downloading images:   4%|▍         | 15/393 [00:05<01:53,  3.34it/s]

[188] downloaded -> images/image188.png (source: 14.jpg)


Downloading images:   4%|▍         | 16/393 [00:05<01:46,  3.53it/s]

[189] downloaded -> images/image189.png (source: 15.jpg)


Downloading images:   4%|▍         | 17/393 [00:05<01:52,  3.34it/s]

[190] downloaded -> images/image190.png (source: 16.jpg)


Downloading images:   5%|▍         | 18/393 [00:06<01:56,  3.22it/s]

[191] downloaded -> images/image191.png (source: 17.jpg)


Downloading images:   5%|▍         | 19/393 [00:06<02:20,  2.65it/s]

[192] downloaded -> images/image192.png (source: 18.jpg)


Downloading images:   5%|▌         | 21/393 [00:07<01:54,  3.24it/s]

[193] downloaded -> images/image193.png (source: 19.jpg)
[194] downloaded -> images/image194.png (source: 20.jpg)


Downloading images:   6%|▌         | 22/393 [00:07<02:00,  3.08it/s]

[195] downloaded -> images/image195.png (source: 21.jpg)


Downloading images:   6%|▌         | 23/393 [00:07<02:04,  2.98it/s]

[196] downloaded -> images/image196.png (source: 22.jpg)


Downloading images:   6%|▌         | 24/393 [00:08<01:51,  3.32it/s]

[197] downloaded -> images/image197.png (source: 23.jpg)


Downloading images:   6%|▋         | 25/393 [00:08<01:43,  3.55it/s]

[198] downloaded -> images/image198.png (source: 24.jpg)


Downloading images:   7%|▋         | 26/393 [00:08<01:38,  3.73it/s]

[199] downloaded -> images/image199.png (source: 25.jpg)


Downloading images:   7%|▋         | 27/393 [00:08<01:44,  3.50it/s]

[200] downloaded -> images/image200.png (source: 26.jpg)


Downloading images:   7%|▋         | 28/393 [00:09<01:37,  3.75it/s]

[201] downloaded -> images/image201.png (source: 27.jpg)


Downloading images:   7%|▋         | 29/393 [00:09<01:48,  3.36it/s]

[202] downloaded -> images/image202.png (source: 28.jpg)


Downloading images:   8%|▊         | 30/393 [00:09<01:51,  3.25it/s]

[203] downloaded -> images/image203.png (source: 29.jpg)


Downloading images:   8%|▊         | 32/393 [00:10<01:33,  3.87it/s]

[204] downloaded -> images/image204.png (source: 30.jpg)
[205] downloaded -> images/image205.png (source: 31.jpg)


Downloading images:   9%|▊         | 34/393 [00:11<02:55,  2.04it/s]

[206] downloaded -> images/image206.png (source: 32.jpg)
[207] downloaded -> images/image207.png (source: 33.jpg)


Downloading images:   9%|▉         | 35/393 [00:17<12:22,  2.07s/it]

[208] downloaded -> images/image208.png (source: 34.jpg)


Downloading images:   9%|▉         | 36/393 [00:17<09:13,  1.55s/it]

[209] downloaded -> images/image209.png (source: 35.jpg)


Downloading images:   9%|▉         | 37/393 [00:18<07:06,  1.20s/it]

[210] downloaded -> images/image210.png (source: 36.jpg)


Downloading images:  10%|▉         | 39/393 [00:19<04:36,  1.28it/s]

[211] downloaded -> images/image211.png (source: 37.jpg)
[212] downloaded -> images/image212.png (source: 38.jpg)


Downloading images:  10%|█         | 40/393 [00:21<06:37,  1.13s/it]

[213] downloaded -> images/image213.png (source: 39.jpg)


Downloading images:  11%|█         | 42/393 [00:21<04:19,  1.35it/s]

[214] downloaded -> images/image214.png (source: 40.jpg)
[215] downloaded -> images/image215.png (source: 41.jpg)


Downloading images:  11%|█         | 44/393 [00:22<02:42,  2.15it/s]

[216] downloaded -> images/image216.png (source: 42.jpg)
[217] downloaded -> images/image217.png (source: 43.jpg)


Downloading images:  11%|█▏        | 45/393 [00:22<02:25,  2.39it/s]

[218] downloaded -> images/image218.png (source: 44.jpg)


Downloading images:  12%|█▏        | 46/393 [00:22<02:11,  2.63it/s]

[219] downloaded -> images/image219.png (source: 45.jpg)


Downloading images:  12%|█▏        | 47/393 [00:23<01:59,  2.89it/s]

[220] downloaded -> images/image220.png (source: 46.jpg)


Downloading images:  12%|█▏        | 48/393 [00:23<01:53,  3.04it/s]

[221] downloaded -> images/image221.png (source: 47.jpg)


Downloading images:  12%|█▏        | 49/393 [00:23<01:41,  3.38it/s]

[222] downloaded -> images/image222.png (source: 48.jpg)


Downloading images:  13%|█▎        | 50/393 [00:23<01:32,  3.72it/s]

[223] downloaded -> images/image223.png (source: 49.jpg)


Downloading images:  13%|█▎        | 51/393 [00:24<01:28,  3.85it/s]

[224] downloaded -> images/image224.png (source: 50.jpg)


Downloading images:  13%|█▎        | 53/393 [00:24<01:27,  3.90it/s]

[225] downloaded -> images/image225.png (source: 51.jpg)
[226] downloaded -> images/image226.png (source: 52.jpg)


Downloading images:  14%|█▎        | 54/393 [00:24<01:26,  3.92it/s]

[227] downloaded -> images/image227.png (source: 53.jpg)


Downloading images:  14%|█▍        | 55/393 [00:25<01:41,  3.34it/s]

[228] downloaded -> images/image228.png (source: 54.jpg)


Downloading images:  14%|█▍        | 56/393 [00:25<01:42,  3.28it/s]

[229] downloaded -> images/image229.png (source: 55.jpg)


Downloading images:  15%|█▍        | 57/393 [00:26<01:55,  2.92it/s]

[230] downloaded -> images/image230.png (source: 56.jpg)


Downloading images:  15%|█▍        | 58/393 [00:26<01:42,  3.28it/s]

[231] downloaded -> images/image231.png (source: 57.jpg)


Downloading images:  15%|█▌        | 59/393 [00:26<01:38,  3.39it/s]

[232] downloaded -> images/image232.png (source: 58.jpg)


Downloading images:  15%|█▌        | 60/393 [00:27<02:04,  2.67it/s]

[233] downloaded -> images/image233.png (source: 59.jpg)


Downloading images:  16%|█▌        | 62/393 [00:27<01:33,  3.53it/s]

[234] downloaded -> images/image234.png (source: 60.jpg)
[235] downloaded -> images/image235.png (source: 61.jpg)


Downloading images:  16%|█▌        | 63/393 [00:27<01:45,  3.13it/s]

[236] downloaded -> images/image236.png (source: 62.jpg)


Downloading images:  16%|█▋        | 64/393 [00:28<01:48,  3.04it/s]

[237] downloaded -> images/image237.png (source: 63.jpg)


Downloading images:  17%|█▋        | 65/393 [00:28<01:40,  3.25it/s]

[238] downloaded -> images/image238.png (source: 64.jpg)


Downloading images:  17%|█▋        | 66/393 [00:28<01:33,  3.48it/s]

[239] downloaded -> images/image239.png (source: 65.jpg)


Downloading images:  17%|█▋        | 67/393 [00:28<01:27,  3.74it/s]

[240] downloaded -> images/image240.png (source: 66.jpg)


Downloading images:  18%|█▊        | 69/393 [00:29<01:14,  4.36it/s]

[241] downloaded -> images/image241.png (source: 67.jpg)
[242] downloaded -> images/image242.png (source: 68.jpg)


Downloading images:  18%|█▊        | 70/393 [00:29<01:08,  4.69it/s]

[243] downloaded -> images/image243.png (source: 69.jpg)


Downloading images:  18%|█▊        | 71/393 [00:30<02:05,  2.57it/s]

[244] downloaded -> images/image244.png (source: 70.jpg)


Downloading images:  18%|█▊        | 72/393 [00:30<01:53,  2.83it/s]

[245] downloaded -> images/image245.png (source: 71.jpg)


Downloading images:  19%|█▊        | 73/393 [00:31<02:15,  2.36it/s]

[246] downloaded -> images/image246.png (source: 72.jpg)


Downloading images:  19%|█▉        | 74/393 [00:31<02:19,  2.29it/s]

[247] downloaded -> images/image247.png (source: 73.jpg)


Downloading images:  19%|█▉        | 75/393 [00:32<02:16,  2.32it/s]

[248] downloaded -> images/image248.png (source: 74.jpg)


Downloading images:  20%|█▉        | 77/393 [00:32<01:38,  3.22it/s]

[249] downloaded -> images/image249.png (source: 75.jpg)
[250] downloaded -> images/image250.png (source: 76.jpg)


Downloading images:  20%|█▉        | 78/393 [00:32<01:28,  3.55it/s]

[251] downloaded -> images/image251.png (source: 77.jpg)


Downloading images:  20%|██        | 79/393 [00:32<01:25,  3.66it/s]

[252] downloaded -> images/image252.png (source: 78.jpg)


Downloading images:  20%|██        | 80/393 [00:33<01:38,  3.17it/s]

[253] downloaded -> images/image253.png (source: 79.jpg)


Downloading images:  21%|██        | 82/393 [00:33<01:20,  3.88it/s]

[254] downloaded -> images/image254.png (source: 80.jpg)
[255] downloaded -> images/image255.png (source: 81.jpg)


Downloading images:  21%|██▏       | 84/393 [00:34<01:11,  4.32it/s]

[256] downloaded -> images/image256.png (source: 82.jpg)
[257] downloaded -> images/image257.png (source: 83.jpg)


Downloading images:  22%|██▏       | 85/393 [00:34<01:09,  4.43it/s]

[258] downloaded -> images/image258.png (source: 84.jpg)


Downloading images:  22%|██▏       | 87/393 [00:34<01:06,  4.57it/s]

[259] downloaded -> images/image259.png (source: 85.jpg)
[260] downloaded -> images/image260.png (source: 86.jpg)


Downloading images:  22%|██▏       | 88/393 [00:35<01:13,  4.12it/s]

[261] downloaded -> images/image261.png (source: 87.jpg)


Downloading images:  23%|██▎       | 89/393 [00:35<01:11,  4.27it/s]

[262] downloaded -> images/image262.png (source: 88.jpg)


Downloading images:  23%|██▎       | 90/393 [00:35<01:14,  4.07it/s]

[263] downloaded -> images/image263.png (source: 89.jpg)


Downloading images:  23%|██▎       | 91/393 [00:35<01:15,  4.00it/s]

[264] downloaded -> images/image264.png (source: 90.jpg)


Downloading images:  23%|██▎       | 92/393 [00:36<01:22,  3.64it/s]

[265] downloaded -> images/image265.png (source: 91.jpg)


Downloading images:  24%|██▍       | 94/393 [00:36<01:15,  3.96it/s]

[266] downloaded -> images/image266.png (source: 92.jpg)
[267] downloaded -> images/image267.png (source: 93.jpg)


Downloading images:  24%|██▍       | 95/393 [00:37<01:22,  3.61it/s]

[268] downloaded -> images/image268.png (source: 94.jpg)


Downloading images:  24%|██▍       | 96/393 [00:37<01:16,  3.88it/s]

[269] downloaded -> images/image269.png (source: 95.jpg)


Downloading images:  25%|██▍       | 97/393 [00:37<01:13,  4.04it/s]

[270] downloaded -> images/image270.png (source: 96.jpg)


Downloading images:  25%|██▍       | 98/393 [00:38<01:43,  2.86it/s]

[271] downloaded -> images/image271.png (source: 97.jpg)


Downloading images:  25%|██▌       | 99/393 [00:38<01:50,  2.66it/s]

[272] downloaded -> images/image272.png (source: 98.jpg)


Downloading images:  25%|██▌       | 100/393 [00:38<01:42,  2.85it/s]

[273] downloaded -> images/image273.png (source: 99.jpg)


Downloading images:  26%|██▌       | 101/393 [00:39<01:41,  2.87it/s]

[274] downloaded -> images/image274.png (source: 100.jpg)


Downloading images:  26%|██▌       | 102/393 [00:39<01:41,  2.87it/s]

[275] downloaded -> images/image275.png (source: 101.jpg)


Downloading images:  26%|██▌       | 103/393 [00:40<03:03,  1.58it/s]

[276] downloaded -> images/image276.png (source: 102.jpg)


Downloading images:  26%|██▋       | 104/393 [00:41<02:35,  1.86it/s]

[277] downloaded -> images/image277.png (source: 103.jpg)


Downloading images:  27%|██▋       | 105/393 [00:41<02:08,  2.25it/s]

[278] downloaded -> images/image278.png (source: 104.jpg)


Downloading images:  27%|██▋       | 106/393 [00:47<09:47,  2.05s/it]

[279] downloaded -> images/image279.png (source: 105.jpg)


Downloading images:  27%|██▋       | 107/393 [00:47<07:16,  1.53s/it]

[280] downloaded -> images/image280.png (source: 106.jpg)


Downloading images:  27%|██▋       | 108/393 [00:47<05:34,  1.17s/it]

[281] downloaded -> images/image281.png (source: 107.jpg)


Downloading images:  28%|██▊       | 109/393 [00:48<04:21,  1.09it/s]

[282] downloaded -> images/image282.png (source: 108.jpg)


Downloading images:  28%|██▊       | 110/393 [00:48<03:23,  1.39it/s]

[283] downloaded -> images/image283.png (source: 109.jpg)


Downloading images:  28%|██▊       | 111/393 [00:48<03:09,  1.49it/s]

[284] downloaded -> images/image284.png (source: 110.jpg)


Downloading images:  28%|██▊       | 112/393 [00:51<05:23,  1.15s/it]

[285] downloaded -> images/image285.png (source: 111.jpg)


Downloading images:  29%|██▉       | 113/393 [00:52<04:57,  1.06s/it]

[286] downloaded -> images/image286.png (source: 112.jpg)


Downloading images:  29%|██▉       | 115/393 [00:54<04:38,  1.00s/it]

[287] downloaded -> images/image287.png (source: 113.jpg)
[288] downloaded -> images/image288.png (source: 114.jpg)


Downloading images:  30%|██▉       | 117/393 [00:54<02:46,  1.66it/s]

[289] downloaded -> images/image289.png (source: 115.jpg)
[290] downloaded -> images/image290.png (source: 116.jpg)


Downloading images:  30%|███       | 119/393 [00:55<01:46,  2.57it/s]

[291] downloaded -> images/image291.png (source: 117.jpg)
[292] downloaded -> images/image292.png (source: 118.jpg)


Downloading images:  31%|███       | 120/393 [00:55<01:33,  2.92it/s]

[293] downloaded -> images/image293.png (source: 119.jpg)


Downloading images:  31%|███       | 121/393 [00:55<01:32,  2.95it/s]

[294] downloaded -> images/image294.png (source: 120.jpg)


Downloading images:  31%|███       | 122/393 [00:55<01:27,  3.09it/s]

[295] downloaded -> images/image295.png (source: 121.jpg)


Downloading images:  31%|███▏      | 123/393 [00:56<01:29,  3.00it/s]

[296] downloaded -> images/image296.png (source: 122.jpg)


Downloading images:  32%|███▏      | 125/393 [00:57<02:05,  2.13it/s]

[297] downloaded -> images/image297.png (source: 123.jpg)
[298] downloaded -> images/image298.png (source: 124.jpg)


Downloading images:  32%|███▏      | 126/393 [00:57<01:45,  2.52it/s]

[299] downloaded -> images/image299.png (source: 125.jpg)


Downloading images:  32%|███▏      | 127/393 [00:58<01:35,  2.78it/s]

[300] downloaded -> images/image300.png (source: 126.jpg)


Downloading images:  33%|███▎      | 128/393 [00:58<01:35,  2.78it/s]

[301] downloaded -> images/image301.png (source: 127.jpg)


Downloading images:  33%|███▎      | 129/393 [00:58<01:32,  2.84it/s]

[302] downloaded -> images/image302.png (source: 128.jpg)


Downloading images:  33%|███▎      | 130/393 [00:59<01:24,  3.10it/s]

[303] downloaded -> images/image303.png (source: 129.jpg)


Downloading images:  33%|███▎      | 131/393 [00:59<01:29,  2.92it/s]

[304] downloaded -> images/image304.png (source: 130.jpg)


Downloading images:  34%|███▎      | 132/393 [00:59<01:24,  3.08it/s]

[305] downloaded -> images/image305.png (source: 131.jpg)


Downloading images:  34%|███▍      | 133/393 [01:00<01:25,  3.04it/s]

[306] downloaded -> images/image306.png (source: 132.jpg)


Downloading images:  34%|███▍      | 134/393 [01:00<01:30,  2.87it/s]

[307] downloaded -> images/image307.png (source: 133.jpg)


Downloading images:  35%|███▍      | 136/393 [01:00<01:14,  3.46it/s]

[308] downloaded -> images/image308.png (source: 134.jpg)
[309] downloaded -> images/image309.png (source: 135.jpg)


Downloading images:  35%|███▌      | 138/393 [01:01<00:58,  4.39it/s]

[310] downloaded -> images/image310.png (source: 136.jpg)
[311] downloaded -> images/image311.png (source: 137.jpg)


Downloading images:  36%|███▌      | 140/393 [01:01<00:56,  4.49it/s]

[312] downloaded -> images/image312.png (source: 138.jpg)
[313] downloaded -> images/image313.png (source: 139.jpg)


Downloading images:  36%|███▌      | 141/393 [01:02<01:02,  4.01it/s]

[314] downloaded -> images/image314.png (source: 140.jpg)


Downloading images:  36%|███▌      | 142/393 [01:02<01:21,  3.08it/s]

[315] downloaded -> images/image315.png (source: 141.jpg)


Downloading images:  36%|███▋      | 143/393 [01:03<01:40,  2.50it/s]

[316] downloaded -> images/image316.png (source: 142.jpg)


Downloading images:  37%|███▋      | 145/393 [01:03<01:17,  3.21it/s]

[317] downloaded -> images/image317.png (source: 143.jpg)
[318] downloaded -> images/image318.png (source: 144.jpg)


Downloading images:  37%|███▋      | 146/393 [01:03<01:05,  3.74it/s]

[319] downloaded -> images/image319.png (source: 145.jpg)


Downloading images:  37%|███▋      | 147/393 [01:04<01:07,  3.66it/s]

[320] downloaded -> images/image320.png (source: 146.jpg)


Downloading images:  38%|███▊      | 148/393 [01:04<01:01,  3.96it/s]

[321] downloaded -> images/image321.png (source: 147.jpg)


Downloading images:  38%|███▊      | 149/393 [01:04<01:24,  2.89it/s]

[322] downloaded -> images/image322.png (source: 148.jpg)


Downloading images:  38%|███▊      | 150/393 [01:05<01:17,  3.14it/s]

[323] downloaded -> images/image323.png (source: 149.jpg)


Downloading images:  39%|███▊      | 152/393 [01:05<01:07,  3.55it/s]

[324] downloaded -> images/image324.png (source: 150.jpg)
[325] downloaded -> images/image325.png (source: 151.jpg)


Downloading images:  39%|███▉      | 153/393 [01:05<01:08,  3.52it/s]

[326] downloaded -> images/image326.png (source: 152.jpg)


Downloading images:  39%|███▉      | 155/393 [01:06<00:57,  4.11it/s]

[327] downloaded -> images/image327.png (source: 153.jpg)
[328] downloaded -> images/image328.png (source: 154.jpg)


Downloading images:  40%|███▉      | 156/393 [01:06<00:57,  4.15it/s]

[329] downloaded -> images/image329.png (source: 155.jpg)


Downloading images:  40%|████      | 158/393 [01:06<00:50,  4.68it/s]

[330] downloaded -> images/image330.png (source: 156.jpg)
[331] downloaded -> images/image331.png (source: 157.jpg)


Downloading images:  40%|████      | 159/393 [01:07<00:50,  4.67it/s]

[332] downloaded -> images/image332.png (source: 158.jpg)


Downloading images:  41%|████      | 160/393 [01:07<00:54,  4.28it/s]

[333] downloaded -> images/image333.png (source: 159.jpg)


Downloading images:  41%|████      | 161/393 [01:07<00:52,  4.46it/s]

[334] downloaded -> images/image334.png (source: 160.jpg)


Downloading images:  41%|████      | 162/393 [01:07<00:53,  4.32it/s]

[335] downloaded -> images/image335.png (source: 161.jpg)


Downloading images:  41%|████▏     | 163/393 [01:08<00:51,  4.44it/s]

[336] downloaded -> images/image336.png (source: 162.jpg)


Downloading images:  42%|████▏     | 164/393 [01:08<00:55,  4.15it/s]

[337] downloaded -> images/image337.png (source: 163.jpg)


Downloading images:  42%|████▏     | 165/393 [01:08<01:06,  3.42it/s]

[338] downloaded -> images/image338.png (source: 164.jpg)


Downloading images:  42%|████▏     | 166/393 [01:09<01:08,  3.30it/s]

[339] downloaded -> images/image339.png (source: 165.jpg)


Downloading images:  42%|████▏     | 167/393 [01:09<01:06,  3.41it/s]

[340] downloaded -> images/image340.png (source: 166.jpg)


Downloading images:  43%|████▎     | 168/393 [01:09<01:10,  3.17it/s]

[341] downloaded -> images/image341.png (source: 167.jpg)


Downloading images:  43%|████▎     | 170/393 [01:10<01:06,  3.34it/s]

[342] downloaded -> images/image342.png (source: 168.jpg)
[343] downloaded -> images/image343.png (source: 169.jpg)


Downloading images:  44%|████▎     | 171/393 [01:10<01:12,  3.07it/s]

[344] downloaded -> images/image344.png (source: 170.jpg)


Downloading images:  44%|████▍     | 172/393 [01:11<01:08,  3.23it/s]

[345] downloaded -> images/image345.png (source: 171.jpg)


Downloading images:  44%|████▍     | 174/393 [01:11<00:56,  3.85it/s]

[346] downloaded -> images/image346.png (source: 172.jpg)
[347] downloaded -> images/image347.png (source: 173.jpg)


Downloading images:  45%|████▍     | 175/393 [01:11<00:55,  3.92it/s]

[348] downloaded -> images/image348.png (source: 174.jpg)


Downloading images:  45%|████▍     | 176/393 [01:12<00:59,  3.65it/s]

[349] downloaded -> images/image349.png (source: 175.jpg)


Downloading images:  45%|████▌     | 178/393 [01:12<00:48,  4.44it/s]

[350] downloaded -> images/image350.png (source: 176.jpg)
[351] downloaded -> images/image351.png (source: 177.jpg)


Downloading images:  46%|████▌     | 179/393 [01:12<00:49,  4.30it/s]

[352] downloaded -> images/image352.png (source: 178.jpg)


Downloading images:  46%|████▌     | 180/393 [01:12<00:48,  4.38it/s]

[353] downloaded -> images/image353.png (source: 179.jpg)


Downloading images:  46%|████▌     | 181/393 [01:13<00:48,  4.34it/s]

[354] downloaded -> images/image354.png (source: 180.jpg)


Downloading images:  46%|████▋     | 182/393 [01:13<00:57,  3.67it/s]

[355] downloaded -> images/image355.png (source: 181.jpg)


Downloading images:  47%|████▋     | 183/393 [01:13<01:06,  3.16it/s]

[356] downloaded -> images/image356.png (source: 182.jpg)


Downloading images:  47%|████▋     | 184/393 [01:14<01:00,  3.46it/s]

[357] downloaded -> images/image357.png (source: 183.jpg)


Downloading images:  47%|████▋     | 185/393 [01:14<01:02,  3.32it/s]

[358] downloaded -> images/image358.png (source: 184.jpg)


Downloading images:  47%|████▋     | 186/393 [01:15<01:26,  2.38it/s]

[359] downloaded -> images/image359.png (source: 185.jpg)


Downloading images:  48%|████▊     | 187/393 [01:16<02:38,  1.30it/s]

[360] downloaded -> images/image360.png (source: 186.jpg)


Downloading images:  48%|████▊     | 188/393 [01:16<02:04,  1.64it/s]

[361] downloaded -> images/image361.png (source: 187.jpg)


Downloading images:  48%|████▊     | 189/393 [01:17<01:43,  1.98it/s]

[362] downloaded -> images/image362.png (source: 188.jpg)


Downloading images:  48%|████▊     | 190/393 [01:17<01:46,  1.91it/s]

[363] downloaded -> images/image363.png (source: 189.jpg)


Downloading images:  49%|████▊     | 191/393 [01:19<02:51,  1.18it/s]

[364] downloaded -> images/image364.png (source: 190.jpg)


Downloading images:  49%|████▉     | 192/393 [01:19<02:19,  1.44it/s]

[365] downloaded -> images/image365.png (source: 191.jpg)


Downloading images:  49%|████▉     | 193/393 [01:20<02:15,  1.47it/s]

[366] downloaded -> images/image366.png (source: 192.jpg)


Downloading images:  49%|████▉     | 194/393 [01:21<02:13,  1.49it/s]

[367] downloaded -> images/image367.png (source: 193.jpg)


Downloading images:  50%|████▉     | 195/393 [01:23<03:41,  1.12s/it]

[368] downloaded -> images/image368.png (source: 194.jpg)


Downloading images:  50%|█████     | 197/393 [01:23<02:18,  1.42it/s]

[369] downloaded -> images/image369.png (source: 195.jpg)
[370] downloaded -> images/image370.png (source: 196.jpg)


Downloading images:  50%|█████     | 198/393 [01:24<02:27,  1.32it/s]

[371] downloaded -> images/image371.png (source: 197.jpg)


Downloading images:  51%|█████     | 199/393 [01:25<02:05,  1.55it/s]

[372] downloaded -> images/image372.png (source: 198.jpg)


Downloading images:  51%|█████     | 200/393 [01:25<01:39,  1.94it/s]

[373] downloaded -> images/image373.png (source: 199.jpg)


Downloading images:  51%|█████     | 201/393 [01:25<01:32,  2.08it/s]

[374] downloaded -> images/image374.png (source: 200.jpg)


Downloading images:  51%|█████▏    | 202/393 [01:26<01:23,  2.27it/s]

[375] downloaded -> images/image375.png (source: 201.jpg)


Downloading images:  52%|█████▏    | 203/393 [01:26<01:24,  2.24it/s]

[376] downloaded -> images/image376.png (source: 202.jpg)


Downloading images:  52%|█████▏    | 204/393 [01:26<01:14,  2.55it/s]

[377] downloaded -> images/image377.png (source: 203.jpg)


Downloading images:  52%|█████▏    | 205/393 [01:27<01:28,  2.11it/s]

[378] downloaded -> images/image378.png (source: 204.jpg)


Downloading images:  52%|█████▏    | 206/393 [01:27<01:16,  2.45it/s]

[379] downloaded -> images/image379.png (source: 205.jpg)


Downloading images:  53%|█████▎    | 207/393 [01:28<01:31,  2.04it/s]

[380] downloaded -> images/image380.png (source: 206.jpg)


Downloading images:  53%|█████▎    | 208/393 [01:28<01:27,  2.11it/s]

[381] downloaded -> images/image381.png (source: 207.jpg)


Downloading images:  53%|█████▎    | 209/393 [01:29<01:37,  1.88it/s]

[382] downloaded -> images/image382.png (source: 208.jpg)


Downloading images:  53%|█████▎    | 210/393 [01:29<01:19,  2.31it/s]

[383] downloaded -> images/image383.png (source: 209.jpg)


Downloading images:  54%|█████▎    | 211/393 [01:30<01:18,  2.32it/s]

[384] downloaded -> images/image384.png (source: 210.jpg)


Downloading images:  54%|█████▍    | 212/393 [01:30<01:11,  2.52it/s]

[385] downloaded -> images/image385.png (source: 211.jpg)


Downloading images:  54%|█████▍    | 213/393 [01:30<01:03,  2.83it/s]

[386] downloaded -> images/image386.png (source: 212.jpg)


Downloading images:  54%|█████▍    | 214/393 [01:30<00:58,  3.05it/s]

[387] downloaded -> images/image387.png (source: 213.jpg)


Downloading images:  55%|█████▍    | 215/393 [01:31<00:52,  3.39it/s]

[388] downloaded -> images/image388.png (source: 214.jpg)


Downloading images:  55%|█████▍    | 216/393 [01:31<00:47,  3.70it/s]

[389] downloaded -> images/image389.png (source: 215.jpg)


Downloading images:  55%|█████▌    | 217/393 [01:31<00:46,  3.76it/s]

[390] downloaded -> images/image390.png (source: 216.jpg)


Downloading images:  55%|█████▌    | 218/393 [01:31<00:45,  3.82it/s]

[391] downloaded -> images/image391.png (source: 217.jpg)


Downloading images:  56%|█████▌    | 219/393 [01:32<00:42,  4.08it/s]

[392] downloaded -> images/image392.png (source: 218.jpg)


Downloading images:  56%|█████▌    | 220/393 [01:32<00:42,  4.09it/s]

[393] downloaded -> images/image393.png (source: 219.jpg)


Downloading images:  56%|█████▌    | 221/393 [01:32<00:40,  4.29it/s]

[394] downloaded -> images/image394.png (source: 220.jpg)


Downloading images:  56%|█████▋    | 222/393 [01:32<00:39,  4.38it/s]

[395] downloaded -> images/image395.png (source: 221.jpg)


Downloading images:  57%|█████▋    | 224/393 [01:33<00:34,  4.91it/s]

[396] downloaded -> images/image396.png (source: 222.jpg)
[397] downloaded -> images/image397.png (source: 223.jpg)


Downloading images:  57%|█████▋    | 225/393 [01:33<00:33,  5.06it/s]

[398] downloaded -> images/image398.png (source: 224.jpg)


Downloading images:  58%|█████▊    | 227/393 [01:33<00:33,  4.93it/s]

[399] downloaded -> images/image399.png (source: 225.jpg)
[400] downloaded -> images/image400.png (source: 226.jpg)


Downloading images:  58%|█████▊    | 228/393 [01:33<00:34,  4.74it/s]

[401] downloaded -> images/image401.png (source: 227.jpg)


Downloading images:  58%|█████▊    | 229/393 [01:34<00:34,  4.75it/s]

[402] downloaded -> images/image402.png (source: 228.jpg)


Downloading images:  59%|█████▊    | 230/393 [01:34<00:38,  4.20it/s]

[403] downloaded -> images/image403.png (source: 229.jpg)


Downloading images:  59%|█████▉    | 231/393 [01:34<00:40,  3.99it/s]

[404] downloaded -> images/image404.png (source: 230.jpg)


Downloading images:  59%|█████▉    | 232/393 [01:35<00:54,  2.93it/s]

[405] downloaded -> images/image405.png (source: 231.jpg)


Downloading images:  59%|█████▉    | 233/393 [01:35<00:53,  2.99it/s]

[406] downloaded -> images/image406.png (source: 232.jpg)


Downloading images:  60%|█████▉    | 234/393 [01:35<00:49,  3.23it/s]

[407] downloaded -> images/image407.png (source: 233.jpg)


Downloading images:  60%|█████▉    | 235/393 [01:36<00:47,  3.36it/s]

[408] downloaded -> images/image408.png (source: 234.jpg)


Downloading images:  60%|██████    | 236/393 [01:36<00:43,  3.64it/s]

[409] downloaded -> images/image409.png (source: 235.jpg)


Downloading images:  60%|██████    | 237/393 [01:36<00:46,  3.36it/s]

[410] downloaded -> images/image410.png (source: 236.jpg)


Downloading images:  61%|██████    | 238/393 [01:36<00:43,  3.59it/s]

[411] downloaded -> images/image411.png (source: 237.jpg)


Downloading images:  61%|██████    | 239/393 [01:37<00:40,  3.85it/s]

[412] downloaded -> images/image412.png (source: 238.jpg)


Downloading images:  61%|██████    | 240/393 [01:37<00:37,  4.03it/s]

[413] downloaded -> images/image413.png (source: 239.jpg)


Downloading images:  61%|██████▏   | 241/393 [01:37<00:41,  3.64it/s]

[414] downloaded -> images/image414.png (source: 240.jpg)


Downloading images:  62%|██████▏   | 243/393 [01:38<00:44,  3.39it/s]

[415] downloaded -> images/image415.png (source: 241.jpg)
[416] downloaded -> images/image416.png (source: 242.jpg)


Downloading images:  62%|██████▏   | 244/393 [01:38<00:43,  3.45it/s]

[417] downloaded -> images/image417.png (source: 243.jpg)


Downloading images:  62%|██████▏   | 245/393 [01:38<00:40,  3.63it/s]

[418] downloaded -> images/image418.png (source: 244.jpg)


Downloading images:  63%|██████▎   | 246/393 [01:41<02:23,  1.03it/s]

[419] downloaded -> images/image419.png (source: 245.jpg)


Downloading images:  63%|██████▎   | 247/393 [01:42<02:33,  1.05s/it]

[420] downloaded -> images/image420.png (source: 246.jpg)


Downloading images:  63%|██████▎   | 248/393 [01:43<02:04,  1.17it/s]

[421] downloaded -> images/image421.png (source: 247.jpg)


Downloading images:  63%|██████▎   | 249/393 [01:43<01:37,  1.47it/s]

[422] downloaded -> images/image422.png (source: 248.jpg)


Downloading images:  64%|██████▎   | 250/393 [01:43<01:21,  1.76it/s]

[423] downloaded -> images/image423.png (source: 249.jpg)


Downloading images:  64%|██████▍   | 251/393 [01:43<01:06,  2.12it/s]

[424] downloaded -> images/image424.png (source: 250.jpg)


Downloading images:  64%|██████▍   | 252/393 [01:44<01:01,  2.30it/s]

[425] downloaded -> images/image425.png (source: 251.jpg)


Downloading images:  64%|██████▍   | 253/393 [01:44<00:51,  2.72it/s]

[426] downloaded -> images/image426.png (source: 252.jpg)


Downloading images:  65%|██████▍   | 254/393 [01:45<01:05,  2.12it/s]

[427] downloaded -> images/image427.png (source: 253.jpg)


Downloading images:  65%|██████▍   | 255/393 [01:46<01:33,  1.48it/s]

[428] downloaded -> images/image428.png (source: 254.jpg)


Downloading images:  65%|██████▌   | 256/393 [01:46<01:21,  1.67it/s]

[429] downloaded -> images/image429.png (source: 255.jpg)


Downloading images:  65%|██████▌   | 257/393 [01:47<01:33,  1.45it/s]

[430] downloaded -> images/image430.png (source: 256.jpg)


Downloading images:  66%|██████▌   | 258/393 [01:47<01:13,  1.83it/s]

[431] downloaded -> images/image431.png (source: 257.jpg)


Downloading images:  66%|██████▌   | 259/393 [01:48<01:10,  1.91it/s]

[432] downloaded -> images/image432.png (source: 258.jpg)


Downloading images:  66%|██████▌   | 260/393 [01:48<01:04,  2.07it/s]

[433] downloaded -> images/image433.png (source: 259.jpg)


Downloading images:  66%|██████▋   | 261/393 [01:49<00:56,  2.34it/s]

[434] downloaded -> images/image434.png (source: 260.jpg)


Downloading images:  67%|██████▋   | 263/393 [01:49<00:46,  2.82it/s]

[435] downloaded -> images/image435.png (source: 261.jpg)
[436] downloaded -> images/image436.png (source: 262.jpg)


Downloading images:  67%|██████▋   | 264/393 [01:49<00:40,  3.21it/s]

[437] downloaded -> images/image437.png (source: 263.jpg)


Downloading images:  67%|██████▋   | 265/393 [01:50<00:40,  3.18it/s]

[438] downloaded -> images/image438.png (source: 264.jpg)


Downloading images:  68%|██████▊   | 266/393 [01:50<00:37,  3.40it/s]

[439] downloaded -> images/image439.png (source: 265.jpg)


Downloading images:  68%|██████▊   | 268/393 [01:51<00:35,  3.53it/s]

[440] downloaded -> images/image440.png (source: 266.jpg)
[441] downloaded -> images/image441.png (source: 267.jpg)


Downloading images:  68%|██████▊   | 269/393 [01:51<00:32,  3.83it/s]

[442] downloaded -> images/image442.png (source: 268.jpg)


Downloading images:  69%|██████▊   | 270/393 [01:51<00:34,  3.56it/s]

[443] downloaded -> images/image443.png (source: 269.jpg)


Downloading images:  69%|██████▉   | 271/393 [01:51<00:33,  3.61it/s]

[444] downloaded -> images/image444.png (source: 270.jpg)


Downloading images:  69%|██████▉   | 272/393 [01:52<00:36,  3.33it/s]

[445] downloaded -> images/image445.png (source: 271.jpg)


Downloading images:  69%|██████▉   | 273/393 [01:52<00:42,  2.80it/s]

[446] downloaded -> images/image446.png (source: 272.jpg)


Downloading images:  70%|██████▉   | 274/393 [01:52<00:38,  3.12it/s]

[447] downloaded -> images/image447.png (source: 273.jpg)


Downloading images:  70%|██████▉   | 275/393 [01:53<00:36,  3.27it/s]

[448] downloaded -> images/image448.png (source: 274.jpg)


Downloading images:  70%|███████   | 276/393 [01:56<02:27,  1.26s/it]

[449] downloaded -> images/image449.png (source: 275.jpg)


Downloading images:  70%|███████   | 277/393 [01:57<01:58,  1.02s/it]

[450] downloaded -> images/image450.png (source: 276.jpg)


Downloading images:  71%|███████   | 278/393 [01:57<01:33,  1.23it/s]

[451] downloaded -> images/image451.png (source: 277.jpg)


Downloading images:  71%|███████   | 279/393 [01:57<01:16,  1.49it/s]

[452] downloaded -> images/image452.png (source: 278.jpg)


Downloading images:  71%|███████   | 280/393 [01:58<01:02,  1.80it/s]

[453] downloaded -> images/image453.png (source: 279.jpg)


Downloading images:  72%|███████▏  | 281/393 [01:58<00:53,  2.10it/s]

[454] downloaded -> images/image454.png (source: 280.jpg)


Downloading images:  72%|███████▏  | 282/393 [01:58<00:44,  2.51it/s]

[455] downloaded -> images/image455.png (source: 281.jpg)


Downloading images:  72%|███████▏  | 283/393 [01:59<00:52,  2.11it/s]

[456] downloaded -> images/image456.png (source: 282.jpg)


Downloading images:  72%|███████▏  | 284/393 [01:59<00:43,  2.49it/s]

[457] downloaded -> images/image457.png (source: 283.jpg)


Downloading images:  73%|███████▎  | 285/393 [01:59<00:39,  2.76it/s]

[458] downloaded -> images/image458.png (source: 284.jpg)


Downloading images:  73%|███████▎  | 286/393 [02:00<00:35,  3.04it/s]

[459] downloaded -> images/image459.png (source: 285.jpg)


Downloading images:  73%|███████▎  | 287/393 [02:00<00:36,  2.88it/s]

[460] downloaded -> images/image460.png (source: 286.jpg)


Downloading images:  73%|███████▎  | 288/393 [02:00<00:32,  3.20it/s]

[461] downloaded -> images/image461.png (source: 287.jpg)


Downloading images:  74%|███████▎  | 289/393 [02:00<00:30,  3.37it/s]

[462] downloaded -> images/image462.png (source: 288.jpg)


Downloading images:  74%|███████▍  | 290/393 [02:01<00:28,  3.64it/s]

[463] downloaded -> images/image463.png (source: 289.jpg)


Downloading images:  74%|███████▍  | 291/393 [02:02<00:49,  2.06it/s]

[464] downloaded -> images/image464.png (source: 290.jpg)


Downloading images:  74%|███████▍  | 292/393 [02:02<00:58,  1.72it/s]

[465] downloaded -> images/image465.png (source: 291.jpg)


Downloading images:  75%|███████▍  | 293/393 [02:03<00:48,  2.05it/s]

[466] downloaded -> images/image466.png (source: 292.jpg)


Downloading images:  75%|███████▍  | 294/393 [02:03<00:44,  2.24it/s]

[467] downloaded -> images/image467.png (source: 293.jpg)


Downloading images:  75%|███████▌  | 295/393 [02:04<00:44,  2.20it/s]

[468] downloaded -> images/image468.png (source: 294.jpg)


Downloading images:  75%|███████▌  | 296/393 [02:04<00:39,  2.43it/s]

[469] downloaded -> images/image469.png (source: 295.jpg)


Downloading images:  76%|███████▌  | 297/393 [02:04<00:39,  2.41it/s]

[470] downloaded -> images/image470.png (source: 296.jpg)


Downloading images:  76%|███████▌  | 298/393 [02:05<00:35,  2.64it/s]

[471] downloaded -> images/image471.png (source: 297.jpg)


Downloading images:  76%|███████▌  | 299/393 [02:05<00:38,  2.44it/s]

[472] downloaded -> images/image472.png (source: 298.jpg)


Downloading images:  76%|███████▋  | 300/393 [02:05<00:32,  2.88it/s]

[473] downloaded -> images/image473.png (source: 299.jpg)


Downloading images:  77%|███████▋  | 301/393 [02:06<00:29,  3.07it/s]

[474] downloaded -> images/image474.png (source: 300.jpg)


Downloading images:  77%|███████▋  | 303/393 [02:06<00:25,  3.52it/s]

[475] downloaded -> images/image475.png (source: 301.jpg)
[476] downloaded -> images/image476.png (source: 302.jpg)


Downloading images:  77%|███████▋  | 304/393 [02:06<00:22,  3.87it/s]

[477] downloaded -> images/image477.png (source: 303.jpg)


Downloading images:  78%|███████▊  | 305/393 [02:07<00:23,  3.67it/s]

[478] downloaded -> images/image478.png (source: 304.jpg)
[479] downloaded -> images/image479.png (source: 305.jpg)


Downloading images:  78%|███████▊  | 307/393 [02:07<00:20,  4.26it/s]

[480] downloaded -> images/image480.png (source: 306.jpg)


Downloading images:  78%|███████▊  | 308/393 [02:07<00:19,  4.34it/s]

[481] downloaded -> images/image481.png (source: 307.jpg)


Downloading images:  79%|███████▊  | 309/393 [02:07<00:19,  4.31it/s]

[482] downloaded -> images/image482.png (source: 308.jpg)


Downloading images:  79%|███████▉  | 310/393 [02:08<00:19,  4.21it/s]

[483] downloaded -> images/image483.png (source: 309.jpg)


Downloading images:  79%|███████▉  | 311/393 [02:08<00:19,  4.29it/s]

[484] downloaded -> images/image484.png (source: 310.jpg)


Downloading images:  80%|███████▉  | 313/393 [02:08<00:18,  4.27it/s]

[485] downloaded -> images/image485.png (source: 311.jpg)
[486] downloaded -> images/image486.png (source: 312.jpg)


Downloading images:  80%|███████▉  | 314/393 [02:09<00:18,  4.35it/s]

[487] downloaded -> images/image487.png (source: 313.jpg)


Downloading images:  80%|████████  | 316/393 [02:09<00:17,  4.51it/s]

[488] downloaded -> images/image488.png (source: 314.jpg)
[489] downloaded -> images/image489.png (source: 315.jpg)


Downloading images:  81%|████████  | 317/393 [02:09<00:17,  4.31it/s]

[490] downloaded -> images/image490.png (source: 316.jpg)


Downloading images:  81%|████████  | 318/393 [02:09<00:17,  4.28it/s]

[491] downloaded -> images/image491.png (source: 317.jpg)


Downloading images:  81%|████████▏ | 320/393 [02:10<00:17,  4.13it/s]

[492] downloaded -> images/image492.png (source: 318.jpg)
[493] downloaded -> images/image493.png (source: 319.jpg)


Downloading images:  82%|████████▏ | 322/393 [02:10<00:15,  4.67it/s]

[494] downloaded -> images/image494.png (source: 320.jpg)
[495] downloaded -> images/image495.png (source: 321.jpg)


Downloading images:  82%|████████▏ | 323/393 [02:11<00:18,  3.89it/s]

[496] downloaded -> images/image496.png (source: 322.jpg)


Downloading images:  82%|████████▏ | 324/393 [02:11<00:16,  4.13it/s]

[497] downloaded -> images/image497.png (source: 323.jpg)


Downloading images:  83%|████████▎ | 325/393 [02:11<00:18,  3.69it/s]

[498] downloaded -> images/image498.png (source: 324.jpg)


Downloading images:  83%|████████▎ | 326/393 [02:12<00:17,  3.74it/s]

[499] downloaded -> images/image499.png (source: 325.jpg)


Downloading images:  83%|████████▎ | 327/393 [02:12<00:21,  3.09it/s]

[500] downloaded -> images/image500.png (source: 326.jpg)


Downloading images:  83%|████████▎ | 328/393 [02:12<00:20,  3.13it/s]

[501] downloaded -> images/image501.png (source: 327.jpg)


Downloading images:  84%|████████▎ | 329/393 [02:13<00:18,  3.47it/s]

[502] downloaded -> images/image502.png (source: 328.jpg)


Downloading images:  84%|████████▍ | 330/393 [02:13<00:16,  3.78it/s]

[503] downloaded -> images/image503.png (source: 329.jpg)


Downloading images:  84%|████████▍ | 331/393 [02:13<00:16,  3.70it/s]

[504] downloaded -> images/image504.png (source: 330.jpg)


Downloading images:  84%|████████▍ | 332/393 [02:13<00:15,  3.90it/s]

[505] downloaded -> images/image505.png (source: 331.jpg)


Downloading images:  85%|████████▍ | 333/393 [02:14<00:21,  2.84it/s]

[506] downloaded -> images/image506.png (source: 332.jpg)


Downloading images:  85%|████████▍ | 334/393 [02:14<00:19,  3.05it/s]

[507] downloaded -> images/image507.png (source: 333.jpg)


Downloading images:  85%|████████▌ | 336/393 [02:15<00:15,  3.70it/s]

[508] downloaded -> images/image508.png (source: 334.jpg)
[509] downloaded -> images/image509.png (source: 335.jpg)


Downloading images:  86%|████████▌ | 338/393 [02:15<00:13,  4.09it/s]

[510] downloaded -> images/image510.png (source: 336.jpg)
[511] downloaded -> images/image511.png (source: 337.jpg)


Downloading images:  87%|████████▋ | 340/393 [02:15<00:11,  4.68it/s]

[512] downloaded -> images/image512.png (source: 338.jpg)
[513] downloaded -> images/image513.png (source: 339.jpg)


Downloading images:  87%|████████▋ | 341/393 [02:16<00:12,  4.24it/s]

[514] downloaded -> images/image514.png (source: 340.jpg)


Downloading images:  87%|████████▋ | 342/393 [02:16<00:18,  2.75it/s]

[515] downloaded -> images/image515.png (source: 341.jpg)


Downloading images:  87%|████████▋ | 343/393 [02:17<00:20,  2.40it/s]

[516] downloaded -> images/image516.png (source: 342.jpg)


Downloading images:  88%|████████▊ | 344/393 [02:17<00:19,  2.46it/s]

[517] downloaded -> images/image517.png (source: 343.jpg)


Downloading images:  88%|████████▊ | 345/393 [02:17<00:16,  2.86it/s]

[518] downloaded -> images/image518.png (source: 344.jpg)


Downloading images:  88%|████████▊ | 346/393 [02:18<00:15,  3.03it/s]

[519] downloaded -> images/image519.png (source: 345.jpg)


Downloading images:  88%|████████▊ | 347/393 [02:18<00:14,  3.28it/s]

[520] downloaded -> images/image520.png (source: 346.jpg)


Downloading images:  89%|████████▊ | 348/393 [02:18<00:13,  3.45it/s]

[521] downloaded -> images/image521.png (source: 347.jpg)


Downloading images:  89%|████████▉ | 350/393 [02:19<00:10,  3.95it/s]

[522] downloaded -> images/image522.png (source: 348.jpg)
[523] downloaded -> images/image523.png (source: 349.jpg)


Downloading images:  89%|████████▉ | 351/393 [02:19<00:12,  3.40it/s]

[524] downloaded -> images/image524.png (source: 350.jpg)


Downloading images:  90%|████████▉ | 352/393 [02:19<00:12,  3.35it/s]

[525] downloaded -> images/image525.png (source: 351.jpg)


Downloading images:  90%|████████▉ | 353/393 [02:20<00:13,  2.97it/s]

[526] downloaded -> images/image526.png (source: 352.jpg)


Downloading images:  90%|█████████ | 354/393 [02:20<00:13,  2.87it/s]

[527] downloaded -> images/image527.png (source: 353.jpg)


Downloading images:  90%|█████████ | 355/393 [02:20<00:11,  3.17it/s]

[528] downloaded -> images/image528.png (source: 354.jpg)


Downloading images:  91%|█████████ | 356/393 [02:21<00:11,  3.29it/s]

[529] downloaded -> images/image529.png (source: 355.jpg)


Downloading images:  91%|█████████ | 357/393 [02:21<00:14,  2.42it/s]

[530] downloaded -> images/image530.png (source: 356.jpg)


Downloading images:  91%|█████████ | 358/393 [02:22<00:12,  2.75it/s]

[531] downloaded -> images/image531.png (source: 357.jpg)


Downloading images:  92%|█████████▏| 360/393 [02:22<00:10,  3.21it/s]

[532] downloaded -> images/image532.png (source: 358.jpg)
[533] downloaded -> images/image533.png (source: 359.jpg)


Downloading images:  92%|█████████▏| 361/393 [02:22<00:09,  3.38it/s]

[534] downloaded -> images/image534.png (source: 360.jpg)


Downloading images:  92%|█████████▏| 362/393 [02:23<00:08,  3.71it/s]

[535] downloaded -> images/image535.png (source: 361.jpg)


Downloading images:  93%|█████████▎| 364/393 [02:23<00:06,  4.44it/s]

[536] downloaded -> images/image536.png (source: 362.jpg)
[537] downloaded -> images/image537.png (source: 363.jpg)


Downloading images:  93%|█████████▎| 365/393 [02:23<00:06,  4.23it/s]

[538] downloaded -> images/image538.png (source: 364.jpg)


Downloading images:  93%|█████████▎| 366/393 [02:24<00:06,  4.34it/s]

[539] downloaded -> images/image539.png (source: 365.jpg)


Downloading images:  94%|█████████▎| 368/393 [02:24<00:05,  4.51it/s]

[540] downloaded -> images/image540.png (source: 366.jpg)
[541] downloaded -> images/image541.png (source: 367.jpg)


Downloading images:  94%|█████████▍| 369/393 [02:24<00:05,  4.55it/s]

[542] downloaded -> images/image542.png (source: 368.jpg)


Downloading images:  94%|█████████▍| 371/393 [02:25<00:04,  4.77it/s]

[543] downloaded -> images/image543.png (source: 369.jpg)
[544] downloaded -> images/image544.png (source: 370.jpg)


Downloading images:  95%|█████████▍| 372/393 [02:25<00:04,  5.03it/s]

[545] downloaded -> images/image545.png (source: 371.jpg)


Downloading images:  95%|█████████▌| 374/393 [02:25<00:03,  5.06it/s]

[546] downloaded -> images/image546.png (source: 372.jpg)
[547] downloaded -> images/image547.png (source: 373.jpg)


Downloading images:  95%|█████████▌| 375/393 [02:25<00:03,  5.23it/s]

[548] downloaded -> images/image548.png (source: 374.jpg)


Downloading images:  96%|█████████▌| 377/393 [02:26<00:03,  4.88it/s]

[549] downloaded -> images/image549.png (source: 375.jpg)
[550] downloaded -> images/image550.png (source: 376.jpg)


Downloading images:  96%|█████████▋| 379/393 [02:26<00:02,  4.94it/s]

[551] downloaded -> images/image551.png (source: 377.jpg)
[552] downloaded -> images/image552.png (source: 378.jpg)


Downloading images:  97%|█████████▋| 380/393 [02:26<00:02,  5.01it/s]

[553] downloaded -> images/image553.png (source: 379.jpg)


Downloading images:  97%|█████████▋| 381/393 [02:27<00:02,  4.55it/s]

[554] downloaded -> images/image554.png (source: 380.jpg)


Downloading images:  97%|█████████▋| 382/393 [02:27<00:02,  4.38it/s]

[555] downloaded -> images/image555.png (source: 381.jpg)


Downloading images:  97%|█████████▋| 383/393 [02:27<00:02,  4.38it/s]

[556] downloaded -> images/image556.png (source: 382.jpg)


Downloading images:  98%|█████████▊| 384/393 [02:27<00:02,  4.46it/s]

[557] downloaded -> images/image557.png (source: 383.jpg)


Downloading images:  98%|█████████▊| 385/393 [02:28<00:01,  4.35it/s]

[558] downloaded -> images/image558.png (source: 384.jpg)


Downloading images:  98%|█████████▊| 386/393 [02:28<00:01,  4.15it/s]

[559] downloaded -> images/image559.png (source: 385.jpg)


Downloading images:  98%|█████████▊| 387/393 [02:28<00:01,  4.29it/s]

[560] downloaded -> images/image560.png (source: 386.jpg)


Downloading images:  99%|█████████▊| 388/393 [02:28<00:01,  3.81it/s]

[561] downloaded -> images/image561.png (source: 387.jpg)


Downloading images:  99%|█████████▉| 389/393 [02:29<00:01,  3.15it/s]

[562] downloaded -> images/image562.png (source: 388.jpg)


Downloading images:  99%|█████████▉| 390/393 [02:29<00:00,  3.47it/s]

[563] downloaded -> images/image563.png (source: 389.jpg)


Downloading images:  99%|█████████▉| 391/393 [02:29<00:00,  3.23it/s]

[564] downloaded -> images/image564.png (source: 390.jpg)


Downloading images: 100%|█████████▉| 392/393 [02:30<00:00,  3.18it/s]

[565] downloaded -> images/image565.png (source: 391.jpg)


Downloading images: 100%|██████████| 393/393 [02:30<00:00,  2.60it/s]

[566] downloaded -> images/image566.png (source: 392.jpg)

Finished.
ZIP file written to: /content/phys_unibench_images.zip
Images downloaded: 393 / 393
All images downloaded successfully (or there were no image fields).





In [None]:

import json
from pathlib import Path

# --- START CONFIG ---
start_id = 567            # id given to the first kept record; later records count up from here
input_filename = "input.json"   # default input file name (you can change)
output_filename = "output_phy.json"
# --- END CONFIG ---

# Look for the input file in the working directory; when it is missing, fall back
# to an interactive Colab upload and use the first uploaded file.
input_path = Path(input_filename)
if not input_path.exists():
    try:
        from google.colab import files
        print(f"'{input_filename}' not found. Please upload your JSON file now (choose the original file).")
        uploaded = files.upload()  # user will upload a file
        # pick the first uploaded filename
        input_path = Path(next(iter(uploaded)))
        print(f"Uploaded: {input_path}")
    except Exception as e:
        # Chain the original exception so the real cause stays visible in the traceback.
        raise FileNotFoundError(f"Could not locate '{input_filename}' and upload failed: {e}") from e

# Load JSON (supports a top-level list or a top-level dict)
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Normalize to a list of objects
if isinstance(data, list):
    items = data
elif isinstance(data, dict):
    # Try some common wrapper keys; if none holds a list, treat the dict as a single item.
    for key in ("items", "data", "questions", "rows"):
        if isinstance(data.get(key), list):
            items = data[key]
            break
    else:
        items = [data]
else:
    raise ValueError("Unsupported JSON structure: top-level must be list or dict")

# Build output: only keep objects with subject == 'phy' (case-insensitive)
out = []
cur_id = int(start_id)
skipped = 0  # entries that are not JSON objects and therefore have no 'subject' to test
for obj in items:
    if not isinstance(obj, dict):
        # A stray string/number/list in the array would crash on .get(); skip and report it instead.
        skipped += 1
        continue
    subj = str(obj.get("subject", "")).strip().lower()
    if subj != "phy":
        continue
    out.append({
        "id": cur_id,
        "image": None,
        "text": obj.get("question"),
        "type": obj.get("type"),
        "correct_answer": obj.get("gold"),
    })
    cur_id += 1

if skipped:
    print(f"Skipped {skipped} entries that are not JSON objects.")

# Save output JSON (pretty-printed, ensure_ascii=False to preserve unicode)
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)

if out:
    print(f"Saved {len(out)} filtered items to '{output_filename}' (ids {start_id}..{cur_id-1})")
else:
    # Avoid printing an impossible range such as "567..566" when nothing matched.
    print(f"Saved 0 filtered items to '{output_filename}' (no ids assigned)")

# In Colab, trigger download of the result file
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the file written above is the result.
    pass
else:
    try:
        files.download(output_filename)
    except Exception as e:
        # Best effort only; report the problem instead of hiding it.
        print(f"Automatic download of '{output_filename}' failed: {e}")


'input.json' not found. Please upload your JSON file now (choose the original file).


Saving dataset.json to dataset (2).json
Uploaded: dataset (2).json
Saved 123 filtered items to 'output_phy.json' (ids 567..689)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

import json
import os
import sys
import subprocess
import traceback

def ensure_package(pkg, import_name=None):
    """Make sure a package can be imported, installing it with pip when it cannot.

    Args:
        pkg: Distribution name passed to ``pip install`` (e.g. ``"huggingface-hub"``).
        import_name: Module name used for the import probe. Defaults to ``pkg``
            with ``-`` replaced by ``_``: a hyphenated distribution name is never
            a valid module name, so probing it verbatim would always fail and
            trigger a reinstall on every call.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
    """
    import importlib

    module_name = import_name or pkg.replace("-", "_")
    try:
        __import__(module_name)
    except Exception:
        print(f"Package '{pkg}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Let the running interpreter notice files that pip has just written.
        importlib.invalidate_caches()
        print(f"Installed '{pkg}'.")

def try_load_dataset():
    """Load the ``test_mini`` split of ``Cloudriver/PhyX``, trying three call styles in order.

    The first strategy that succeeds wins and its result is returned. Each
    strategy announces itself on stdout before running.

    Raises:
        RuntimeError: when every strategy fails; the message lists each
            strategy label together with the repr of the error it raised.
    """
    from datasets import load_dataset

    def _direct_split():
        return load_dataset("Cloudriver/PhyX", split="test_mini")

    def _default_config_split():
        return load_dataset("Cloudriver/PhyX", "default", split="test_mini")

    def _pick_from_all_splits():
        all_splits = load_dataset("Cloudriver/PhyX")
        if "test_mini" not in all_splits:
            raise RuntimeError(f"'test_mini' not found in dataset splits: {list(all_splits.keys())}")
        return all_splits["test_mini"]

    # (label used in the failure report, announcement printed before the try, loader)
    plan = (
        (
            "split=test_mini",
            "Attempt: load_dataset('Cloudriver/PhyX', split='test_mini')",
            _direct_split,
        ),
        (
            "config default + split",
            "Attempt: load_dataset('Cloudriver/PhyX', 'default', split='test_mini')",
            _default_config_split,
        ),
        (
            "load full dataset",
            "Attempt: load_dataset('Cloudriver/PhyX') and select ['test_mini']",
            _pick_from_all_splits,
        ),
    )

    failures = []
    for label, announcement, loader in plan:
        try:
            print(announcement)
            return loader()
        except Exception as exc:
            failures.append((label, exc))

    # Nothing worked: report every failure in a single error.
    report = ["Failed to load dataset. Attempts:"]
    report.extend(f"--- {label} error: {repr(exc)}" for label, exc in failures)
    raise RuntimeError("\n".join(report))

def _row_value(row, name):
    """Read ``name`` from a row via ``.get`` when available, else via attribute access."""
    if hasattr(row, "get"):
        return row.get(name)
    return getattr(row, name, None)


def build_json_records(dataset, start_id=690):
    """Convert dataset rows into the custom question-record format.

    Args:
        dataset: Iterable of rows. A row is read with ``.get(...)`` when it has
            that method, otherwise through attribute access.
        start_id: ``id`` of the first record; each following record gets the next integer.

    Returns:
        A list with one dict per row, holding the keys ``id``, ``image``
        (``images/image{id}.png``), ``text``, ``options``, ``type`` and
        ``correct_answer``.

    ``text`` is the stripped question and the stripped question description
    joined by a blank line. A part that is missing or whitespace-only is left
    out, so the text never starts or ends with stray blank lines.
    """
    try:
        rows = list(dataset)
    except Exception:
        # Same fallback as before: retry with a plain iteration.
        rows = [row for row in dataset]

    records = []
    for record_id, row in enumerate(rows, start=start_id):
        question = (_row_value(row, "question") or "").strip()
        description = (_row_value(row, "question_description") or "").strip()
        text = "\n\n".join(part for part in (question, description) if part)

        answer = _row_value(row, "answer")
        # A list/tuple answer is reduced to its first element (None when empty).
        if isinstance(answer, (list, tuple)):
            answer = answer[0] if answer else None

        records.append({
            "id": record_id,
            "image": f"images/image{record_id}.png",
            "text": text,
            "options": _row_value(row, "options") or [],
            "type": "MCQs with One Correct Answer",
            "correct_answer": answer,
        })
    return records

def main(output_path="phyx_test_mini_custom.json", start_id=690):
    """Export the PhyX ``test_mini`` split as a JSON file of custom records.

    Steps: make sure the required packages are importable, load the split with
    ``try_load_dataset()``, convert it with ``build_json_records()``, write the
    records to ``output_path`` and print a preview of the first record.

    Args:
        output_path: Destination JSON file.
        start_id: ``id`` assigned to the first record.

    Returns:
        None. Package or dataset-loading failures are printed together with
        their traceback and the function returns early; when loading fails an
        empty JSON array is still written to ``output_path``.
    """
    try:
        ensure_package("datasets")
        # Use the importable module name here: the distribution name
        # "huggingface-hub" can never be imported, so the probe failed and pip
        # ran on every execution. pip treats "huggingface_hub" as the same project.
        ensure_package("huggingface_hub")
        # import after ensure
        from datasets import __version__ as dver
        print(f"datasets version: {dver}")
    except Exception as e:
        print("Failed to ensure datasets/huggingface-hub packages:", e)
        traceback.print_exc()
        return

    try:
        print("Loading dataset Cloudriver/PhyX (test_mini)...")
        ds = try_load_dataset()
    except Exception:
        print("ERROR while loading dataset:")
        traceback.print_exc()
        # write an empty file and return to indicate failure but produce output
        abs_path = os.path.abspath(output_path)
        print(f"Writing empty JSON [] to {abs_path} to avoid silent failure.")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([], f, ensure_ascii=False, indent=2)
        return

    # convert to list and print some metadata
    try:
        ds_list = list(ds)
        n = len(ds_list)
        print(f"Successfully loaded dataset split. Number of examples: {n}")
    except Exception:
        print("Could not convert dataset to list; attempting to iterate.")
        traceback.print_exc()
        ds_list = [r for r in ds]
        n = len(ds_list)
        print(f"Number of examples after iteration: {n}")

    print("Building JSON records...")
    records = build_json_records(ds_list, start_id=start_id)

    # write file
    abs_path = os.path.abspath(output_path)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

    print(f"Wrote {len(records)} records to {abs_path}")
    if records:
        print("Preview of first record:")
        print(json.dumps(records[0], ensure_ascii=False, indent=2))
    else:
        print("No records produced. Check dataset loading above.")

# Run the export with its default arguments when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Package 'huggingface-hub' not found. Installing...
Installed 'huggingface-hub'.
datasets version: 4.0.0
Loading dataset Cloudriver/PhyX (test_mini)...
Attempt: load_dataset('Cloudriver/PhyX', split='test_mini')
Successfully loaded dataset split. Number of examples: 1000
Building JSON records...
Wrote 1000 records to /content/phyx_test_mini_custom.json
Preview of first record:
{
  "id": 690,
  "image": "images/image690.png",
  "text": "Determine the angles\\( \\theta' \\).\n\nFigure shows a refracted light beam in linseed oil making an angle of \\( \\phi = 20.0^{\\circ} \\) with the normal line \\( NN' \\). The index of refraction of linseed oil is 1.48.",
  "options": [
    "A: \\( 28.5^{\\circ} \\)",
    "B: \\(  30.4^{\\circ} \\)",
    "C: \\( 22.3^{\\circ} \\)",
    "D: \\( 31.1^{\\circ} \\)"
  ],
  "type": "MCQs with One Correct Answer",
  "correct_answer": "C"
}


In [None]:


import io
import json
import os
import sys
import subprocess
import zipfile
import traceback

# ---------- helper to ensure packages ----------
def ensure_package(pkg_name, import_name=None):
    """Make sure a package can be imported, installing it with pip when it cannot.

    Args:
        pkg_name: Distribution name passed to ``pip install``.
        import_name: Module name used for the import probe, for packages whose
            module name differs from the distribution name (e.g. ``Pillow`` is
            imported as ``PIL``). Defaults to ``pkg_name`` with ``-`` replaced
            by ``_``, since a hyphenated name can never be imported as-is.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
    """
    import importlib

    module_name = import_name or pkg_name.replace("-", "_")
    try:
        __import__(module_name)
    except Exception:
        print(f"Package '{pkg_name}' not found — installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_name])
        # Let the running interpreter notice files that pip has just written,
        # so the imports that follow can find the new package.
        importlib.invalidate_caches()
        print(f"Installed '{pkg_name}'.")

# ensure required packages
# Each entry is (pip distribution name, importable module name). The two differ
# for Pillow, which is imported as ``PIL``; probing ``Pillow`` itself always
# fails and would reinstall the package on every run.
for pkg, import_name in (
    ("datasets", "datasets"),
    ("Pillow", "PIL"),
    ("requests", "requests"),
    ("tqdm", "tqdm"),
):
    ensure_package(pkg, import_name)

from datasets import load_dataset
from PIL import Image
import requests
from tqdm import tqdm

# ---------- configuration ----------
# NOTE(review): the code that consumes these constants lies further down the
# cell; the descriptions below are inferred from the names and inline comments.
START_ID = 690  # presumably the id given to the first exported record — TODO confirm against usage
OUTPUT_ZIP = "phyx_images_and_data.zip"  # presumably the archive this cell produces — TODO confirm
OUTPUT_JSON_NAME = "phyx_test_mini_custom.json"  # presumably the records JSON file name — TODO confirm
IMAGES_DIR_IN_ZIP = "images"  # images stored under images/image{ID}.png
PLACEHOLDER_SIZE = (1, 1)  # 1x1 PNG placeholder for missing images
REQUESTS_TIMEOUT = 15  # seconds

# ---------- utility functions ----------
def load_phyx_test_mini():
    """Return the ``test_mini`` split of ``Cloudriver/PhyX``.

    Three loading strategies are tried in order and the first success is
    returned. When all of them fail, a RuntimeError is raised whose message
    lists every strategy label with the repr of its error.
    """
    def _select_from_full():
        splits = load_dataset("Cloudriver/PhyX")
        if "test_mini" not in splits:
            raise RuntimeError(f"'test_mini' not found in splits: {list(splits.keys())}")
        return splits["test_mini"]

    # (failure label, progress message, zero-argument loader)
    strategies = [
        (
            "split=test_mini",
            "Attempting: load_dataset('Cloudriver/PhyX', split='test_mini') ...",
            lambda: load_dataset("Cloudriver/PhyX", split="test_mini"),
        ),
        (
            "config default + split",
            "Attempting: load_dataset('Cloudriver/PhyX', 'default', split='test_mini') ...",
            lambda: load_dataset("Cloudriver/PhyX", "default", split="test_mini"),
        ),
        (
            "load full dataset",
            "Attempting: load_dataset('Cloudriver/PhyX') then select ['test_mini'] ...",
            _select_from_full,
        ),
    ]

    errors = []
    for label, progress, loader in strategies:
        try:
            print(progress)
            return loader()
        except Exception as exc:
            errors.append((label, exc))

    # raise combined error
    details = "".join(f" - {label}: {repr(exc)}\n" for label, exc in errors)
    raise RuntimeError("Failed to load dataset. Attempts:\n" + details)

def build_text(item):
    """Combine an example's question and question description into one text.

    Both fields are read with ``item.get`` and stripped. Non-empty parts are
    joined by a blank line; a part that is missing or whitespace-only is left
    out, so the result never starts or ends with stray blank lines.

    Args:
        item: Mapping that may hold ``"question"`` and ``"question_description"``.

    Returns:
        The combined text, or ``""`` when both parts are empty.
    """
    parts = (
        (item.get(key) or "").strip()
        for key in ("question", "question_description")
    )
    return "\n\n".join(part for part in parts if part)

def normalize_answer(ans):
    """Reduce a list/tuple answer to its first element; pass anything else through.

    An empty list or tuple yields ``None``.
    """
    if not isinstance(ans, (list, tuple)):
        return ans
    if not ans:
        return None
    return ans[0]

def _classify_image_value(value):
    """Classify one non-dict candidate; return ``(type, value)`` or None if unrecognised."""
    if isinstance(value, Image.Image):
        return ("pil", value)
    if isinstance(value, (bytes, bytearray)):
        return ("bytes", bytes(value))
    if isinstance(value, str):
        # Only real http(s) URLs count as remote; anything else is a file path.
        if value.startswith(("http://", "https://")):
            return ("url", value)
        return ("local_path", value)
    return None


def _classify_image_dict(mapping):
    """Classify a dict-shaped image value; return ``(type, value)`` or None if nothing usable."""
    # Embedded bytes are already in hand, so prefer them over a path that may
    # be just a bare file name that does not exist on this machine.
    payload = mapping.get("bytes")
    if payload:
        return ("bytes", payload)
    for key in ("url", "image_url", "src", "path", "file", "filename"):
        candidate = mapping.get(key)
        if candidate and isinstance(candidate, str):
            return _classify_image_value(candidate)
    return None


def find_image_source(item):
    """Locate an image inside a dataset example.

    The search order is: the "image" field; a fixed list of common alternative
    keys; then any key whose name contains "image" or "img".

    Args:
        item: dict-like example (must support ``get``, ``in``, indexing and
            ``items``).

    Returns:
        A ``(type, value)`` tuple where type is one of 'pil', 'bytes', 'url',
        'local_path', 'unknown_dict' (the "image" field is a dict with no
        usable bytes/url/path entry; value is that dict) or 'none' (value is
        None).
    """
    # direct 'image' field
    image_field = item.get("image")
    if image_field is not None:
        if isinstance(image_field, dict):
            return _classify_image_dict(image_field) or ("unknown_dict", image_field)
        found = _classify_image_value(image_field)
        if found is not None:
            return found

    # try other common keys (empty/falsy values are skipped)
    for key in ("image_url", "img_url", "img", "image_path", "image_filename", "url"):
        if key in item and item[key]:
            found = _classify_image_value(item[key])
            if found is not None:
                return found

    # search for any field name containing 'image' or 'img'
    for key, val in item.items():
        if key and ("image" in key.lower() or "img" in key.lower()):
            if isinstance(val, dict):
                found = _classify_image_dict(val)
            else:
                found = _classify_image_value(val)
            if found is not None:
                return found

    # no image found
    return ("none", None)

def _encode_as_png(img):
    """Serialise a PIL image to PNG bytes, converting to RGBA if its mode is not PNG-writable."""
    buf = io.BytesIO()
    try:
        img.save(buf, format="PNG")
    except OSError:
        # Pillow raises OSError ("cannot write mode ... as PNG") for modes such
        # as CMYK. RGBA is always writable, so convert and retry with a fresh
        # buffer instead of losing the image.
        buf = io.BytesIO()
        img.convert("RGBA").save(buf, format="PNG")
    return buf.getvalue()


def _png_or_raw(content):
    """Re-encode raw image bytes as PNG; hand them back unchanged if they cannot be decoded."""
    try:
        return _encode_as_png(Image.open(io.BytesIO(content)))
    except Exception:
        # Best effort: undecodable content is passed through as-is.
        return bytes(content)


def get_image_bytes_from_source(src_type, src_value):
    """Turn an image source description into bytes suitable for writing to the ZIP.

    Args:
        src_type: one of 'pil', 'bytes', 'local_path', 'url', 'unknown_dict'
            or 'none', as produced by ``find_image_source``.
        src_value: the PIL image, raw bytes, path, URL or dict matching
            ``src_type``.

    Returns:
        PNG-encoded bytes whenever the content can be decoded by PIL. For
        'bytes', 'local_path' and 'url' sources that PIL cannot decode, the
        raw content is returned unchanged (so it is not guaranteed to be PNG).

    Raises:
        RuntimeError: if a local file cannot be read, a download fails, the
            dict has no usable url/path, no source is available, or
            ``src_type`` is not recognised.
    """
    if src_type == "pil":
        return _encode_as_png(src_value)

    if src_type == "bytes":
        return _png_or_raw(src_value)

    if src_type == "local_path":
        path = src_value
        if not os.path.isabs(path):
            # relative paths are tried as given, after expanding a leading "~"
            path = os.path.expanduser(path)
        try:
            with open(path, "rb") as f:
                content = f.read()
        except Exception as e:
            raise RuntimeError(f"Could not read local image path '{src_value}': {e}") from e
        return _png_or_raw(content)

    if src_type == "url":
        url = src_value
        try:
            # The whole body is read immediately, so streaming is not requested.
            resp = requests.get(url, timeout=REQUESTS_TIMEOUT)
            resp.raise_for_status()
            content = resp.content
        except Exception as e:
            raise RuntimeError(f"Failed to download image from {url}: {e}") from e
        return _png_or_raw(content)

    if src_type == "unknown_dict":
        # last attempt: look for a string url/path entry inside the dict
        if isinstance(src_value, dict):
            for k in ("url", "image_url", "path"):
                v = src_value.get(k)
                if isinstance(v, str):
                    kind = "url" if v.startswith(("http://", "https://")) else "local_path"
                    return get_image_bytes_from_source(kind, v)
        raise RuntimeError("Unknown dict image format and no url/path found.")

    if src_type == "none":
        raise RuntimeError("No image source available.")

    # fallback
    raise RuntimeError(f"Unhandled src_type: {src_type}")

def make_placeholder_png_bytes(size=(1,1)):
    """Return PNG bytes of a fully transparent RGBA image with the given (width, height)."""
    buffer = io.BytesIO()
    Image.new("RGBA", size, (0, 0, 0, 0)).save(buffer, format="PNG")
    return buffer.getvalue()

# ---------- main logic ----------
def main():
    """Export the PhyX ``test_mini`` split as a ZIP of PNG images plus a JSON index.

    Each example receives a sequential id starting at START_ID and produces
    exactly one JSON record and exactly one ``{IMAGES_DIR_IN_ZIP}/image{id}.png``
    member in OUTPUT_ZIP. If the image cannot be obtained a placeholder PNG is
    stored; if the example itself cannot be processed an empty placeholder
    record is stored, so ids stay contiguous. The JSON index is written into
    the archive last, under OUTPUT_JSON_NAME.

    Calls ``sys.exit(1)`` if the dataset cannot be loaded.
    """
    try:
        ds = load_phyx_test_mini()
    except Exception as e:
        print("ERROR: could not load dataset:", e)
        traceback.print_exc()
        sys.exit(1)

    # materialise the split so its length is known up front
    examples = list(ds)
    n = len(examples)
    print(f"Loaded dataset split with {n} examples.")

    records = []
    current_id = START_ID

    # the archive is written straight to disk at OUTPUT_ZIP
    with zipfile.ZipFile(OUTPUT_ZIP, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, item in enumerate(tqdm(examples, desc="Processing examples", unit="ex")):
            img_path_in_zip = f"{IMAGES_DIR_IN_ZIP}/image{current_id}.png"

            # Start from the empty placeholder record. It is appended exactly
            # once below, so a failure part-way through an example can no
            # longer leave two records sharing the same id.
            record = {
                "id": current_id,
                "image": img_path_in_zip,
                "text": "",
                "options": [],
                "type": "MCQs with One Correct Answer",
                "correct_answer": None
            }
            img_bytes = None

            try:
                # Build all fields first so the record is either fully
                # populated or left as the placeholder.
                fields = {
                    "text": build_text(item),
                    "options": item.get("options") or [],
                    "correct_answer": normalize_answer(item.get("answer")),
                }
                record.update(fields)

                src_type, src_value = find_image_source(item)
                try:
                    img_bytes = get_image_bytes_from_source(src_type, src_value)
                except Exception as e_img:
                    # log and fall back to the placeholder image below
                    print(f"Warning: could not get image for example index {idx} (id={current_id}): {e_img}")
            except Exception as e:
                print(f"ERROR processing example index {idx}: {e}")
                traceback.print_exc()

            if img_bytes is None:
                img_bytes = make_placeholder_png_bytes(PLACEHOLDER_SIZE)

            records.append(record)
            zf.writestr(img_path_in_zip, img_bytes)
            current_id += 1

        # After images are written, write the JSON file into the zip
        json_bytes = json.dumps(records, ensure_ascii=False, indent=2).encode("utf-8")
        zf.writestr(OUTPUT_JSON_NAME, json_bytes)

    # done
    abs_zip = os.path.abspath(OUTPUT_ZIP)
    print(f"Finished. Created ZIP: {abs_zip}")
    print(f" - {len(records)} records written (IDs {START_ID}..{current_id-1})")
    print(f" - JSON file inside ZIP: {OUTPUT_JSON_NAME}")
    print(f" - Images inside ZIP at: {IMAGES_DIR_IN_ZIP}/image{{ID}}.png")

if __name__ == "__main__":
    main()


Package 'Pillow' not found — installing...
Installed 'Pillow'.
Attempting: load_dataset('Cloudriver/PhyX', split='test_mini') ...
Loaded dataset split with 1000 examples.


Processing examples: 100%|██████████| 1000/1000 [00:28<00:00, 35.28ex/s]

Finished. Created ZIP: /content/phyx_images_and_data.zip
 - 1000 records written (IDs 690..1689)
 - JSON file inside ZIP: phyx_test_mini_custom.json
 - Images inside ZIP at: images/image{ID}.png





In [None]:

import json
import os
import sys
import subprocess
import traceback

def ensure_package(pkg, import_name=None):
    """Make sure a package is importable, pip-installing it into this interpreter if not.

    Args:
        pkg: Distribution name passed to ``pip install``.
        import_name: Module name to try importing. Defaults to ``pkg`` with
            hyphens replaced by underscores, because a distribution name such
            as "huggingface-hub" can never be imported verbatim (which made
            the check fail, and pip run, on every call).

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    module_name = import_name or pkg.replace("-", "_")
    try:
        importlib.import_module(module_name)
    except Exception:
        print(f"Package '{pkg}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])
        # Let the import system notice the files pip just wrote, so imports
        # later in this same process can find the new package.
        importlib.invalidate_caches()
        print(f"Installed '{pkg}'.")

def try_load_dataset():
    """Load the ``test`` split of Cloudriver/PhyX, trying three strategies in turn.

    ``datasets`` is imported here rather than at module level. The strategies
    are: request the split directly, request it with the explicit ``'default'``
    config, and finally load every split and pick ``test`` from the result.

    Raises:
        RuntimeError: if all strategies fail; the message lists each attempt
            with the repr of the exception it produced.
    """
    from datasets import load_dataset

    def _pick_from_all_splits():
        all_splits = load_dataset("Cloudriver/PhyX")
        if "test" not in all_splits:
            raise RuntimeError(f"'test' not found in dataset splits: {list(all_splits.keys())}")
        return all_splits["test"]

    # (label used in the error report, progress message, zero-argument loader)
    strategies = (
        (
            "split=test",
            "Attempt: load_dataset('Cloudriver/PhyX', split='test')",
            lambda: load_dataset("Cloudriver/PhyX", split="test"),
        ),
        (
            "config default + split",
            "Attempt: load_dataset('Cloudriver/PhyX', 'default', split='test')",
            lambda: load_dataset("Cloudriver/PhyX", "default", split="test"),
        ),
        (
            "load full dataset",
            "Attempt: load_dataset('Cloudriver/PhyX') and select ['test']",
            _pick_from_all_splits,
        ),
    )

    failures = []
    for label, announcement, loader in strategies:
        try:
            print(announcement)
            return loader()
        except Exception as exc:
            failures.append((label, exc))

    # Nothing worked: raise one error describing every attempt.
    report = ["Failed to load dataset. Attempts:"]
    report.extend(f"--- {label} error: {repr(exc)}" for label, exc in failures)
    raise RuntimeError("\n".join(report))

def build_json_records(dataset, start_id=1690):
    """Convert dataset examples into the custom MCQ record format.

    Args:
        dataset: Iterable of examples. Each example is read with ``.get`` when
            it has that method, otherwise by attribute access. The fields used
            are "question", "question_description", "options" and "answer".
        start_id: id given to the first record; later records count up by one.

    Returns:
        A list of dicts with keys "id", "image" (``images/image{id}.png``),
        "text", "options", "type" and "correct_answer". A list/tuple answer is
        reduced to its first element (None when empty).
    """
    def _field(example, name):
        # dict-like rows expose .get; anything else is read by attribute
        if hasattr(example, "get"):
            return example.get(name)
        return getattr(example, name, None)

    out = []
    for offset, item in enumerate(dataset):
        cur = start_id + offset

        # Strip before testing for emptiness so a whitespace-only question or
        # description does not leave a dangling blank-line separator.
        parts = [(_field(item, key) or "").strip() for key in ("question", "question_description")]
        text = "\n\n".join(part for part in parts if part)

        answer = _field(item, "answer")
        if isinstance(answer, (list, tuple)):
            answer = answer[0] if answer else None

        out.append({
            "id": cur,
            "image": f"images/image{cur}.png",
            "text": text,
            "options": _field(item, "options") or [],
            "type": "MCQs with One Correct Answer",
            "correct_answer": answer,
        })
    return out

def main(output_path="phyx_test_custom.json", start_id=1690):
    """Load the PhyX ``test`` split and write it to ``output_path`` as custom JSON records.

    Steps: make sure ``datasets`` / ``huggingface-hub`` are available, load the
    split, convert every example with ``build_json_records`` (ids start at
    ``start_id``), write the JSON file and print a preview of the first record.

    Failure handling (the function always returns None):
      * if the packages cannot be ensured, the error is printed and nothing is
        written;
      * if the dataset cannot be loaded, the traceback is printed and an empty
        JSON list is written to ``output_path``.
    """
    def _write_json(payload):
        with open(output_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    # --- dependencies ---
    try:
        for dependency in ("datasets", "huggingface-hub"):
            ensure_package(dependency)
        # imported only after the packages have been ensured
        from datasets import __version__ as dver
        print(f"datasets version: {dver}")
    except Exception as dep_error:
        print("Failed to ensure datasets/huggingface-hub packages:", dep_error)
        traceback.print_exc()
        return

    # --- load the split ---
    try:
        print("Loading dataset Cloudriver/PhyX (test)...")
        split = try_load_dataset()
    except Exception:
        print("ERROR while loading dataset:")
        traceback.print_exc()
        # still produce an (empty) output file so the failure is visible on disk
        print(f"Writing empty JSON [] to {os.path.abspath(output_path)} to avoid silent failure.")
        _write_json([])
        return

    # --- materialise the examples ---
    try:
        rows = list(split)
        print(f"Successfully loaded dataset split. Number of examples: {len(rows)}")
    except Exception:
        print("Could not convert dataset to list; attempting to iterate.")
        traceback.print_exc()
        rows = [row for row in split]
        print(f"Number of examples after iteration: {len(rows)}")

    # --- convert and write ---
    print("Building JSON records...")
    records = build_json_records(rows, start_id=start_id)

    destination = os.path.abspath(output_path)
    _write_json(records)
    print(f"Wrote {len(records)} records to {destination}")

    if not records:
        print("No records produced. Check dataset loading above.")
        return
    print("Preview of first record:")
    print(json.dumps(records[0], ensure_ascii=False, indent=2))

if __name__ == "__main__":
    main()


Package 'huggingface-hub' not found. Installing...
Installed 'huggingface-hub'.
datasets version: 4.0.0
Loading dataset Cloudriver/PhyX (test)...
Attempt: load_dataset('Cloudriver/PhyX', split='test')
Successfully loaded dataset split. Number of examples: 3000
Building JSON records...
Wrote 3000 records to /content/phyx_test_custom.json
Preview of first record:
{
  "id": 1690,
  "image": "images/image1690.png",
  "text": "How large should these pulls be?\n\nA patient with a dislocated shoulder is put into a traction apparatus as shown in figure. The pulls $\\vec{A}$ and $\\vec{B}have equal magnitudes and must combine to produce an outward traction force of 12.8 N on the patient’s arm.",
  "options": [
    "A: 7.55N",
    "B: 5.55N",
    "C: 7.65N",
    "D: 6.65N"
  ],
  "type": "MCQs with One Correct Answer",
  "correct_answer": "A"
}


In [None]:


import io
import json
import os
import sys
import subprocess
import zipfile
import traceback

# ---------- helper to ensure packages ----------
def ensure_package(pkg_name, import_name=None):
    """Make sure a package is importable, pip-installing it into this interpreter if not.

    Args:
        pkg_name: Distribution name passed to ``pip install``.
        import_name: Module name to try importing when it differs from the
            distribution name (e.g. "PIL" for "Pillow"). Defaults to
            ``pkg_name`` with hyphens replaced by underscores, since a
            hyphenated name can never be imported verbatim.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits with a
            non-zero status.
    """
    import importlib

    module_name = import_name or pkg_name.replace("-", "_")
    try:
        importlib.import_module(module_name)
    except Exception:
        print(f"Package '{pkg_name}' not found — installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg_name])
        # Let the import system notice the files pip just wrote, so the
        # imports that follow in this same process can find the new package.
        importlib.invalidate_caches()
        print(f"Installed '{pkg_name}'.")

# ensure required packages; each entry is (pip distribution name, importable
# module name) because the two differ for Pillow, which is imported as "PIL".
# Checking "Pillow" as a module name always failed and re-ran pip every time.
for pkg, import_name in (
    ("datasets", "datasets"),
    ("Pillow", "PIL"),
    ("requests", "requests"),
    ("tqdm", "tqdm"),
):
    ensure_package(pkg, import_name)

from datasets import load_dataset
from PIL import Image
import requests
from tqdm import tqdm

# ---------- configuration ----------
START_ID = 1690  # id given to the first example; main() counts up by one per example
# NOTE(review): the earlier test_mini cell in this notebook writes to the same
# OUTPUT_ZIP name; because main() opens the archive with mode="w", running
# this cell replaces that earlier archive. Copy or rename it first if needed.
OUTPUT_ZIP = "phyx_images_and_data.zip"
OUTPUT_JSON_NAME = "phyx_test_custom.json"  # name of the records JSON stored inside the ZIP
IMAGES_DIR_IN_ZIP = "images"  # images stored under images/image{ID}.png
PLACEHOLDER_SIZE = (1, 1)  # 1x1 PNG placeholder for missing images
REQUESTS_TIMEOUT = 15  # seconds; passed as timeout= when downloading URL images

# ---------- utility functions ----------
def load_phyx_test():
    """Load the ``test`` split of Cloudriver/PhyX, trying three strategies in turn.

    The strategies are: request the split directly, request it with the
    explicit ``'default'`` config, and finally load every split and pick
    ``test`` out of the result. The first strategy that succeeds wins.

    Raises:
        RuntimeError: if all strategies fail; the message lists each attempt
            together with the repr of the exception it produced.
    """
    def _pick_from_all_splits():
        all_splits = load_dataset("Cloudriver/PhyX")
        if "test" not in all_splits:
            raise RuntimeError(f"'test' not found in splits: {list(all_splits.keys())}")
        return all_splits["test"]

    # (label used in the error report, progress message, zero-argument loader)
    strategies = (
        (
            "split=test",
            "Attempting: load_dataset('Cloudriver/PhyX', split='test') ...",
            lambda: load_dataset("Cloudriver/PhyX", split="test"),
        ),
        (
            "config default + split",
            "Attempting: load_dataset('Cloudriver/PhyX', 'default', split='test') ...",
            lambda: load_dataset("Cloudriver/PhyX", "default", split="test"),
        ),
        (
            "load full dataset",
            "Attempting: load_dataset('Cloudriver/PhyX') then select ['test'] ...",
            _pick_from_all_splits,
        ),
    )

    failures = []
    for label, announcement, loader in strategies:
        try:
            print(announcement)
            return loader()
        except Exception as exc:
            failures.append((label, exc))

    # Every strategy failed: report them all in one error.
    report = "".join(f" - {label}: {repr(exc)}\n" for label, exc in failures)
    raise RuntimeError("Failed to load dataset. Attempts:\n" + report)

def build_text(item):
    """Combine an example's question and description into one prompt string.

    Args:
        item: Mapping-like example. The "question" and "question_description"
            keys are read with ``item.get``; a missing, None or empty value is
            treated as absent.

    Returns:
        The stripped question and description joined by a blank line, or just
        whichever of the two is non-empty, or "" when both are absent.
    """
    parts = [(item.get(key) or "").strip() for key in ("question", "question_description")]
    # Strip *before* testing for emptiness so that a whitespace-only field does
    # not leave a dangling blank-line separator in the result.
    return "\n\n".join(part for part in parts if part)

def normalize_answer(ans):
    """Reduce a list/tuple answer to its first element.

    A list or tuple yields its first item (None when it is empty); any other
    value is returned unchanged.
    """
    is_sequence = isinstance(ans, (list, tuple))
    if is_sequence and ans:
        return ans[0]
    return None if is_sequence else ans

def _classify_image_value(value):
    """Classify one non-dict candidate; return ``(type, value)`` or None if unrecognised."""
    if isinstance(value, Image.Image):
        return ("pil", value)
    if isinstance(value, (bytes, bytearray)):
        return ("bytes", bytes(value))
    if isinstance(value, str):
        # Only real http(s) URLs count as remote; anything else is a file path.
        if value.startswith(("http://", "https://")):
            return ("url", value)
        return ("local_path", value)
    return None


def _classify_image_dict(mapping):
    """Classify a dict-shaped image value; return ``(type, value)`` or None if nothing usable."""
    # Embedded bytes are already in hand, so prefer them over a path that may
    # be just a bare file name that does not exist on this machine.
    payload = mapping.get("bytes")
    if payload:
        return ("bytes", payload)
    for key in ("url", "image_url", "src", "path", "file", "filename"):
        candidate = mapping.get(key)
        if candidate and isinstance(candidate, str):
            return _classify_image_value(candidate)
    return None


def find_image_source(item):
    """Locate an image inside a dataset example.

    The search order is: the "image" field; a fixed list of common alternative
    keys; then any key whose name contains "image" or "img".

    Args:
        item: dict-like example (must support ``get``, ``in``, indexing and
            ``items``).

    Returns:
        A ``(type, value)`` tuple where type is one of 'pil', 'bytes', 'url',
        'local_path', 'unknown_dict' (the "image" field is a dict with no
        usable bytes/url/path entry; value is that dict) or 'none' (value is
        None).
    """
    # direct 'image' field
    image_field = item.get("image")
    if image_field is not None:
        if isinstance(image_field, dict):
            return _classify_image_dict(image_field) or ("unknown_dict", image_field)
        found = _classify_image_value(image_field)
        if found is not None:
            return found

    # try other common keys (empty/falsy values are skipped)
    for key in ("image_url", "img_url", "img", "image_path", "image_filename", "url"):
        if key in item and item[key]:
            found = _classify_image_value(item[key])
            if found is not None:
                return found

    # search for any field name containing 'image' or 'img'
    for key, val in item.items():
        if key and ("image" in key.lower() or "img" in key.lower()):
            if isinstance(val, dict):
                found = _classify_image_dict(val)
            else:
                found = _classify_image_value(val)
            if found is not None:
                return found

    # no image found
    return ("none", None)

def _encode_as_png(img):
    """Serialise a PIL image to PNG bytes, converting to RGBA if its mode is not PNG-writable."""
    buf = io.BytesIO()
    try:
        img.save(buf, format="PNG")
    except OSError:
        # Pillow raises OSError ("cannot write mode ... as PNG") for modes such
        # as CMYK. RGBA is always writable, so convert and retry with a fresh
        # buffer instead of losing the image.
        buf = io.BytesIO()
        img.convert("RGBA").save(buf, format="PNG")
    return buf.getvalue()


def _png_or_raw(content):
    """Re-encode raw image bytes as PNG; hand them back unchanged if they cannot be decoded."""
    try:
        return _encode_as_png(Image.open(io.BytesIO(content)))
    except Exception:
        # Best effort: undecodable content is passed through as-is.
        return bytes(content)


def get_image_bytes_from_source(src_type, src_value):
    """Turn an image source description into bytes suitable for writing to the ZIP.

    Args:
        src_type: one of 'pil', 'bytes', 'local_path', 'url', 'unknown_dict'
            or 'none', as produced by ``find_image_source``.
        src_value: the PIL image, raw bytes, path, URL or dict matching
            ``src_type``.

    Returns:
        PNG-encoded bytes whenever the content can be decoded by PIL. For
        'bytes', 'local_path' and 'url' sources that PIL cannot decode, the
        raw content is returned unchanged (so it is not guaranteed to be PNG).

    Raises:
        RuntimeError: if a local file cannot be read, a download fails, the
            dict has no usable url/path, no source is available, or
            ``src_type`` is not recognised.
    """
    if src_type == "pil":
        return _encode_as_png(src_value)

    if src_type == "bytes":
        return _png_or_raw(src_value)

    if src_type == "local_path":
        path = src_value
        if not os.path.isabs(path):
            # relative paths are tried as given, after expanding a leading "~"
            path = os.path.expanduser(path)
        try:
            with open(path, "rb") as f:
                content = f.read()
        except Exception as e:
            raise RuntimeError(f"Could not read local image path '{src_value}': {e}") from e
        return _png_or_raw(content)

    if src_type == "url":
        url = src_value
        try:
            # The whole body is read immediately, so streaming is not requested.
            resp = requests.get(url, timeout=REQUESTS_TIMEOUT)
            resp.raise_for_status()
            content = resp.content
        except Exception as e:
            raise RuntimeError(f"Failed to download image from {url}: {e}") from e
        return _png_or_raw(content)

    if src_type == "unknown_dict":
        # last attempt: look for a string url/path entry inside the dict
        if isinstance(src_value, dict):
            for k in ("url", "image_url", "path"):
                v = src_value.get(k)
                if isinstance(v, str):
                    kind = "url" if v.startswith(("http://", "https://")) else "local_path"
                    return get_image_bytes_from_source(kind, v)
        raise RuntimeError("Unknown dict image format and no url/path found.")

    if src_type == "none":
        raise RuntimeError("No image source available.")

    # fallback
    raise RuntimeError(f"Unhandled src_type: {src_type}")

def make_placeholder_png_bytes(size=(1,1)):
    """Build a fully transparent RGBA image of ``size`` and return it encoded as PNG bytes."""
    transparent = (0, 0, 0, 0)
    blank = Image.new("RGBA", size, transparent)
    with io.BytesIO() as sink:
        blank.save(sink, format="PNG")
        return sink.getvalue()

# ---------- main logic ----------
def main():
    """Export the PhyX ``test`` split as a ZIP of PNG images plus a JSON index.

    Each example receives a sequential id starting at START_ID and produces
    exactly one JSON record and exactly one ``{IMAGES_DIR_IN_ZIP}/image{id}.png``
    member in OUTPUT_ZIP. If the image cannot be obtained a placeholder PNG is
    stored; if the example itself cannot be processed an empty placeholder
    record is stored, so ids stay contiguous. The JSON index is written into
    the archive last, under OUTPUT_JSON_NAME.

    Calls ``sys.exit(1)`` if the dataset cannot be loaded.
    """
    try:
        ds = load_phyx_test()
    except Exception as e:
        print("ERROR: could not load dataset:", e)
        traceback.print_exc()
        sys.exit(1)

    # materialise the split so its length is known up front
    examples = list(ds)
    n = len(examples)
    print(f"Loaded dataset split with {n} examples.")

    records = []
    current_id = START_ID

    # the archive is written straight to disk at OUTPUT_ZIP
    with zipfile.ZipFile(OUTPUT_ZIP, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for idx, item in enumerate(tqdm(examples, desc="Processing examples", unit="ex")):
            img_path_in_zip = f"{IMAGES_DIR_IN_ZIP}/image{current_id}.png"

            # Start from the empty placeholder record. It is appended exactly
            # once below, so a failure part-way through an example can no
            # longer leave two records sharing the same id.
            record = {
                "id": current_id,
                "image": img_path_in_zip,
                "text": "",
                "options": [],
                "type": "MCQs with One Correct Answer",
                "correct_answer": None
            }
            img_bytes = None

            try:
                # Build all fields first so the record is either fully
                # populated or left as the placeholder.
                fields = {
                    "text": build_text(item),
                    "options": item.get("options") or [],
                    "correct_answer": normalize_answer(item.get("answer")),
                }
                record.update(fields)

                src_type, src_value = find_image_source(item)
                try:
                    img_bytes = get_image_bytes_from_source(src_type, src_value)
                except Exception as e_img:
                    # log and fall back to the placeholder image below
                    print(f"Warning: could not get image for example index {idx} (id={current_id}): {e_img}")
            except Exception as e:
                print(f"ERROR processing example index {idx}: {e}")
                traceback.print_exc()

            if img_bytes is None:
                img_bytes = make_placeholder_png_bytes(PLACEHOLDER_SIZE)

            records.append(record)
            zf.writestr(img_path_in_zip, img_bytes)
            current_id += 1

        # After images are written, write the JSON file into the zip
        json_bytes = json.dumps(records, ensure_ascii=False, indent=2).encode("utf-8")
        zf.writestr(OUTPUT_JSON_NAME, json_bytes)

    # done
    abs_zip = os.path.abspath(OUTPUT_ZIP)
    print(f"Finished. Created ZIP: {abs_zip}")
    print(f" - {len(records)} records written (IDs {START_ID}..{current_id-1})")
    print(f" - JSON file inside ZIP: {OUTPUT_JSON_NAME}")
    print(f" - Images inside ZIP at: {IMAGES_DIR_IN_ZIP}/image{{ID}}.png")

if __name__ == "__main__":
    main()


Package 'Pillow' not found — installing...
Installed 'Pillow'.
Attempting: load_dataset('Cloudriver/PhyX', split='test') ...
Loaded dataset split with 3000 examples.


Processing examples: 100%|██████████| 3000/3000 [01:21<00:00, 36.88ex/s]


Finished. Created ZIP: /content/phyx_images_and_data.zip
 - 3000 records written (IDs 1690..4689)
 - JSON file inside ZIP: phyx_test_custom.json
 - Images inside ZIP at: images/image{ID}.png
