In [1]:
# ensure gpu
!nvidia-smi

Thu Feb 19 10:44:48 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:05.0 Off |                    0 |
| N/A   34C    P0             52W /  400W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [2]:
# install packages
# !pip install -U transformers==4.57.1 torch pillow einops torchvision accelerate decord2 molmo_utils bitsandbytes

In [2]:
# get geometry3k TEST split
!wget https://lupantech.github.io/inter-gps/geometry3k/test.zip
!unzip test.zip

--2026-02-19 10:44:49--  https://lupantech.github.io/inter-gps/geometry3k/test.zip
Resolving lupantech.github.io (lupantech.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to lupantech.github.io (lupantech.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25524601 (24M) [application/x-zip-compressed]
Saving to: ‘test.zip.2’


2026-02-19 10:44:49 (283 MB/s) - ‘test.zip.2’ saved [25524601/25524601]

Archive:  test.zip
replace test/2925/logic_form.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: test/2925/logic_form.json  
  inflating: test/2925/img_diagram_point.png  
  inflating: test/2925/data.json     
  inflating: test/2925/img_diagram.png  
  inflating: test/2513/logic_form.json  
  inflating: test/2513/img_diagram_point.png  
  inflating: test/2513/data.json     
  inflating: test/2513/img_diagram.png  
  inflating: test/2467/logic_form.json  
  inflating: test/2467/img_diagram_point.png  


In [3]:
import re
import os
import json
import copy
import torch
from PIL import Image
from torch.utils.data import DataLoader
from transformers import AutoProcessor, AutoModelForImageTextToText

In [4]:
model_id="Qwen/Qwen3-VL-8B-Instruct"

# load the processor
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)
processor.tokenizer.padding_side = "left"
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map={"": "cuda"}
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/750 [00:00<?, ?it/s]

In [5]:
test_problems = []

# Get a list of all subdirectories within the 'test/' directory and filter for numerical directories
problem_ids = sorted([d for d in os.listdir('test/') if d.isdigit()], key=int)

for problem_id in problem_ids:
    problem_dir = os.path.join('test/', problem_id)
    data_json_path = os.path.join(problem_dir, 'data.json')
    img_diagram_path = os.path.join(problem_dir, 'img_diagram.png')

    if os.path.exists(data_json_path):
        with open(data_json_path, 'r') as f:
            data = json.load(f)

        problem_text = data.get('problem_text', '')
        choices = data.get('choices', [])
        answer = data.get('answer', '')

        # Create a dictionary to map choice indices to letters
        choice_letters = {0: 'A. ', 1: 'B. ', 2: 'C. ', 3: 'D. ', 4: 'E. ', 5: 'F. '}
        formatted_choices = [
            f"{choice_letters.get(i, str(i) + '.')}" + c for i, c in enumerate(choices)
        ]

        test_problems.append({
            'problem_id': problem_id,
            'problem_text': problem_text,
            'choices': formatted_choices,
            'answer': answer,
            'image_path': img_diagram_path
        })

print(f"Loaded {len(test_problems)} test problems.")
# Display the first problem to verify structure
if test_problems:
    print("\nFirst processed problem example:")
    print(json.dumps(test_problems[0], indent=2))

Loaded 601 test problems.

First processed problem example:
{
  "problem_id": "2401",
  "problem_text": "Find the area of the figure.",
  "choices": [
    "A. 30",
    "B. 60",
    "C. 120",
    "D. 240"
  ],
  "answer": "B",
  "image_path": "test/2401/img_diagram.png"
}


In [6]:
prompt_direct = """The following problem refers to an image diagram. Please consider the diagram to select an answer choice to the question.
    Your output response MUST ONLY be a single letter from the choices: A, B, C, or D.
    For example: \nStep 1: reasoning step 1. \nStep 2: reasoning step 2. \nStep 3: reasoning step 3.\n...\nFinal Answer: <answer>...</answer>
    \n\n{problem_text}\n\n{choices_text}\n\n
    Answer:"""
prompt_thinking = """You are a helpful AI assistant.
    When answering questions, you always consider the image diagram and must always show your step-by-step reasoning process first.
    Your final answer MUST by a single letter from the choices (e.g., A, B, C, or D) in the following format: <answer>n</answer> where n is the letter choice (e.g., A, B, C, or D) corresponding to the selected answer.
    \n\n{problem_text}\n\n{choices_text}\n\n
    """

In [7]:
def format_problem(prompt_format, test_problems):
    test_problems_copy = copy.deepcopy(test_problems)
    for problem in test_problems_copy:
        problem_text = problem['problem_text']
        choices_text = '\n'.join(problem['choices'])

        prompt = prompt_format.format(problem_text=problem_text, choices_text=choices_text)

        problem['model_prompt'] = [dict(type="text", text=prompt), dict(type="image", image=Image.open(problem['image_path']).convert("RGB"))]

    # Display the prompt for the first problem to verify structure
    if test_problems_copy:
        print("\nFirst problem's constructed prompt example:")
        print(test_problems_copy[0]['model_prompt'])
    return test_problems_copy

In [8]:
test_problems_direct = format_problem(prompt_direct, test_problems)

test_problems_thinking = format_problem(prompt_thinking, test_problems)


First problem's constructed prompt example:
[{'type': 'text', 'text': 'The following problem refers to an image diagram. Please consider the diagram to select an answer choice to the question.\n    Your output response MUST ONLY be a single letter from the choices: A, B, C, or D.\n    For example: \nStep 1: reasoning step 1. \nStep 2: reasoning step 2. \nStep 3: reasoning step 3.\n...\nFinal Answer: <answer>...</answer>\n    \n\nFind the area of the figure.\n\nA. 30\nB. 60\nC. 120\nD. 240\n\n\n    Answer:'}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=260x275 at 0x7C83C06FACC0>}]

First problem's constructed prompt example:
[{'type': 'text', 'text': 'You are a helpful AI assistant.\n    When answering questions, you always consider the image diagram and must always show your step-by-step reasoning process first.\n    Your final answer MUST by a single letter from the choices (e.g., A, B, C, or D) in the following format: <answer>n</answer> where n is the letter cho

In [10]:
BATCH_SIZE = 4  # Adjust based on model size and A10 VRAM (24GB)
MAX_NEW_TOKENS = 1

# 1. Define a simple collate function that prevents stacking
def identity_collate(batch):
    return {key: [d[key] for d in batch] for key in batch[0]}

# 2. Update your DataLoader to use it
data_loader = DataLoader(
    test_problems_direct,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=identity_collate # Add this line
)

predictions_direct = []

# Ensure tokenizer has a padding token (required for batching)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

for batch in data_loader:
    # Prepare batch prompts
    prompts = batch['model_prompt']
    messages_batch = [[{"role": "user", "content": p}] for p in prompts]

    # 3. Apply template and tokenize with padding
    inputs = processor.apply_chat_template(
        messages_batch,
        add_generation_prompt=True,
        tokenize=True,
        padding=True, # Critical for batching
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # 4. Batched Generation
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=processor.tokenizer.pad_token_id
        )

    # 5. Decode and Process
    input_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[:, input_length:]
    decoded_outputs = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    for i, full_text in enumerate(decoded_outputs):
        problem_id = batch['problem_id'][i] if torch.is_tensor(batch['problem_id']) else batch['problem_id'][i]


        entry = {
            'problem_id': problem_id,
            'prediction': full_text
        }
        predictions_direct.append(entry)

        print(f"Processed ID: {problem_id} | Extracted: {full_text}")

# 6. Save results
with open("predictions_direct.json", "w") as f:
    json.dump(predictions_direct, f, indent=4)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processed ID: 2401 | Extracted: B
Processed ID: 2402 | Extracted: A
Processed ID: 2403 | Extracted: C
Processed ID: 2404 | Extracted: B
Processed ID: 2405 | Extracted: C
Processed ID: 2406 | Extracted: B
Processed ID: 2407 | Extracted: C
Processed ID: 2408 | Extracted: C
Processed ID: 2409 | Extracted: C
Processed ID: 2410 | Extracted: C
Processed ID: 2411 | Extracted: C
Processed ID: 2412 | Extracted: A
Processed ID: 2413 | Extracted: C
Processed ID: 2414 | Extracted: B
Processed ID: 2415 | Extracted: B
Processed ID: 2416 | Extracted: B
Processed ID: 2417 | Extracted: B
Processed ID: 2418 | Extracted: D
Processed ID: 2419 | Extracted: B
Processed ID: 2420 | Extracted: C
Processed ID: 2421 | Extracted: B
Processed ID: 2422 | Extracted: B
Processed ID: 2423 | Extracted: C
Processed ID: 2424 | Extracted: B
Processed ID: 2425 | Extracted: B
Processed ID: 2426 | Extracted: B
Processed ID: 2427 | Extracted: C
Processed ID: 2428 | Extracted: B
Processed ID: 2429 | Extracted: B
Processed ID: 

In [9]:
def extract_answer(text):
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

# 1. Configuration
BATCH_SIZE = 16 # Adjust based on model size and A10 VRAM (24GB)
MAX_NEW_TOKENS = 2048

# 1. Define a simple collate function that prevents stacking
def identity_collate(batch):
    return {key: [d[key] for d in batch] for key in batch[0]}

# 2. Update your DataLoader to use it
data_loader = DataLoader(
    test_problems_thinking,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=identity_collate # Add this line
)

predictions_thinking = []

# Ensure tokenizer has a padding token (required for batching)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

for batch in data_loader:
    # Prepare batch prompts
    prompts = batch['model_prompt']
    messages_batch = [[{"role": "user", "content": p}] for p in prompts]

    # 3. Apply template and tokenize with padding
    inputs = processor.apply_chat_template(
        messages_batch,
        add_generation_prompt=True,
        tokenize=True,
        padding=True, # Critical for batching
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # 4. Batched Generation
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=processor.tokenizer.pad_token_id,
            temperature=0.7,
            top_p=0.8,
            top_k=20,
            min_p=0
        )

    # 5. Decode and Process
    input_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[:, input_length:]
    decoded_outputs = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    for i, full_text in enumerate(decoded_outputs):
        problem_id = batch['problem_id'][i] if torch.is_tensor(batch['problem_id']) else batch['problem_id'][i]

        extracted = extract_answer(full_text)

        entry = {
            'problem_id': problem_id,
            'prediction_full': full_text.strip(),
            'prediction': extracted
        }
        predictions_thinking.append(entry)

        print(f"Processed ID: {problem_id} | Extracted: {extracted}")

# 6. Save results
with open("predictions_thinking.json", "w") as f:
    json.dump(predictions_thinking, f, indent=4)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'min_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processed ID: 2401 | Extracted: B
Processed ID: 2402 | Extracted: A
Processed ID: 2403 | Extracted: None
Processed ID: 2404 | Extracted: None
Processed ID: 2405 | Extracted: None
Processed ID: 2406 | Extracted: None
Processed ID: 2407 | Extracted: C
Processed ID: 2408 | Extracted: None
Processed ID: 2409 | Extracted: C
Processed ID: 2410 | Extracted: None
Processed ID: 2411 | Extracted: B
Processed ID: 2412 | Extracted: None
Processed ID: 2413 | Extracted: B
Processed ID: 2414 | Extracted: B
Processed ID: 2415 | Extracted: None
Processed ID: 2416 | Extracted: B
Processed ID: 2417 | Extracted: C
Processed ID: 2418 | Extracted: D
Processed ID: 2419 | Extracted: B
Processed ID: 2420 | Extracted: D
Processed ID: 2421 | Extracted: C
Processed ID: 2422 | Extracted: None
Processed ID: 2423 | Extracted: C
Processed ID: 2424 | Extracted: None
Processed ID: 2425 | Extracted: D
Processed ID: 2426 | Extracted: C
Processed ID: 2427 | Extracted: A
Processed ID: 2428 | Extracted: B
Processed ID: 2429