In [1]:
# ensure gpu
!nvidia-smi

Tue Feb 17 02:36:18 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.211.01             Driver Version: 570.211.01     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A10                     Off |   00000000:07:00.0 Off |                    0 |
|  0%   42C    P0             64W /  150W |       0MiB /  23028MiB |      6%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# get geometry3k TEST split
!wget https://lupantech.github.io/inter-gps/geometry3k/test.zip
!unzip test.zip
# install packages
!pip install -U transformers==4.57.1 torch pillow einops torchvision accelerate decord2 molmo_utils bitsandbytes

--2026-02-17 02:36:18--  https://lupantech.github.io/inter-gps/geometry3k/test.zip
Resolving lupantech.github.io (lupantech.github.io)... 185.199.108.153, 185.199.110.153, 185.199.109.153, ...
Connecting to lupantech.github.io (lupantech.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25524601 (24M) [application/x-zip-compressed]
Saving to: ‘test.zip’

test.zip              0%[                    ]       0  --.-KB/s               


2026-02-17 02:36:18 (318 MB/s) - ‘test.zip’ saved [25524601/25524601]

Archive:  test.zip
   creating: test/
   creating: test/2925/
  inflating: test/2925/logic_form.json  
  inflating: test/2925/img_diagram_point.png  
  inflating: test/2925/data.json     
  inflating: test/2925/img_diagram.png  
   creating: test/2513/
  inflating: test/2513/logic_form.json  
  inflating: test/2513/img_diagram_point.png  
  inflating: test/2513/data.json     
  inflating: test/2513/img_diagram.png  
   creating: test/2467/
  inflating: test/2467/logic_form.json  
  inflating: test/2467/img_diagram_point.png  
  inflating: test/2467/data.json     
  inflating: test/2467/img_diagram.png  
   creating: test/2583/
  inflating: test/2583/logic_form.json  
  inflating: test/2583/img_diagram_point.png  
  inflating: test/2583/data.json     
  inflating: test/2583/img_diagram.png  
   creating: test/2602/
  inflating: test/2602/logic_form.json  
  inflating: test/2602/img_diagram_point.png  
  inflating: t

In [1]:
import re
import os
import json
import copy
import torch
from PIL import Image
from torch.utils.data import DataLoader
from transformers import AutoProcessor, AutoModelForImageTextToText

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_id="allenai/Molmo2-O-7B"

# load the processor
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)
processor.tokenizer.padding_side = "left"
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    trust_remote_code=True,
    dtype=torch.bfloat16,  
    device_map={"": "cuda"}
)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Loading checkpoint shards: 100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


In [3]:
test_problems = []

# Get a list of all subdirectories within the 'test/' directory and filter for numerical directories
problem_ids = sorted([d for d in os.listdir('test/') if d.isdigit()], key=int)

for problem_id in problem_ids:
    problem_dir = os.path.join('test/', problem_id)
    data_json_path = os.path.join(problem_dir, 'data.json')
    img_diagram_path = os.path.join(problem_dir, 'img_diagram.png')

    if os.path.exists(data_json_path):
        with open(data_json_path, 'r') as f:
            data = json.load(f)

        problem_text = data.get('problem_text', '')
        choices = data.get('choices', [])
        answer = data.get('answer', '')

        # Create a dictionary to map choice indices to letters
        choice_letters = {0: 'A. ', 1: 'B. ', 2: 'C. ', 3: 'D. ', 4: 'E. ', 5: 'F. '}
        formatted_choices = [
            f"{choice_letters.get(i, str(i) + '.')}" + c for i, c in enumerate(choices)
        ]

        test_problems.append({
            'problem_id': problem_id,
            'problem_text': problem_text,
            'choices': formatted_choices,
            'answer': answer,
            'image_path': img_diagram_path
        })

print(f"Loaded {len(test_problems)} test problems.")
# Display the first problem to verify structure
if test_problems:
    print("\nFirst processed problem example:")
    print(json.dumps(test_problems[0], indent=2))

Loaded 601 test problems.

First processed problem example:
{
  "problem_id": "2401",
  "problem_text": "Find the area of the figure.",
  "choices": [
    "A. 30",
    "B. 60",
    "C. 120",
    "D. 240"
  ],
  "answer": "B",
  "image_path": "test/2401/img_diagram.png"
}


In [4]:
prompt_direct = """The following problem refers to an image diagram. Please consider the diagram to answer the question. Your answer MUST be a single capital letter: A, B, C, or D. \n\n{problem_text}\n\n{choices_text}\n\n
    Answer:"""
prompt_thinking = """You are a helpful AI assistant.
    When answering questions, you always consider the image diagram and must always show your step-by-step reasoning process first.
    Once you have reached a conclusion, you MUST answer in the following format: <answer>A, B, C, or D.</answer> \n\n{problem_text}\n\n{choices_text}\n\n
    """

In [5]:
def format_problem(prompt_format, test_problems):
    test_problems_copy = copy.deepcopy(test_problems)
    for problem in test_problems_copy:
        problem_text = problem['problem_text']
        choices_text = '\n'.join(problem['choices'])

        prompt = prompt_format.format(problem_text=problem_text, choices_text=choices_text)

        problem['model_prompt'] = [dict(type="text", text=prompt), dict(type="image", image=Image.open(problem['image_path']).convert("RGB"))]

    # Display the prompt for the first problem to verify structure
    if test_problems_copy:
        print("\nFirst problem's constructed prompt example:")
        print(test_problems_copy[0]['model_prompt'])
    return test_problems_copy

In [6]:
test_problems_direct = format_problem(prompt_direct, test_problems)
test_problems_thinking = format_problem(prompt_thinking, test_problems)


First problem's constructed prompt example:
[{'type': 'text', 'text': 'The following problem refers to an image diagram. Please consider the diagram to answer the question. Your answer MUST be a single capital letter: A, B, C, or D. \n\nFind the area of the figure.\n\nA. 30\nB. 60\nC. 120\nD. 240\n\n\n    Answer:'}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=260x275 at 0x7A7BA8D6B7A0>}]

First problem's constructed prompt example:
[{'type': 'text', 'text': 'You are a helpful AI assistant.\n    When answering questions, you always consider the image diagram and must always show your step-by-step reasoning process first.\n    Once you have reached a conclusion, you MUST answer in the following format: <answer>A, B, C, or D.</answer> \n\nFind the area of the figure.\n\nA. 30\nB. 60\nC. 120\nD. 240\n\n\n    '}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=260x275 at 0x7A7BA81DA9F0>}]


In [7]:
BATCH_SIZE = 4  # Adjust based on model size and A10 VRAM (24GB)
MAX_NEW_TOKENS = 1

# 1. Define a simple collate function that prevents stacking
def identity_collate(batch):
    return {key: [d[key] for d in batch] for key in batch[0]}

# 2. Update your DataLoader to use it
data_loader = DataLoader(
    test_problems_direct, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    collate_fn=identity_collate # Add this line
)

predictions_direct = []

# Ensure tokenizer has a padding token (required for batching)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

for batch in data_loader:
    # Prepare batch prompts
    prompts = batch['model_prompt']
    messages_batch = [[{"role": "user", "content": p}] for p in prompts]

    # 3. Apply template and tokenize with padding
    inputs = processor.apply_chat_template(
        messages_batch,
        add_generation_prompt=True,
        tokenize=True,
        padding=True, # Critical for batching
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # 4. Batched Generation
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=processor.tokenizer.pad_token_id
        )

    # 5. Decode and Process
    input_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[:, input_length:]
    decoded_outputs = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    for i, full_text in enumerate(decoded_outputs):
        problem_id = batch['problem_id'][i] if torch.is_tensor(batch['problem_id']) else batch['problem_id'][i]
        
        
        entry = {
            'problem_id': problem_id,
            'prediction': full_text
        }
        predictions_direct.append(entry)
        
        print(f"Processed ID: {problem_id} | Extracted: {full_text}")

# 6. Save results
with open("predictions_direct.json", "w") as f:
    json.dump(predictions_direct, f, indent=4)

Processed ID: 2401 | Extracted: C
Processed ID: 2402 | Extracted: A
Processed ID: 2403 | Extracted: C
Processed ID: 2404 | Extracted: C
Processed ID: 2405 | Extracted: C
Processed ID: 2406 | Extracted: C
Processed ID: 2407 | Extracted: C
Processed ID: 2408 | Extracted: C
Processed ID: 2409 | Extracted: B
Processed ID: 2410 | Extracted: C
Processed ID: 2411 | Extracted: B
Processed ID: 2412 | Extracted: C
Processed ID: 2413 | Extracted: C
Processed ID: 2414 | Extracted: B
Processed ID: 2415 | Extracted: C
Processed ID: 2416 | Extracted: C
Processed ID: 2417 | Extracted: C
Processed ID: 2418 | Extracted: B
Processed ID: 2419 | Extracted: C
Processed ID: 2420 | Extracted: A
Processed ID: 2421 | Extracted: C
Processed ID: 2422 | Extracted: C
Processed ID: 2423 | Extracted: C
Processed ID: 2424 | Extracted: C
Processed ID: 2425 | Extracted: B
Processed ID: 2426 | Extracted: B
Processed ID: 2427 | Extracted: D
Processed ID: 2428 | Extracted: D
Processed ID: 2429 | Extracted: C
Processed ID: 

In [7]:
def extract_answer(text):
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

# 1. Configuration
BATCH_SIZE = 3 # Adjust based on model size and A10 VRAM (24GB)
MAX_NEW_TOKENS = 2048

# 1. Define a simple collate function that prevents stacking
def identity_collate(batch):
    return {key: [d[key] for d in batch] for key in batch[0]}

# 2. Update your DataLoader to use it
data_loader = DataLoader(
    test_problems_thinking, 
    batch_size=BATCH_SIZE, 
    shuffle=False,
    collate_fn=identity_collate # Add this line
)

predictions_thinking = []

# Ensure tokenizer has a padding token (required for batching)
if processor.tokenizer.pad_token is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

for batch in data_loader:
    # Prepare batch prompts
    prompts = batch['model_prompt']
    messages_batch = [[{"role": "user", "content": p}] for p in prompts]

    # 3. Apply template and tokenize with padding
    inputs = processor.apply_chat_template(
        messages_batch,
        add_generation_prompt=True,
        tokenize=True,
        padding=True, # Critical for batching
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # 4. Batched Generation
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            do_sample=False,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=processor.tokenizer.pad_token_id
        )

    # 5. Decode and Process
    input_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[:, input_length:]
    decoded_outputs = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    for i, full_text in enumerate(decoded_outputs):
        problem_id = batch['problem_id'][i] if torch.is_tensor(batch['problem_id']) else batch['problem_id'][i]
        
        extracted = extract_answer(full_text)
        
        entry = {
            'problem_id': problem_id,
            'prediction_full': full_text.strip(),
            'prediction': extracted
        }
        predictions_thinking.append(entry)
        
        print(f"Processed ID: {problem_id} | Extracted: {extracted}")

# 6. Save results
with open("predictions_thinking.json", "w") as f:
    json.dump(predictions_thinking, f, indent=4)

Processed ID: 2401 | Extracted: C
Processed ID: 2402 | Extracted: B
Processed ID: 2403 | Extracted: C. 25.9
Processed ID: 2404 | Extracted: C
Processed ID: 2405 | Extracted: B. 38
Processed ID: 2406 | Extracted: C. 79
Processed ID: 2407 | Extracted: C. 74
Processed ID: 2408 | Extracted: C. 24
Processed ID: 2409 | Extracted: C. 21 \sqrt { 3 }
Processed ID: 2410 | Extracted: C
Processed ID: 2411 | Extracted: B
Processed ID: 2412 | Extracted: B. 70
Processed ID: 2413 | Extracted: D. 346
Processed ID: 2414 | Extracted: None of the given options.
Processed ID: 2415 | Extracted: A. 38
Processed ID: 2416 | Extracted: None of the options are correct.
Processed ID: 2417 | Extracted: B
Processed ID: 2418 | Extracted: B
Processed ID: 2419 | Extracted: B. 10 \pi
Processed ID: 2420 | Extracted: A. 3
Processed ID: 2421 | Extracted: C
Processed ID: 2422 | Extracted: C. 11.13
Processed ID: 2423 | Extracted: C. 7
Processed ID: 2424 | Extracted: B
Processed ID: 2425 | Extracted: B. 69.8
Processed ID: 24