# Load Dataset

In [1]:
import datasets

# Benchmark selection. COND is kept for compatibility; it is not referenced
# by the other cells visible in this notebook.
CONFIG_NAME, SPLIT_NAME, COND = "coco", "rotate", "invariant"

# Samples of the chosen configuration/split.
ds = datasets.load_dataset(
    path="feiziaarash/pairbench",
    name=CONFIG_NAME,
    split=SPLIT_NAME,
)
# The "templates" configuration exposes one split per dataset configuration,
# so the split is selected with CONFIG_NAME here.
ds_templates = datasets.load_dataset(
    path="feiziaarash/pairbench",
    name="templates",
    split=CONFIG_NAME,
)

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 5.33k/5.33k [00:00<00:00, 19.4MB/s]
Downloading data: 100%|██████████| 159M/159M [00:04<00:00, 35.0MB/s] 
Downloading data: 100%|██████████| 165M/165M [00:03<00:00, 41.8MB/s] 
Downloading data: 100%|██████████| 152M/152M [00:04<00:00, 36.5MB/s] 
Downloading data: 100%|██████████| 152M/152M [00:03<00:00, 40.9MB/s] 
Downloading data: 100%|██████████| 133M/133M [00:03<00:00, 40.4MB/s] 
Generating colorjitter split: 100%|██████████| 500/500 [00:00<00:00, 904.72 examples/s] 
Generating elastic split: 100%|██████████| 500/500 [00:00<00:00, 1106.21 examples/s]
Generating gaussianblur split: 100%|██████████| 500/500 [00:00<00:00, 1221.54 examples/s]
Generating rotate split: 100%|██████████| 500/500 [00:00<00:00, 1238.66 examples/s]
Generating perspective split: 100%|██████████| 500/500 [00:00<00:00, 1456.51 examples/s]
Downloading data: 100%|██████████| 47.2k/47.2k [00:00<00:00, 125kB/s]
Downloading data: 10

# Load Qwen2-VL-7B-Instruct Model

In [2]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Default setup: let transformers pick the dtype and place the model on the
# available device(s). The same checkpoint id is used for the processor.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)


Loading checkpoint shards: 100%|██████████| 5/5 [01:16<00:00, 15.32s/it]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
import json 

# Each metadata column stores a JSON document in its first row; decode the
# three of them in one pass (same order as the column names below).
templates, condition_dict, logistics = (
    json.loads(ds_templates[column][0])
    for column in ("query_templates", "query_conditions", "logistics")
)

# Metric functions

In [58]:
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import entropy
import re 

from collections import defaultdict
import math
import numpy as np


def compute_relaxed_symmetry(results, eps=1):
    """Return the share of pair groups whose two scores agree within ``eps``.

    Records are grouped by sample index, condition and image keys. A group
    counts as symmetric only when it holds exactly two scores and their
    absolute difference is at most ``eps``. Returns 0 when there are no groups.
    """
    scores_by_pair = {}
    for record in results:
        pair_id = (f"{record['index']}_{record['condition']}", tuple(record['img_keys']))
        scores_by_pair.setdefault(pair_id, []).append(record['score'])

    if not scores_by_pair:
        return 0

    n_symmetric = 0
    for pair_scores in scores_by_pair.values():
        # Groups without exactly two entries can never be symmetric.
        if len(pair_scores) != 2:
            continue
        first, second = pair_scores
        if abs(first - second) <= eps:
            n_symmetric += 1
    return n_symmetric / len(scores_by_pair)

def compute_mmscore(results):
    """Normalized mutual information between ground-truth and predicted scores.

    Reads the ``"gt"`` and ``"score"`` fields of every record and passes the
    two resulting lists to ``normalized_mutual_info_score``.
    """
    def column(field):
        # Collect one field across all records, preserving record order.
        return [record[field] for record in results]

    return normalized_mutual_info_score(column("gt"), column("score"))

def compute_controllability(results_invar, results_var):
    """Compare the MMScore obtained under the two conditions.

    Computes ``1 - |var - invar| / sqrt(var * invar)`` from the MMScores of
    the two result lists, and returns 0 when that geometric-mean denominator
    is not positive.
    """
    invar_score = compute_mmscore(results_invar)
    var_score = compute_mmscore(results_var)

    scale = math.sqrt(var_score * invar_score)
    if scale > 0:
        return 1 - abs(var_score - invar_score) / scale
    return 0

def compute_smoothness(results):
    """Shannon entropy (in bits) of the distribution of predicted scores.

    Args:
        results: iterable of records, each with an integer ``"score"`` field.

    Returns:
        The base-2 entropy of the score histogram, or ``0.0`` when no record
        carries a non-negative score.

    Negative scores are skipped: ``parse_response`` returns ``-1`` when no
    score could be extracted, and ``np.bincount`` raises ``ValueError`` on
    negative values.
    """
    scores = [r["score"] for r in results if r["score"] >= 0]
    if not scores:
        # Nothing to histogram; avoids the 0/0 division that would yield NaN.
        return 0.0
    counts = np.bincount(scores, minlength=11)  # at least the 0-10 bins
    prob = counts / counts.sum()
    return entropy(prob, base=2)

# Parse Response
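- Extract the score and the reason from the model response.
- If the response does not follow the `Score: ... Reason: ...` format, use the first integer in the text as the score (heuristic).
- If the text contains no integer, the score is set to -1.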

In [47]:
def parse_response(resp):
    """Extract ``(score, reason)`` from a model response.

    Args:
        resp: raw response text.

    Returns:
        A tuple ``(score, reason)``. When the text matches
        ``Score: <int> Reason: <text>`` (case-insensitive, may span lines),
        the parsed integer and the stripped reason are returned. Otherwise the
        first integer found in the text is used as the score (``-1`` if there
        is none) and the whole cleaned text is returned as the reason.
    """
    # Turn literal "\n" sequences into real newlines and strip whitespace.
    # A plain replace is used on purpose: round-tripping through the
    # 'unicode_escape' codec decodes the UTF-8 bytes as Latin-1 (mangling any
    # non-ASCII character) and raises on stray or trailing backslashes.
    cleaned = resp.replace("\\n", "\n").strip()

    # Try to parse using standard format
    match = re.search(r"Score:\s*(\d+)\s*Reason:\s*(.+)", cleaned, re.IGNORECASE | re.DOTALL)
    if match:
        score = int(match.group(1))
        reason = match.group(2).strip()
        return score, reason

    # Fallback: extract first integer and return full text as reason
    fallback_score = re.search(r"\d+", cleaned)
    score = int(fallback_score.group(0)) if fallback_score else -1
    return score, cleaned

# Inference

In [48]:
import base64
import io
import random

import torch
from qwen_vl_utils import process_vision_info
from tqdm import tqdm

def pil_to_base64_str(pil_img):
    """Serialize an image as a PNG ``data:`` URI.

    ``pil_img`` only needs a ``save(file, format=...)`` method; the PNG bytes
    it writes are base64-encoded and prefixed with ``data:image/png;base64,``.
    """
    with io.BytesIO() as png_stream:
        pil_img.save(png_stream, format="PNG")
        encoded = base64.b64encode(png_stream.getvalue())
    return "data:image/png;base64," + encoded.decode("utf-8")


# Only the first rows of the split are evaluated to keep the run short; the
# cap prevents an IndexError if the split holds fewer rows.
NUM_SAMPLES = min(20, len(ds))

# Dedicated, seeded generator so the per-pair template choice is reproducible
# across kernel restarts (the global `random` state is left untouched).
prompt_rng = random.Random(0)
template_names = list(templates.keys())

all_results = {}
for condition in ['invariant', 'variant']:
    print(f"Processing condition: {condition}")
    results = []

    # Condition wording inserted into every prompt of this pass.
    # NOTE(review): the key "rotation" is hard-coded while SPLIT_NAME is
    # "rotate" -- confirm the key when evaluating another split.
    cond_text = condition_dict["rotation"][condition]

    for i in tqdm(range(NUM_SAMPLES)):
        sample = ds[i]
        for img_pair in logistics["data-pairs"]:
            # Randomly choose a prompt version for each pair; both orderings
            # of the pair share the same prompt.
            prompt_version = prompt_rng.choice(template_names)
            prompt = templates[prompt_version].format(conditions=cond_text)

            # Encode each image once and reuse it for both orderings.
            encoded = {key: pil_to_base64_str(sample[key]) for key in img_pair}

            # Order-independent identity of the pair and its ground truth.
            # Presumably the pairs are (img1,img2), (img1,img3), (img1,img4);
            # anything other than the first two falls into the lowest score.
            keys = sorted(img_pair)
            if condition == 'invariant':
                gt = 10 if keys in [["img1", "img2"], ["img1", "img3"]] else 1
            else:  # 'variant'
                if keys == ["img1", "img2"]:
                    gt = 10
                elif keys == ["img1", "img3"]:
                    gt = 6
                else:  # ["img1", "img4"]
                    gt = 1

            for reverse in [False, True]:
                img_key_1, img_key_2 = img_pair if not reverse else img_pair[::-1]

                # Build multimodal message
                messages = [{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": encoded[img_key_1]},
                        {"type": "image", "image": encoded[img_key_2]},
                        {"type": "text", "text": prompt},
                    ]
                }]

                # Apply chat template and process vision input
                text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
                image_inputs, video_inputs = process_vision_info(messages)

                inputs = processor(
                    text=[text],
                    images=image_inputs,
                    videos=video_inputs,
                    padding=True,
                    return_tensors="pt",
                ).to(model.device)

                # Greedy decoding without gradient tracking.
                with torch.no_grad():
                    generated_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)

                # Drop the prompt tokens so only the newly generated text is decoded.
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                response = processor.batch_decode(
                    generated_ids_trimmed,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=False,
                )[0]

                score, reason = parse_response(response)

                # Save result
                results.append({
                    "index": i,
                    "img_keys": keys,
                    "reversed": reverse,
                    "template_version": prompt_version,
                    "condition": condition,
                    "cond_text": cond_text,
                    "prompt": prompt,
                    "gt": gt,
                    "model_response": response,
                    "score": score,
                    "reason": reason,
                })

    all_results[condition] = results

Processing condition: invariant


100%|██████████| 20/20 [01:03<00:00,  3.16s/it]


Processing condition: variant


100%|██████████| 20/20 [00:56<00:00,  2.81s/it]


In [59]:
print("Computing metrics...")
results_invar, results_var = all_results['invariant'], all_results['variant']
# Both conditions pooled, invariant records first.
results_all = results_invar + results_var

# Per-condition symmetry values are computed but not printed below.
relaxed_symmetry_invar = compute_relaxed_symmetry(results_invar)
relaxed_symmetry_var = compute_relaxed_symmetry(results_var)
relaxed_symmetry = compute_relaxed_symmetry(results_var + results_invar)
mmscore = compute_mmscore(results_all)
controllability = compute_controllability(results_invar, results_var)
smoothness = compute_smoothness(results_all)

# Report the pooled metrics, four decimals each.
for label, value in (
    ("Relaxed Symmetry", relaxed_symmetry),
    ("MMSCORE", mmscore),
    ("Controllability", controllability),
    ("Smoothness", smoothness),
):
    print(f"{label}: {value:.4f}")

Computing metrics...
Relaxed Symmetry: 0.9667
MMSCORE: 0.5370
Controllability: 0.8517
Smoothness: 2.4471
