## Color naming accuracy evaluator
1. Prepare the image VQA dataset (original images)
2. Load the model and set up the CVD (color vision deficiency) simulator
3. Run batch inference

In [None]:
# Notebook-wide configuration.
# Path of the VQA dataset JSON file (presumably the input of the evaluation loop -- confirm).
DATASET_PATH = "./color_150k.json"
# Local checkpoint directory handed to `from_pretrained` for both the model and its processor.
MODEL_PATH = "./Qwen3-VL-8B-Instruct"
# Deficiency name and severity forwarded to the Machado-2009 CVD simulator.
CVD_TYPE = "Deuteranomaly"
CVD_SEVERITY = 1.0

In [None]:
import re
import nltk
from nltk import word_tokenize, pos_tag

# First run only: fetch the NLTK tokenizer / POS-tagger resources used below
# (this only needs to be done once).
for _resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

class ColorSentenceEvaluator:
    """Score a predicted sentence by the color-object pairs it shares with a reference.

    A sentence is split into clauses; in each clause the first color word from
    ``selected_colors`` is paired with the first noun that follows it. The
    score is the fraction of reference pairs reproduced by the prediction.

    Known limitations (kept for backward compatibility):
      * only the first color of a clause is considered;
      * pairs are keyed by color, so a color appearing in several clauses
        keeps only its last object;
      * objects are compared as exact lower-cased tokens (no lemmatisation,
        no synonym handling).
    """

    # Clause boundaries: commas, periods and the conjunctions "and" / "but".
    # Compiled once for all calls; case-insensitive so " And " / " BUT " split too.
    _CLAUSE_SPLIT = re.compile(r'[,.]| and | but ', re.IGNORECASE)
    # Penn Treebank tags accepted as the object of a color word.
    _NOUN_TAGS = frozenset({"NN", "NNS", "NNP"})

    def __init__(self):
        # Vocabulary of recognised color words (lower-case).
        self.selected_colors = {
            "red", "green", "blue", "yellow", "orange", "purple",
            "pink", "brown", "gray", "black", "white"
        }

    def _extract_color_object_pairs(self, sentence):
        """Extract color-object pairs, e.g. ``{"red": "car", "yellow": "wing"}``.

        Args:
            sentence: Text to analyse.

        Returns:
            dict mapping a lower-cased color word to the lower-cased first noun
            (NN / NNS / NNP) that follows it within the same clause. Clauses
            without a color word, or without a noun after it, contribute nothing.
        """
        pairs = {}
        for clause in self._CLAUSE_SPLIT.split(sentence):
            # Splitting leaves empty fragments (e.g. after the final "."); do
            # not spend tokenizer / tagger calls on them.
            if not clause.strip():
                continue

            tagged = pos_tag(word_tokenize(clause))  # [(word, POS), ...]

            # Position of the first color word in this clause, if any.
            color_idx = next(
                (
                    i for i, (word, _tag) in enumerate(tagged)
                    if word.lower() in self.selected_colors
                ),
                None,
            )
            if color_idx is None:
                continue

            # First noun after the color word.
            obj = next(
                (
                    word.lower() for word, tag in tagged[color_idx + 1:]
                    if tag in self._NOUN_TAGS
                ),
                None,
            )
            if obj:
                pairs[tagged[color_idx][0].lower()] = obj
        return pairs

    def evaluate(self, predict_sentence, ground_truth_sentence):
        """Return the fraction (0.0-1.0) of reference color-object pairs found in the prediction.

        Args:
            predict_sentence: Model output.
            ground_truth_sentence: Reference answer.

        Returns:
            ``correct / len(reference pairs)``. When the reference contains no
            pair, returns 1.0 if the prediction contains none either, else 0.0.
        """
        gt_pairs = self._extract_color_object_pairs(ground_truth_sentence)
        pred_pairs = self._extract_color_object_pairs(predict_sentence)

        if not gt_pairs:
            return 1.0 if not pred_pairs else 0.0

        correct = sum(
            1 for color, obj in gt_pairs.items()
            if pred_pairs.get(color) == obj
        )
        return correct / len(gt_pairs)



In [None]:
# Example: score one prediction against one reference sentence.
evaluator = ColorSentenceEvaluator()

# Reference sentence and model prediction to compare.
gt = "There is a red car, equipped with a yellow wing."
pred = "A red car with a yellow spoiler."

# evaluate() takes the prediction first, then the ground truth.
score = evaluator.evaluate(pred, gt)
print("Color accuracy:", score)


In [None]:
# !pip install colour-science
from colour.blindness import matrix_cvd_Machado2009
from PIL import Image
import numpy as np

def RGB_to_sRGB(RGB):
    """Encode linear RGB as sRGB (gamma compression); values in 0.0-1.0, NOT 0-255.

    Args:
        RGB: Array-like of linear-light values. Lists and integer arrays are
            accepted and promoted to float64; floating arrays keep their dtype.

    Returns:
        New ndarray of the same shape holding the sRGB-encoded values. The
        input is not modified and the result is not clipped.
    """
    linear = np.asarray(RGB)
    # Integer input would otherwise truncate the fractional results.
    if not np.issubdtype(linear.dtype, np.floating):
        linear = linear.astype(np.float64)

    sRGB = np.empty_like(linear)
    # Piecewise transfer function: linear segment near black, power law above.
    mask = linear > 0.0031308
    sRGB[~mask] = 12.92 * linear[~mask]
    sRGB[mask] = 1.055 * linear[mask] ** (1 / 2.4) - 0.055
    return sRGB

def sRGB_to_RGB(srgb_img):
    """Decode sRGB to linear RGB (inverse gamma); values in 0.0-1.0, NOT 0-255.

    Ref: http://brucelindbloom.com/Eqn_RGB_to_XYZ.html

    Args:
        srgb_img: Array-like of sRGB-encoded values. Lists and integer arrays
            are accepted and promoted to float64; floating arrays keep their dtype.

    Returns:
        New ndarray of the same shape holding linear-light values. The input
        is not modified.
    """
    encoded = np.asarray(srgb_img)
    # Integer input would otherwise truncate the fractional results.
    if not np.issubdtype(encoded.dtype, np.floating):
        encoded = encoded.astype(np.float64)

    RGB = np.empty_like(encoded)
    # Piecewise inverse transfer function: linear segment near black, power law above.
    mask = encoded < 0.04045
    RGB[mask] = encoded[mask] / 12.92
    RGB[~mask] = ((encoded[~mask] + 0.055) / 1.055) ** 2.4
    return RGB

def im_dot(H_mat,im):
    """Apply a color matrix to an sRGB image in linear-light space.

    Args:
        H_mat: Matrix applied to every pixel (each pixel row is multiplied by ``H_mat.T``).
        im: h*w*d numpy array with sRGB values in 0.0-1.0.

    Returns:
        h*w*d numpy array of sRGB values clipped to the range 0.0-1.0.
    """
    height, width, channels = im.shape
    # Decode to linear RGB and flatten to one pixel per row.
    linear_pixels = sRGB_to_RGB(im).reshape(-1, channels)
    transformed = (linear_pixels @ H_mat.T).reshape(height, width, channels)
    # Re-encode to sRGB, then clamp out-of-gamut values.
    encoded = RGB_to_sRGB(transformed)
    return np.clip(encoded, 0., 1., out=encoded)
class cvdSimulateNetMarchado():
    """Color-vision-deficiency (CVD) simulator built on the Machado et al. (2009) matrices."""

    def __init__(self,cvd_type='Deuteranomaly',severity=1.0):
        """Build the simulation matrix for one deficiency type and severity.

        Args:
            cvd_type (str, optional): Deficiency name passed to
                ``matrix_cvd_Machado2009``, e.g. 'Protanomaly' or
                'Deuteranomaly'. Defaults to 'Deuteranomaly'.
            severity (float, optional): Deficiency severity passed to
                ``matrix_cvd_Machado2009``. Defaults to 1.0.
        """
        self.cvd_type = cvd_type
        self.mat = matrix_cvd_Machado2009(cvd_type, severity)

    def __call__(self,im):
        """Simulate the deficiency on one image.

        Args:
            im: PIL image (any mode; converted to RGB first) or an h*w*3
                numpy array of sRGB values in 0.0-1.0.

        Returns:
            PIL Image with the simulated colors.
        """
        if isinstance(im,Image.Image):
            # RGBA / grayscale / palette images would not match the 3x3 matrix.
            im = np.asarray(im.convert("RGB"))/255.
        np_out = im_dot(self.mat,im)
        # Round to the nearest 8-bit level; plain truncation darkens pixels by
        # up to one level after the float round trip.
        out = Image.fromarray(np.rint(np_out*255).astype(np.uint8))
        return out

## Set up the model

In [None]:
from PIL import Image
# 创建自定义的处理器，基于原始tokenizer
import json
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# -----------------------------
# 初始化模型和处理器
# -----------------------------
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load the vision-language model in float16 with automatic device placement,
# plus its processor (named `cvd_tokenizer` because its image preprocessing is
# wrapped with the CVD simulation further below).
model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH, torch_dtype=torch.float16, device_map="auto")
cvd_tokenizer = AutoProcessor.from_pretrained(MODEL_PATH)

# Reference: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_vl/processing_qwen3_vl.py

# Keep a handle on the original image preprocessing method so the wrapper can delegate to it.
original_process_images = cvd_tokenizer.image_processor.preprocess

# Initialize the CVD simulator
# cvd_simulator = cvdSimulateNet(cvd_type='protan', cuda=True, batched_input=True)  # set the CVD type as needed
cvd_simulator = cvdSimulateNetMarchado(cvd_type=CVD_TYPE,severity=CVD_SEVERITY)

def convert_to_LMS(image):
    """Apply the CVD simulation to an image, or to every image in a (nested) list.

    Despite the name, the result is a CVD-simulated RGB image, not an LMS
    representation.

    Args:
        image: One of
            * a list/tuple of images (possibly nested, e.g. ``[[img], [img]]``),
              whose structure is preserved and in which every image is converted;
            * a ``PIL.Image.Image``;
            * anything ``PIL.Image.open`` accepts (file path or file object).

    Returns:
        A PIL Image for a single input, or a list mirroring the input nesting.
    """
    if isinstance(image, (list, tuple)):
        # Recurse so flat lists, nested lists and multi-image entries all work
        # and no image is silently dropped.
        return [convert_to_LMS(item) for item in image]
    if isinstance(image, Image.Image):
        return cvd_simulator(image.convert("RGB"))
    return cvd_simulator(Image.open(image).convert("RGB"))

def new_process_images(images, **kwargs):
    """Image preprocessing wrapper: run the CVD simulation, then the original preprocessing.

    Args:
        images: Image input forwarded to ``convert_to_LMS``.
        **kwargs: Passed through unchanged to the original preprocessing method.

    Returns:
        Whatever the original preprocessing method returns for the simulated images.
    """
    return original_process_images(convert_to_LMS(images), **kwargs)
# Replace the processor's image preprocessing method with the CVD-simulating wrapper.
cvd_tokenizer.image_processor.preprocess = new_process_images

In [None]:
# -----------------------------
# Initialize the color evaluator
# -----------------------------
evaluator = ColorSentenceEvaluator()

# -----------------------------
# Load the dataset
# -----------------------------
# Use the path configured at the top of the notebook instead of a hard-coded file name.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

results = []


def _collect_qa_turns(messages):
    """Turn one sample's message list into a list of image-grounded QA turns.

    Args:
        messages: List of dicts with a "role" key ("system" / "user" /
            "assistant"), a "content" key, and optionally an "images" list on
            user messages.

    Returns:
        List of dicts with keys "system", "image", "question", "answer"; one
        per assistant message that has an image and a question before it.
    """
    system_prompt = ""
    image_path = None
    # Start every sample with no question, so nothing leaks in from the
    # previous sample and an early assistant message cannot hit an undefined name.
    current_question = None
    turns = []

    for m in messages:
        role = m["role"]
        if role == "system":
            system_prompt = m["content"]
        elif role == "user":
            if m.get("images"):
                # Remember the image path (first image only).
                image_path = m["images"][0]
                # NOTE(review): if the image-bearing message also carries text,
                # use it as the question; a later text-only user message
                # overrides it. Confirm against the dataset schema.
                content = m.get("content")
                if isinstance(content, str) and content:
                    current_question = content
            else:
                current_question = m["content"]
        elif role == "assistant":
            current_answer = m["content"]
            # Emit one complete QA sample.
            if image_path and current_question and current_answer:
                turns.append({
                    "system": system_prompt,
                    "image": image_path,
                    "question": current_question,
                    "answer": current_answer
                })
    return turns


# ==============================
# Iterate over the dataset
# ==============================
for sample in tqdm(data, desc="Evaluating dataset"):
    # Parse the multi-turn structure.
    gt_turns = _collect_qa_turns(sample.get("messages", []))

    # ==============================
    # Run inference + evaluation for each turn
    # ==============================
    for turn in gt_turns:
        img_path = turn["image"]
        gt = turn["answer"]
        question = turn["question"]

        # Build the input messages (Qwen-VL format).
        chat_messages = [
            {"role": "system", "content": turn["system"]},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img_path},
                    {"type": "text", "text": question},
                ],
            }
        ]

        # === Model inference ===
        input_text = cvd_tokenizer.apply_chat_template(chat_messages, add_generation_prompt = True)
        # Follow the device chosen by device_map="auto" instead of assuming "cuda".
        inputs = cvd_tokenizer(
            img_path,
            input_text,
            add_special_tokens = False,
            return_tensors = "pt",
        ).to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=256)
        # Strip the prompt tokens so only the newly generated answer is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = cvd_tokenizer.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]

        # === Color accuracy ===
        score = evaluator.evaluate(output_text, gt)

        results.append({
            # Samples without an "id" are recorded with None rather than aborting the run.
            "id": sample.get("id"),
            "question": question,
            "predict": output_text,
            "ground_truth": gt,
            "color_accuracy": score
        })


In [None]:

# ==============================
# Summarize the results
# ==============================
if results:
    mean_acc = sum(r["color_accuracy"] for r in results) / len(results)
    print(f"\nAverage color-object accuracy: {mean_acc:.3f} over {len(results)} QA pairs")
else:
    # No QA pair was evaluated: avoid a ZeroDivisionError and say so explicitly.
    mean_acc = float("nan")
    print("\nNo QA pairs were evaluated; average color-object accuracy is undefined.")

# Persist the per-turn records (an empty list when nothing was evaluated).
with open("color_eval_results.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)