In [None]:
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
from datasets import load_dataset
from PIL import Image
import torch
import re
import json
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

from huggingface_hub import login
login(token='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')

# model_id = "google/paligemma2-3b-pt-448"
model_id = "google/paligemma2-3b-mix-448"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).to(device).eval()
processor = AutoProcessor.from_pretrained(model_id)

dataset = load_dataset("MMInstruction/Clevr_CoGenT_ValB", split="train")
batch_size = 5

Using device: cuda


Downloading data: 100%|██████████| 412M/412M [00:17<00:00, 24.1MB/s] 
Downloading data: 100%|██████████| 409M/409M [00:16<00:00, 24.7MB/s] 


Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
print(dataset[0])

{'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=480x320 at 0x25CFF63D3F0>, 'problem': 'How many other things are the same shape as the red thing?', 'solution': '<answer> 1 </answer>'}


In [None]:
def extract_answer(text):
    match = re.search(r"<answer>\s*(.*?)\s*</answer>", text)
    return match.group(1).strip().lower() if match else text.strip().lower()

def normalize_answer(text):
    text = text.lower().strip()
    if "\n" in text:
        text = text.split("\n")[-1].strip()
        
    numbers = re.findall(r"\d+", text)
    if numbers:
        return numbers[0]
    if "yes" in text:
        return "yes"
    elif "no" in text:
        return "no"
    return text

def compute_accuracy(responses, ground_truths, questions=None, output_file=None):
    correct = 0
    results = []
    for i, (pred, gt) in enumerate(zip(responses, ground_truths)):
        pred_norm = normalize_answer(pred)
        gt_norm = normalize_answer(extract_answer(gt))
        is_correct = pred_norm == gt_norm
        if is_correct:
            correct += 1


        item = {
            "id": i,
            "prediction": pred,
            "prediction_normalized": pred_norm,
            "ground_truth": gt,
            "ground_truth_normalized": gt_norm,
            "is_correct": is_correct,
        }       
        if questions:
            item["question"] = questions[i]
        results.append(item)
        print(item)
        
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                "accuracy": correct / len(responses),
                "total": len(responses),
                "correct": correct,
                "results": results
            }, f, indent=2, ensure_ascii=False)
    return correct / len(responses)


In [None]:
# Evaluation loop
SEQLEN = 128
responses, ground_truths, problems = [], [], []
QUESTION_TEMPLATE = "{Question}"
save_path = './checkpoints/paligemma_hf_eval_CoGenT_ValB.json'

for i in tqdm(range(0, len(dataset), batch_size)):
    batch = dataset[i:i+batch_size]
    images = [image.convert("RGB") for image in batch["image"]]
    questions = [QUESTION_TEMPLATE.format(Question=content["problem"]) for content in batch]
    gt_answers = [a.lower() for a in batch["solution"]]

    prompts = [f"{q}" for q in questions]
    inputs = processor(text=prompts, images=images, return_tensors="pt", padding=True).to("cuda")
    # inputs = processor(prompts, images=images, return_tensors="pt", padding=True).to("cuda")

    with torch.no_grad():
        outputs = model.generate(**inputs,
            do_sample=False,
            eos_token_id=processor.tokenizer.eos_token_id,
            pad_token_id=processor.tokenizer.pad_token_id,)

        # outputs = model.generate(**inputs, max_new_tokens=SEQLEN)
        preds = processor.batch_decode(outputs, skip_special_tokens=True)

    
    responses.extend(preds)
    ground_truths.extend(gt_answers)
    problems.extend(questions)

    if i%100==0:
        accuracy = compute_accuracy(responses, ground_truths, problems, save_path)
        print(f"Accuracy: {accuracy:.2%}")

accuracy = compute_accuracy(responses, ground_truths, problems, save_path)
print(f"Accuracy: {accuracy:.2%}")

  0%|          | 0/1000 [00:00<?, ?it/s]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  0%|          | 1/1000 [00:08<2:19:29,  8.38s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


{'id': 0, 'prediction': 'How many other things are the same shape as the red thing?\n3', 'prediction_normalized': '3', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'How many other things are the same shape as the red thing?'}
{'id': 1, 'prediction': 'There is a small green rubber object; what number of green objects are behind it?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'There is a small green rubber object; what number of green objects are behind it?'}
{'id': 2, 'prediction': 'How many cylinders are either big purple things or big objects?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 2 </answer>', 'ground_truth_normalized': '2', 'is_correct': True, 'question': 'How many cylinders are either big purple things or big objects?'}
{'id': 3, 'prediction': 'What number of cyan things are rubber things or small blocks?\

  0%|          | 2/1000 [00:10<1:19:55,  4.80s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  0%|          | 3/1000 [00:12<1:00:48,  3.66s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  0%|          | 4/1000 [00:15<51:41,  3.11s/it]  You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recomm

{'id': 0, 'prediction': 'How many other things are the same shape as the red thing?\n3', 'prediction_normalized': '3', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'How many other things are the same shape as the red thing?'}
{'id': 1, 'prediction': 'There is a small green rubber object; what number of green objects are behind it?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'There is a small green rubber object; what number of green objects are behind it?'}
{'id': 2, 'prediction': 'How many cylinders are either big purple things or big objects?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 2 </answer>', 'ground_truth_normalized': '2', 'is_correct': True, 'question': 'How many cylinders are either big purple things or big objects?'}
{'id': 3, 'prediction': 'What number of cyan things are rubber things or small blocks?\

  2%|▏         | 22/1000 [00:56<37:31,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  2%|▏         | 23/1000 [00:58<37:25,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  2%|▏         | 24/1000 [01:01<37:26,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommend

{'id': 0, 'prediction': 'How many other things are the same shape as the red thing?\n3', 'prediction_normalized': '3', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'How many other things are the same shape as the red thing?'}
{'id': 1, 'prediction': 'There is a small green rubber object; what number of green objects are behind it?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'There is a small green rubber object; what number of green objects are behind it?'}
{'id': 2, 'prediction': 'How many cylinders are either big purple things or big objects?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 2 </answer>', 'ground_truth_normalized': '2', 'is_correct': True, 'question': 'How many cylinders are either big purple things or big objects?'}
{'id': 3, 'prediction': 'What number of cyan things are rubber things or small blocks?\

  4%|▍         | 42/1000 [01:42<36:46,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  4%|▍         | 43/1000 [01:44<36:41,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  4%|▍         | 44/1000 [01:47<36:41,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommend

{'id': 0, 'prediction': 'How many other things are the same shape as the red thing?\n3', 'prediction_normalized': '3', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'How many other things are the same shape as the red thing?'}
{'id': 1, 'prediction': 'There is a small green rubber object; what number of green objects are behind it?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'There is a small green rubber object; what number of green objects are behind it?'}
{'id': 2, 'prediction': 'How many cylinders are either big purple things or big objects?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 2 </answer>', 'ground_truth_normalized': '2', 'is_correct': True, 'question': 'How many cylinders are either big purple things or big objects?'}
{'id': 3, 'prediction': 'What number of cyan things are rubber things or small blocks?\

  6%|▌         | 62/1000 [02:28<35:57,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  6%|▋         | 63/1000 [02:31<35:55,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  6%|▋         | 64/1000 [02:33<35:55,  2.30s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommend

{'id': 0, 'prediction': 'How many other things are the same shape as the red thing?\n3', 'prediction_normalized': '3', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'How many other things are the same shape as the red thing?'}
{'id': 1, 'prediction': 'There is a small green rubber object; what number of green objects are behind it?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'There is a small green rubber object; what number of green objects are behind it?'}
{'id': 2, 'prediction': 'How many cylinders are either big purple things or big objects?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 2 </answer>', 'ground_truth_normalized': '2', 'is_correct': True, 'question': 'How many cylinders are either big purple things or big objects?'}
{'id': 3, 'prediction': 'What number of cyan things are rubber things or small blocks?\

  8%|▊         | 82/1000 [03:14<35:18,  2.31s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  8%|▊         | 83/1000 [03:17<35:15,  2.31s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.
  8%|▊         | 84/1000 [03:19<35:12,  2.31s/it]You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommend

KeyboardInterrupt: 

In [4]:
accuracy = compute_accuracy(responses, ground_truths, problems)
print(f"Accuracy: {accuracy:.2%}")

{'id': 0, 'prediction': 'How many other things are the same shape as the red thing?\n3', 'prediction_normalized': '3', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'How many other things are the same shape as the red thing?'}
{'id': 1, 'prediction': 'There is a small green rubber object; what number of green objects are behind it?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 1 </answer>', 'ground_truth_normalized': '1', 'is_correct': False, 'question': 'There is a small green rubber object; what number of green objects are behind it?'}
{'id': 2, 'prediction': 'How many cylinders are either big purple things or big objects?\n2', 'prediction_normalized': '2', 'ground_truth': '<answer> 2 </answer>', 'ground_truth_normalized': '2', 'is_correct': True, 'question': 'How many cylinders are either big purple things or big objects?'}
{'id': 3, 'prediction': 'What number of cyan things are rubber things or small blocks?\