In [14]:
# Requiremenet to use Qwen2VLForConditionalGeneration (see https://github.com/QwenLM/Qwen2-VL?tab=readme-ov-file#quickstart)
!pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate 

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830
  Cloning https://github.com/huggingface/transformers (to revision 21fac7abba2a37fae86106f87fcf9974fd1e3830) to /tmp/pip-req-build-2olquy3_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-2olquy3_
  fatal: unable to access 'https://github.com/huggingface/transformers/': Could not resolve host: github.com
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/huggingface/transformers[0m[32m [0m[32m/tmp/[0m[32mpip-req-build-2olquy3_[0m did not run successfully.
  [31m│[0m exit code: [1;36m128[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m



In [15]:
import json
import numpy as np
import pandas as pd
from PIL import Image, ImageOps
from pathlib import Path
import matplotlib.pyplot as plt

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Solving ARC with VLMs

Solving an ARC puzzle requires both visual analysis and logical reasoning making Vision Language Models (VLM) a natural candidate for this task.  In this notebook, we explore VLM ability to solve ARC puzzles, by answering the following questions:

1. Can VLM describe puzzles ? 
2. Can they understand the underlying pattern between the input and the output ? 
3. Are they able to identify the correct output in a set of possible outputs ? 

We use `Qwen2-VL-7B-Instruct`, the highest ranked 7B model in the [LMSYS vision arena](https://lmarena.ai/?leaderboard) (#13 as of 2023/09/24).

In [16]:
# Load dataset 

SCALE_FACTOR = 3 * 14
COLOR_LIST = ["black", "blue", "red", "green", "yellow", "gray", "magenta", "orange", "skyblue", "brown", "lightgray"]
COLOR_MAP = np.array([Image.new("RGB", (1, 1), color=color).getpixel((0, 0)) for color in COLOR_LIST])


def render(grid):
    # Upscale grid
    grid = np.repeat(grid, SCALE_FACTOR, axis=0)
    grid = np.repeat(grid, SCALE_FACTOR, axis=1)

    # Add grid lines
    grid[::SCALE_FACTOR, :] = 10
    grid[SCALE_FACTOR - 1::SCALE_FACTOR, :] = 10
    grid[:, ::SCALE_FACTOR] = 10
    grid[:, SCALE_FACTOR - 1::SCALE_FACTOR] = 10

    # Convert to color
    grid = COLOR_MAP[grid]
    image = Image.fromarray(grid.astype(np.uint8))
    return image


class Task:
    def __init__(self, row):
        self.id = row.name
        self.train = [(render(x["input"]), render(x["output"])) for x in row["train"]]
        self.test = [(render(x["input"]), render(y)) for x, y in zip(row["test"], row["solution"])]

    def display(self):
        for name, data in [("Train", self.train), ("Test", self.test)]:
            for i, (input, output) in enumerate(data):
                for j, (subname, grid) in enumerate(
                    [("input", input), ("output", output)]
                ):
                    plt.subplot(1, 2, j + 1)
                    plt.imshow(grid)
                    plt.title(f"{name} #{i+1} {subname}")
                    plt.axis("off")
                plt.show()
    
    def __repr__(self):
        return f"Task {self.id} ({len(self.train)} train, {len(self.test)} test)"


class ArcDataset:
    def __init__(self, path):
        self.path = Path(path)
        X = json.load((self.path / "arc-agi_training_challenges.json").open())
        Y = json.load((self.path / "arc-agi_training_solutions.json").open())

        self.data = pd.DataFrame(X).T
        self.data["solution"] = pd.Series(Y)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, key):
        if isinstance(key, int):
            assert key < len(self)
            key = self.data.index[key]

        task = Task(self.data.loc[key])
        return task

    def __repr__(self):
        return f"ArcDataset(n={len(self)})"


dataset = ArcDataset("/kaggle/input/arc-prize-2024")
print(dataset)

ArcDataset(n=400)


In [17]:
# Load model

ckpt = "/kaggle/input/qwen2-vl/transformers/qwen2-vl-7b-instruct/1/"
model = Qwen2VLForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16, device_map="auto")
processor = AutoProcessor.from_pretrained(ckpt)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## 1. Describing a puzzle

In [18]:
def describe_puzzle(image, prompt):
    
    # Create prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[text], images=[image], return_tensors="pt")
    inputs = inputs.to(model.device)

    # Run inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids = generated_ids[0, inputs.input_ids.shape[1]:]
    generated_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return generated_text

In [19]:
# Sanity check

import requests
from io import BytesIO

url = "https://storage.googleapis.com/kaggle-organizations/3733/thumbnail.png"
response = requests.get(url)
image = Image.open(BytesIO(response.content))
display(image)

prompt = "Describe this image"
print(f"Prompt: {prompt}")
print(f"Answer: {describe_puzzle(image, prompt)}")

ConnectionError: HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /kaggle-organizations/3733/thumbnail.png (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7c8330cb01c0>: Failed to resolve 'storage.googleapis.com' ([Errno -3] Temporary failure in name resolution)"))

In [None]:
image = dataset[0].train[1][1]
display(image)

prompt = "Describe this image"
print(f"Prompt: {prompt}")
print(f"Answer: {describe_puzzle(image, prompt)}")

In [None]:
image = dataset[129].train[0][0]
display(image)

for question in [
    "What is the color of the square in the middle?",
    "What are the colors of the 5 big squares (black being background)?",
    "How many gray squares do you see?",
]:
    print(f"\nQuestion: {question}")
    print(f"Answer: {describe_puzzle(image, question)}")

## 2. Understanding the pattern

In [None]:
def understand_logic(task):
    
    # Create prompt
    images = list(sum(task.train, ()))
    content = [{"type": "text", "text": "What is the underlying pattern between the input and output following puzzles ?"}]
    for i in range(len(task.train)):
        content += [
            {"type": "text", "text": f"\n# Puzzle {i + 1}\nInput puzzle: "},
            {"type": "image"},
            {"type": "text", "text": "\nOutput puzzle: "},
            {"type": "image"},
        ]
    messages = [{"role": "user", "content": content}]

    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(text)

    inputs = processor(text=[text], images=images, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Run inference
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids = generated_ids[0, inputs.input_ids.shape[1]:]
    generated_text = processor.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

    return generated_text

In [None]:
task = dataset[72]
task.display()
print(understand_logic(task))

## 3. Classifying the correct output

For this task, we ask the assistant to predict the correct output for the test puzzle and ask it whether it's sure or not of its answer. We compare the top logits for the right output and 3 corrupted versions of it.

In [None]:
OPERATIONS = {
    "none": None,
    "flip": ImageOps.flip,
    "mirror": ImageOps.mirror,
    "invert": ImageOps.invert,
}

def classify(task, test_index=0):
    
    # Create prompt
    images = list(sum(task.train, ())) + list(task.test[test_index])
    test_output = images[-1]

    user_messages = []
    for i in range(len(task.train)):
        user_messages += [
            {"type": "text", "text": f"\n# Puzzle {i + 1}\nInput puzzle: "},
            {"type": "image"},
            {"type": "text", "text": "\nOutput puzzle: "},
            {"type": "image"},
        ]
    user_messages += [
        {"type": "text", "text": "\n# Test puzzle \nInput puzzle: "},
        {"type": "image"},
    ]

    assistant_messages = [
        {"type": "text", "text": "\nOutput puzzle: "},
        {"type": "image"},
    ]

    messages = [
        {"role": "system", "content": [{"type": "text", "text": "You are a visual puzzle expert. For each puzzle, analyze the pattern that transforms the input into the output, then apply the same pattern to find the correct output for a new input."}]},
        {"role": "user", "content": user_messages},
        {"role": "assistant", "content":assistant_messages},
        {"role": "user", "content": "Are you sure your answer is correct ? (Yes/No)"},
    ]

    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(text)
    
    for op_name, op in OPERATIONS.items():
        if op is not None:
            images[-1] = op(test_output)
            if images[-1] == test_output:
                print(f"\nSkipping operation {op_name} as it has no effect")
                continue

        inputs = processor(text=[text], images=images, return_tensors="pt")
        inputs = inputs.to("cuda")

        with torch.no_grad():
            output = model(**inputs)
            logits = output.logits[0, -1]
            del output
            torch.cuda.empty_cache()
        tops = torch.topk(logits, 2)

        print(f"\nTest image operation: {op_name}, top 2 logits:")
        display(images[-1])
        for k in range(2):
            print(f"Top {k+1}: {processor.decode(tops.indices[k].item())} (confidence: {tops.values[k].item()})")

In [None]:
task = dataset[0]
task.display()

classify(task, test_index=0)