In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%%html
<style>
/* Force a transparent background on elements carrying this class, overriding
   any other rule via !important. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter-widgets colour and font-size variables at the VS Code
   editor's theme variables so widget text follows the editor theme. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

In [24]:
import asyncio
from lib.arc_agi import get_arc_agi_tasks
from lib.tasks import ChatCompletionParams, get_task_results
import openai
import os

# Materialise the ARC-AGI tasks once, then carve the list into fixed,
# position-based splits. `partial_credit=0.5` is passed through to the loader
# unchanged; its exact meaning is defined in lib.arc_agi.
arc_agi_tasks = list(get_arc_agi_tasks(partial_credit=0.5))

# The list used to be called `zebra_grid_tasks` even though it holds ARC-AGI
# tasks. The old name is kept as an alias so any cell that still refers to it
# keeps working.
zebra_grid_tasks = arc_agi_tasks

# NOTE(review): `benchmark_tasks` is not disjoint from the other splits. It is
# the first 80 tasks, i.e. all 64 validation tasks plus the first 16 test
# tasks. Confirm that this overlap is intended before comparing numbers
# across splits.
benchmark_tasks = arc_agi_tasks[:80]
val_tasks = arc_agi_tasks[:64]
test_tasks = arc_agi_tasks[64:128]
train_tasks = arc_agi_tasks[128:]

# Displayed as the cell output: the size of each split.
len(benchmark_tasks), len(val_tasks), len(test_tasks), len(train_tasks)

(80, 64, 64, 272)

In [25]:
# OpenAI-compatible async clients, one per inference provider.
#
# The keys are read with os.environ[...] instead of os.getenv(...). With
# getenv, a missing variable silently becomes api_key=None, and the openai
# client then falls back to OPENAI_API_KEY. That either sends the OpenAI key
# to a third-party endpoint or raises an error that names the wrong variable.
# Indexing os.environ fails immediately with a KeyError naming the variable
# that is actually missing.
fireworks = openai.AsyncOpenAI(
    base_url="https://api.fireworks.ai/inference/v1",
    api_key=os.environ["FIREWORKS_API_KEY"],
)
openrouter = openai.AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)
together = openai.AsyncOpenAI(
    base_url="https://api.together.xyz/v1",
    api_key=os.environ["TOGETHER_API_KEY"],
)

results = await asyncio.gather(
    get_task_results(
        tasks=benchmark_tasks,
        client=fireworks,
        model="accounts/fireworks/models/deepseek-r1",
        params=ChatCompletionParams(
            max_tokens=2**17,
            logprobs=True,
            top_logprobs=5,
        ),
        pbar_desc="deepseek-r1",
        prices=(8.0, 8.0),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-qwen-1.5b",
        pbar_desc="r1-qwen-1.5b",
        prices=(0.18, 0.18),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="qwen/qwen-2.5-7b-instruct",
        pbar_desc="qwen-2.5-7b",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["DeepInfra"], "allow_fallbacks": False}},
        ),
        prices=(0.0025, 0.005),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-qwen-14b",
        pbar_desc="r1-qwen-14b",
        prices=(1.6, 1.6),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-qwen-32b",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["DeepInfra"], "allow_fallbacks": False}},
        ),
        pbar_desc="r1-qwen-32b",
        prices=(0.12, 0.18),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-llama-70b:free",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["Targon"], "allow_fallbacks": False}},
        ),
        pbar_desc="r1-llama-70b:targon",
        prices=(0.0, 0.0),
    ),
    get_task_results(
        tasks=benchmark_tasks,
        client=openrouter,
        model="deepseek/deepseek-r1-distill-llama-70b",
        params=ChatCompletionParams(
            extra_body={"provider": {"order": ["SambaNova"], "allow_fallbacks": False}},
        ),
        pbar_desc="r1-llama-70b:samba",
        prices=(0.7, 1.4),
    ),
)

deepseek-r1:   0%|          | 0/80 [00:00<?, ?it/s]

r1-qwen-1.5b:   0%|          | 0/80 [00:00<?, ?it/s]

qwen-2.5-7b:   0%|          | 0/80 [00:00<?, ?it/s]

r1-qwen-14b:   0%|          | 0/80 [00:00<?, ?it/s]

r1-qwen-32b:   0%|          | 0/80 [00:00<?, ?it/s]

r1-llama-70b:targon:   0%|          | 0/80 [00:00<?, ?it/s]

r1-llama-70b:samba:   0%|          | 0/80 [00:00<?, ?it/s]

In [23]:
# Print the message text of the first choice of the first chat completion for
# the first task result of the first run in `results`. print() is used rather
# than a bare expression so newlines render instead of appearing as "\n".
first_task_result = results[0][0]
first_choice = first_task_result.chat_completions[0].choices[0]
print(first_choice.message.content)

<think>
Okay, so I need to figure out the rule that transforms these input grids to the output grids based on the examples provided. Let me take a look at each example carefully and see if I can spot a pattern.

Starting with Example 1:
Input:
0 0 5
0 5 0
5 0 0
Output:
3 3 3
4 4 4
2 2 2

Hmm. The input has 5s arranged in a descending diagonal from right to left. The output has each row filled with 3, then 4, then 2. I notice that in Example 1's input, each 5 is in a different row and column. Maybe the position of the 5s affects the output. Let's look for where the 5s are. For the first example, in the first row, the 5 is at column 3, row 1. The second row, column 2, row 2. Third row, column 1, row 3. So the 5s form a diagonal from top-right to bottom-left. The output rows are all 3s, then 4s, then 2s. Not sure how that connects yet.

Example 2:
Input:
0 0 5
0 0 5
0 0 5
Output:
3 3 3
3 3 3
3 3 3

Here, all the 5s are in the third column of each row. The output is all 3s. So when all 5s 

In [10]:
# Debugging aid: re-raise the first exception recorded for the second run in
# `results`, so its traceback can be inspected.
# The search stops at the first exception instead of building the full list.
# When nothing failed, the cell reports that rather than dying with an
# unrelated IndexError, so it no longer breaks Restart & Run All on a clean run.
first_exception = next(
    (exception for result in results[1] for exception in result.exceptions),
    None,
)
if first_exception is None:
    print("No exceptions recorded for results[1].")
else:
    raise first_exception

AssertionError: 