In [1]:
from pathlib import Path
import json
import pickle

from transformers import AutoModelForCausalLM, AutoTokenizer

In [28]:
# Checkpoint used by every cell below (tokenizer, HF model, vLLM engine).
# Exactly one assignment should be active; the others are kept as alternatives.
# model_name = "Qwen/Qwen3-4B-Thinking-2507"
model_name = "Qwen/Qwen3-4B-Instruct-2507"
# model_name = "allenai/Olmo-3-7B-Instruct"
# model_name = "allenai/Olmo-3-7B-Think"

In [29]:
# Tokenizer for the selected checkpoint; used later to apply the chat template
# and to encode/decode batches for the HF inference path.
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Load ARC AGI prompts

In [78]:
# evaluation.txt is read as one entry per line; each entry names a
# <task id>.json file in the evaluation directory. Load the first one.
tasks_eval = Path("/home/bw/ARC-AGI/data/evaluation.txt").read_text().splitlines()
tid = tasks_eval[0]
path = Path("/home/bw/ARC-AGI/data/evaluation").joinpath(f"{tid}.json")
data = json.loads(path.read_text())

In [79]:
tid

'ccd554ac'

In [80]:
def arr2str(matrix):
    """Render a grid (iterable of rows) as a bracketed, multi-line string.

    Each row becomes ``[a, b, c]``. Rows are separated by a comma and a
    newline, and every row after the first is indented by one space so it
    lines up under the outer opening bracket. An empty grid yields ``"[]"``.
    """
    rendered_rows = []
    for row in matrix:
        cells = ", ".join(map(str, row))
        rendered_rows.append(f"[{cells}]")
    return "[" + ",\n ".join(rendered_rows) + "]"

def generate_task_strings(data):
    """Build one task string per test case of a task.

    ``data`` is read through its ``'train'`` and ``'test'`` keys. Every entry
    must provide an ``'input'`` grid; training entries must also provide an
    ``'output'`` grid. Each returned string holds all training examples
    (numbered from 1), followed by a single test input and an open
    ``Output:`` line for the model to complete.
    """
    train_parts = ['TRAIN\n\n']
    for number, example in enumerate(data['train'], start=1):
        train_parts.append(
            f"EXAMPLE {number}\n"
            f"Input: {arr2str(example['input'])}\n"
            f"Output: {arr2str(example['output'])}\n\n"
        )
    train_block = ''.join(train_parts)

    # The training block is shared; only the test input differs per string.
    return [
        train_block + 'TEST\n\n' + f"Input: {arr2str(case['input'])}\nOutput:\n"
        for case in data['test']
    ]

In [81]:
# PROMPT_TEMPLATE = """You are a bot that is very good at solving puzzles. Below is a list of input and output pairs with a pattern.
# Identify the pattern, then apply that pattern to the test input to give a final output.
# Give valid json list of lists back as your answer to the test question.
# Do not over think. If you think you've found the answer, then stop thinking and output the answer. 
# You only have about max 8,000 tokens to think. So output an answer before that even if you are not sure of an answer.

# {task_string}"""

# Active prompt template: fixed instructions followed by the formatted task
# text, which is substituted for {task_string} via str.format.
PROMPT_TEMPLATE = """You are a bot that is very good at solving puzzles. Below is a list of input and output pairs with a pattern.
Identify the pattern, then apply that pattern to the test input to give a final output.
Give valid json list of lists back as your answer to the test question.

{task_string}"""
    

In [82]:
# Build one prompt per test case for the first two evaluation tasks.
prompts = []
for tid in tasks_eval[:2]:
    path = Path("/home/bw/ARC-AGI/data/evaluation").joinpath(f"{tid}.json")
    data = json.loads(path.read_text())
    prompts.extend(
        PROMPT_TEMPLATE.format(task_string=tsk_str)
        for tsk_str in generate_task_strings(data)
    )
print(f'Number of prompts: {len(prompts)}')

Number of prompts: 2


### Load AIW prompts

In [9]:
# Load the AIW prompt records from JSON.
# NOTE(review): this rebinds `path` and `data`, which held the ARC task path/dict
# from the previous section — confirm nothing later still expects the ARC values.
path = Path("/home/bw/knowledge-sep/notebooks/aiw_prompts.json")
data = json.loads(path.read_text())

In [11]:
# Keep only the prompt text of the records with these ids.
# This replaces whatever `prompts` list was built earlier in the session.
selected_ids = (637, 638, 639, 640, 641, 642)
prompts = [obj['prompt'] for obj in data if obj['id'] in selected_ids]

In [12]:
print(prompts[1])

Alice has 3 male colleagues and she also has 2 female colleagues in total. All these mentioned persons in the circle around Alice are colleagues of each other. Bob has 2 female colleagues and 1 male colleague in total. All these mentioned persons in the circle around Bob are colleagues of each other. The people in the circle around Bob do not have other colleagues aside - with the only exception of Matilda. She is colleague of Bob, being part of Bob's circle, and she is also colleague of Alice, being part of Alice's circle. All the mentioned persons have no colleagues beyond the already described group of people. How many female colleagues does Matilda have? Before providing answer to this problem, think carefully step by step and double check the path to the correct solution for any mistakes. Provide then the final answer in following form: "### Answer: ".


### Do Inference

In [30]:
# prompts = ["Give me a short introduction to large language model.", "Solve x+1 = 2"]
# Wrap each prompt as a single user turn and render it to plain text with the
# tokenizer's chat template (tokenization happens later, per backend).
texts = [
    tokenizer.apply_chat_template(
        [{"role": "user", "content": p}],
        tokenize=False,
        add_generation_prompt=True,
    )
    for p in prompts
]

In [31]:
print(texts[0])

<|im_start|>user
Alice has 3 male colleagues and she also has 6 female colleagues in total. All these mentioned persons in the circle around Alice are colleagues of each other. Bob has 2 female colleagues and 1 male colleague in total. All these mentioned persons in the circle around Bob are colleagues of each other. The people in the circle around Bob do not have other colleagues aside - with the only exception of Matilda. She is colleague of Bob, being part of Bob's circle, and she is also colleague of Alice, being part of Alice's circle. All the mentioned persons have no colleagues beyond the already described group of people. How many female colleagues does Matilda have? Before providing answer to this problem, think carefully step by step and double check the path to the correct solution for any mistakes. Provide then the final answer in following form: "### Answer: ".<|im_end|>
<|im_start|>assistant



### Inference with Hugging Face Transformers

In [4]:
# Load the causal-LM weights for the Hugging Face inference path.
# `dtype` replaces the `torch_dtype` keyword, which the installed transformers
# release reports as deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
# "auto" for both arguments leaves precision and device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype="auto",
    device_map="auto"
)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
# Tokenize the whole batch into PyTorch tensors on the model's device.
# Padding is applied on the left, so all prompts end at the same column —
# the prompt-stripping slice applied to the generated ids depends on this.
model_inputs = tokenizer(texts, return_tensors="pt", padding=True, padding_side="left").to(model.device)

In [13]:
model_inputs.input_ids.shape

torch.Size([3, 9913])

In [14]:
# Generate up to 8192 new tokens per prompt (the trailing comment records
# 32768 as an alternative budget).
# NOTE(review): no sampling arguments are passed here — presumably the
# checkpoint's default generation config applies; confirm.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=8192 #32768
)

In [15]:
# Persist the raw generated token ids (prompt columns included) for later inspection.
with Path('./arc_test1.pkl').open('wb') as sink:
    pickle.dump(generated_ids, sink)

In [23]:
# decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [16]:
# The generated ids start with the (padded) prompt; every row of the batch has
# the same prompt width, so slice those columns off to keep only new tokens.
prompt_len = model_inputs.input_ids.shape[1]
output_ids = generated_ids[:, prompt_len:]

In [17]:
# Decode each continuation back to text, dropping special tokens.
generations = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

In [18]:
# Split the first generation at the closing think tag: the text after the last
# '</think>' is the answer, everything before it is reasoning. When the tag
# never appears, the whole text ends up in `content` and the reasoning is empty.
gen = generations[0]
*reasoning_parts, answer_part = gen.split('</think>')
thinking_content = ''.join(reasoning_parts).strip("\n")
content = answer_part.strip("\n")
print("--------- THINKING CONTENT: ---------\n\n" + thinking_content)  # no opening <think> tag
print("--------- CONTENT: ---------\n\n" + content)

--------- THINKING CONTENT: ---------


--------- CONTENT: ---------

Okay, let's try to figure out the pattern here. So, we have three examples where the input is a grid of mostly zeros with some 8s, and the output is a transformed grid. The task is to apply the same pattern to the test input.

First, I need to look at the examples to see what's happening. Let me start by analyzing each example.

**EXAMPLE 1:**
Input is a 12x12 grid. There's a small pattern of 8s in the middle. For instance, row 3 (0-indexed?) has an 8 at position 5. Then row 4 has 8s at positions 4, 6. Then row 5 has an 8 at position 5. The output is a more complex grid with 8s arranged in a sort of wave-like pattern or maybe a specific shape.

Wait, the output in example 1 has 8s forming a sort of rectangle or maybe a diamond? Hmm. Let me look more carefully.

Wait, the input for example 1 has 8s in rows 3-5 (assuming 0-based index). Let's see:

Row 3 (4th row): [0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0]
Row 4: [0, 0, 0, 

### Inference with vLLM

In [15]:
from vllm import LLM, SamplingParams

In [32]:
# Drop the engine left over from a previous run of the next cell before
# building a new one (presumably so its GPU memory can be released — confirm).
# On a fresh kernel `llm` does not exist yet and a bare `del llm` raises
# NameError, which breaks Restart & Run All; only delete it when present.
if "llm" in globals():
    del llm

In [33]:
# Create the vLLM engine for the selected checkpoint.
llm = LLM(
    model=model_name,
    max_model_len=32_768,        # cap on the engine's model (context) length
    # gpu_memory_utilization=0.9  # optional, adjust as needed
)

INFO 12-11 19:07:38 [utils.py:253] non-default args: {'seed': None, 'max_model_len': 32768, 'disable_log_stats': True, 'model': 'Qwen/Qwen3-4B-Instruct-2507'}
INFO 12-11 19:07:38 [model.py:637] Resolved architecture: Qwen3ForCausalLM
INFO 12-11 19:07:38 [model.py:1750] Using max model len 32768
INFO 12-11 19:07:38 [scheduler.py:228] Chunked prefill is enabled with max_num_batched_tokens=8192.
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:07:39 [core.py:93] Initializing a V1 LLM engine (v0.12.0) with config: model='Qwen/Qwen3-4B-Instruct-2507', speculative_config=None, tokenizer='Qwen/Qwen3-4B-Instruct-2507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:07:42 [parallel_state.py:1200] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.142.0.2:42485 backend=nccl
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:07:42 [parallel_state.py:1408] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[0;36m(E

[0;36m(EngineCore_DP0 pid=7249)[0;0m [2025-12-11 19:07:43] INFO _optional_torch_c_dlpack.py:119: JIT-compiling torch-c-dlpack-ext to cache...
[0;36m(EngineCore_DP0 pid=7249)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:07:45 [cuda.py:411] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:25 [default_loader.py:308] Loading weights took 39.23 seconds
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:26 [gpu_model_runner.py:3549] Model loading took 7.6065 GiB memory and 41.980817 seconds
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:39 [backends.py:655] Using cache directory: /home/bw/.cache/vllm/torch_compile_cache/51a0871892/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:39 [backends.py:715] Dynamo bytecode transform time: 12.54 s
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:44 [backends.py:216] Directly load the compiled graph(s) for dynamic shape from the cache, took 4.977 s
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:47 [monitor.py:34] torch.compile takes 17.51 s in total
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:49 [gpu_worker.py:359] Available KV cache memory: 10.77 GiB
[0;36m(EngineCore_DP0 pid=7249)[0;0

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|███████████████████████████████████████████████| 51/51 [00:04<00:00, 11.01it/s]
Capturing CUDA graphs (decode, FULL): 100%|██████████████████████████████████████████████████████████████████| 35/35 [00:02<00:00, 12.93it/s]


[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:58 [gpu_model_runner.py:4466] Graph capturing finished in 8 secs, took 0.58 GiB
[0;36m(EngineCore_DP0 pid=7249)[0;0m INFO 12-11 19:08:58 [core.py:254] init engine (profile, create kv cache, warmup model) took 32.24 seconds
INFO 12-11 19:09:00 [llm.py:343] Supported tasks: ['generate']


In [34]:
# Sampling settings, one variant per checkpoint; keep the active line in sync
# with `model_name`. The URL after each variant references that model's page.
# sampling_params = SamplingParams(max_tokens=8192, temperature=0.6, top_p=0.95, top_k=20, min_p=0) # https://huggingface.co/Qwen/Qwen3-4B-Thinking-2507
sampling_params = SamplingParams(max_tokens=4096, temperature=0.7, top_p=0.8, top_k=20, min_p=0) # https://huggingface.co/Qwen/Qwen3-4B-Instruct-2507
# sampling_params = SamplingParams(
#     temperature=0.6,
#     top_p=0.95,
#     max_tokens=8192,
# ) # https://huggingface.co/allenai/Olmo-3-7B-Instruct, https://huggingface.co/allenai/Olmo-3-7B-Think

In [35]:
# Run batched generation over all chat-formatted prompts with the sampling
# settings above; the result is indexed per prompt in the cells that follow.
outputs = llm.generate(texts, sampling_params)

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|                                           | 0/6 [00:00<?, ?it/s, est. speed input: 0.…

In [17]:
outputs

[RequestOutput(request_id=0, prompt="<|im_start|>user\nYou are a bot that is very good at solving puzzles. Below is a list of input and output pairs with a pattern.\nIdentify the pattern, then apply that pattern to the test input to give a final output.\nGive valid json list of lists back as your answer to the test question.\nDo not over think. If you think you've found the answer, then stop thinking and output the answer. \nYou only have about max 8,000 tokens to think. So output an answer before that even if you are not sure of an answer.\n\nTRAIN\n\nEXAMPLE 1\nInput: [[2, 0, 2],\n [0, 2, 0],\n [2, 2, 2]]\nOutput: [[2, 0, 2, 2, 0, 2, 2, 0, 2],\n [0, 2, 0, 0, 2, 0, 0, 2, 0],\n [2, 2, 2, 2, 2, 2, 2, 2, 2],\n [2, 0, 2, 2, 0, 2, 2, 0, 2],\n [0, 2, 0, 0, 2, 0, 0, 2, 0],\n [2, 2, 2, 2, 2, 2, 2, 2, 2],\n [2, 0, 2, 2, 0, 2, 2, 0, 2],\n [0, 2, 0, 0, 2, 0, 0, 2, 0],\n [2, 2, 2, 2, 2, 2, 2, 2, 2]]\n\nEXAMPLE 2\nInput: [[1, 0],\n [1, 1]]\nOutput: [[1, 0, 1, 0],\n [1, 1, 1, 1],\n [1, 0, 1, 0],\n 

In [49]:
# Pick one result to inspect (index 5) and show the exact prompt text stored with it.
output = outputs[5]
print(output.prompt)

<|im_start|>user
Alice has 3 male colleagues and she also has 5 female colleagues in total. All these mentioned persons in the circle around Alice are colleagues of each other. Bob has 2 female colleagues and 1 male colleague in total. All these mentioned persons in the circle around Bob are colleagues of each other. The people in the circle around Bob do not have other colleagues aside - with the only exception of Matilda. She is colleague of Bob, being part of Bob's circle, and she is also colleague of Alice, being part of Alice's circle. All the mentioned persons have no colleagues beyond the already described group of people. How many female colleagues does Matilda have? Before providing answer to this problem, think carefully step by step and double check the path to the correct solution for any mistakes. Provide then the final answer in following form: "### Answer: ".<|im_end|>
<|im_start|>assistant



In [50]:
print(output.outputs[0].text)

Let's go step by step to solve this problem carefully.

---

### Step 1: Understand the setup

We have a group of people who are in **two circles**:

- **Alice's circle**: Alice has 3 male colleagues and 5 female colleagues.  
  So, in Alice's circle:  
  - 3 males  
  - 5 females  
  - Alice herself is not counted in the "colleagues" — so the circle includes her 3 male and 5 female colleagues.

- **Bob's circle**: Bob has 2 female colleagues and 1 male colleague.  
  So, in Bob's circle:  
  - 1 male  
  - 2 females  
  - Bob himself is not counted in the colleagues.

Now, **the people in Bob's circle do not have other colleagues except Matilda**, who is a special case.

Matilda is:
- A colleague of Bob → part of Bob’s circle
- A colleague of Alice → part of Alice’s circle

And she is the **only** person who is in both circles.

Also, **all mentioned persons have no colleagues beyond the described group** — meaning no one outside these two circles.

So the entire group is the union of

In [51]:
# Save the full list of vLLM results for later analysis.
# NOTE(review): the path is relative to the kernel's working directory —
# confirm that a ./notebooks/ folder exists there before running.
with open('./notebooks/AIW_CC.Qwen3-4B-Instruct-2507.pkl', 'wb') as f:
    pickle.dump(outputs, f)

In [None]:
# Show every prompt next to the first completion returned for it.
for result in outputs:
    print(f"Prompt: {result.prompt!r}, Generated text: {result.outputs[0].text!r}")