In [10]:
import enum
import typing
import abc
import itertools
import dataclasses
from pathlib import Path
from types import SimpleNamespace

import attrs
import numpy as np
from IPython import display

from arc25.dsl import types, primitives
from arc25.dsl import api as dsl
from arc25 import tools, dataset, sandbox, symmetry

In [2]:
# Project root is the parent of the current working directory, made absolute.
proj_root = Path("..").resolve()
# Data files live under <root>/data, the solution database under <root>/data/solutions.
data_path = proj_root.joinpath("data")
db_root = data_path.joinpath("solutions")


In [4]:
# Load the challenge collection from the `all-challenges.cbor.xz` file under data_path.
ds = await dataset.Dataset.from_binary(data_path / "all-challenges.cbor.xz")
# Display the number of challenges and the sorted subset names.
len(ds.challenges), sorted(ds.subsets)

(1307,
 ['all',
  'arc-agi-1',
  'arc-agi-1.evaluation',
  'arc-agi-1.training',
  'arc-agi-2',
  'arc-agi-2.evaluation',
  'arc-agi-2.training',
  'concept-arc',
  'larc'])

In [3]:
solutions = await dataset.SolutionDB.load(db_root)
sol_with_code = sorted(k for k,v in solutions.solutions.items() if v.rule)
print(len(solutions.solutions), len(sol_with_code))
print(sol_with_code)

26 17
['00576224', '36fdfd69', '46442a0e', '54d82841', '5614dbcf', '642d658d', '6e19193c', '6f8cd79b', '9841fdad', '9f5f939b', 'a740d043', 'ca-00-01', 'ca-03-02', 'ca-03-04', 'ca-10-04', 'ce039d91', 'fd02da9e']


In [19]:
# Pick one challenge by key, together with its stored solution.
ckey = "ca-03-02"
chal = ds.challenges[ckey]
sol = solutions.solutions[ckey]
# Render the solution's explanation text as Markdown.
display.display(display.Markdown(sol.explanation))

**Hypothesis:**
- Black background.
- One horizontal and one vertical colored dotted line.
- Impossible to say which of the two is painted over the other.

**Rule:**
- Start either from the input, or a fully black canvas of the same shape.
- First, paint a solid vertical line at the position and in the color of the dotted vertical line in the input.
- Second, paint a solid horizontal line at the position and in the color of the dotted horizontal line in the input.

**Plan:**
- There are no distractions, so we can determine the line positions simply from the count of foreground cells.


In [9]:
# System message used as the default for PromptEncoder.system_msg.
system_msg = "You are a careful, code-generating ARC challenge solver assistant."
# Task description placed at the top of every user message by
# PromptEncoder.encode_prompt. The four numbered steps name the sections
# (`description`, `rule`, `plan`, `implementation`) the model should produce.
task_descr = """
# Task description
ARC challenges require a rule to be found that transforms input grids into output grids.
You are given a number of pairs of input and output grids,
plus a few more input grids only, for which the rule must generate the corresponding outputs.
The rule must be expressed as a python function `solver`.
Analyse the challenge carefully and in a structured way by:
 1.) Formulate a `description` of the relevant semantic entities in the inputs.
 2.) Describe the underlying `rule` in natural language or pseudo-code.
 3.) Create a `plan` on how to implement the rule in python.
 4.) Output an `implementation` in python, implementing that rule.
"""

In [16]:
class FactDefinition:
    """Common base type for fact generators.

    Carries no behaviour of its own; subclasses are expected to be callable
    with a challenge and to return a description string or a falsy value.
    """

@attrs.frozen
class PredicateFact(FactDefinition):
    """Fact whose text is emitted only when a predicate yields a truthy value.

    Attributes:
        descr: ``str.format`` template; may reference the predicate's result
            through the ``{pred}`` placeholder.
        predicate: callable taking a challenge and returning a value whose
            truthiness decides whether the fact applies.
    """
    descr: str
    predicate: typing.Callable

    def __call__(self, chal: dataset.Challenge) -> str | None:
        """Return the formatted description for ``chal``, or ``None`` if the predicate is falsy."""
        outcome = self.predicate(chal)
        if not outcome:
            return None
        return self.descr.format(pred=outcome)

def single_element_or_none(arg: list|tuple|set):
    if len(arg) == 1:
        ret, = arg
        return ret
    
def _uniform_scale_factor(chal: "dataset.Challenge", *, shrink: bool = False):
    """Return the one integer factor (> 1) relating input and output sizes, else ``None``.

    Every one of the first two axes of every training example must scale by
    the same exact integer factor. A single axis or example that does not
    conform (non-divisible, unchanged, or scaled the other way) invalidates
    the fact instead of being silently ignored.

    Args:
        chal: challenge whose ``train`` examples expose ``input.shape`` and
            ``output.shape``.
        shrink: when false, test ``output = factor * input``;
            when true, test ``input = factor * output``.

    Returns:
        The common factor, or ``None`` if there is none (including the case
        of no training examples).
    """
    factors = set()
    for e in chal.train:
        for i in range(2):
            num, den = e.output.shape[i], e.input.shape[i]
            if shrink:
                num, den = den, num
            if den <= 0:
                # Degenerate (empty) axis: no meaningful factor, and avoids division by zero.
                return None
            d, m = divmod(num, den)
            if m or d <= 1:
                return None
            factors.add(d)
    return single_element_or_none(factors)


# Facts evaluated by default for every challenge; each entry yields a line of
# text only if its predicate returns a truthy value.
default_facts = (
    # All inputs (train and test) share one shape; the shape itself is reported.
    PredicateFact(
        "Inputs of equal shape: {pred}",
        lambda chal:single_element_or_none(set(e.input.shape for e in chal.train+chal.test))
    ),
    # Every training output has the same shape as its input.
    PredicateFact(
        "Output shapes match input shapes",
        lambda chal:all(e.output.shape == e.input.shape for e in chal.train)
    ),
    # Every training output is an exact integer upscaling of its input.
    PredicateFact(
        "Output size uniformly increased by factor {pred}",
        lambda chal:_uniform_scale_factor(chal)
    ),
    # Every training output is an exact integer downscaling of its input.
    PredicateFact(
        "Output size uniformly decreased by factor {pred}",
        lambda chal:_uniform_scale_factor(chal, shrink=True)
    ),
    # TODO: facts about colours
)

In [30]:
# Short alias for the DSL colour enumeration.
C = types.Color
# One distinct single-letter code per colour (uniqueness is checked at the
# end of this cell).
single_char_color_codes = {
    C.BLACK: "k",
    C.BLUE: "b",
    C.BROWN: "n",
    C.CYAN: "c",
    C.GRAY: "h",
    C.GREEN: "g",
    C.MAGENTA: "m",
    C.ORANGE: "o",
    C.RED: "r",
    C.YELLOW: "y",
}
# Collection of bracket-like character pairs.
# NOTE(review): presumably candidates for wrapping colour codes in tokens; confirm intent.
parentheses = "«»‹›〔〕【】〖〗❪❫❲❳❬❭❨❩⟨⟩"
# Sanity check: number of distinct colours and number of distinct codes.
len(set(single_char_color_codes)), len(set(single_char_color_codes.values()))

(10, 10)

In [38]:
@attrs.frozen
class ReasonedSolution:
    """Immutable record of the four optional text sections of a solution.

    Every section defaults to ``None``.
    """
    # Description of the inputs.
    input_descr: str | None = attrs.field(default=None)
    # Description of the rule.
    rule_descr: str | None = attrs.field(default=None)
    # Description of the implementation plan.
    impl_plan_descr: str | None = attrs.field(default=None)
    # Implementation of the rule.
    rule_impl: str | None = attrs.field(default=None)

In [36]:
@attrs.frozen
class PromptEncoder:
    """Render challenges and solutions as tagged text for a chat-style model.

    Grids, examples, the full input set and derived facts are each wrapped in
    an opening/closing tag pair taken from ``open_tokens``/``close_tokens``.

    Attributes:
        system_msg: system message returned by ``encode_prompt``.
        task_descr: task description placed at the top of the user message.
        replace_all_colours: not consulted by any method yet.
        colour_tokens: one token per colour, in iteration order of
            ``types.Color``.
        open_tokens: opening tags, one attribute per section name.
        close_tokens: closing tags, with the same attribute names.
        fact_definitions: callables evaluated by ``encode_facts``.
    """
    system_msg: str = system_msg
    task_descr: str = task_descr
    replace_all_colours: bool = True
    
    # encode_grid indexes this tuple with raw cell values.
    # NOTE(review): assumes iteration order of types.Color matches the cell values; confirm.
    colour_tokens: tuple[str,...] = tuple(
        f"❲{single_char_color_codes[c]}❳"
        for c in types.Color
    )
    open_tokens: SimpleNamespace = SimpleNamespace(**{
        k:f"<{k}>" for k in ["input","example","grid","facts","descr","rule","plan","impl"]
    })
    close_tokens: SimpleNamespace = SimpleNamespace(**{
        k:f"</{k}>" for k in vars(open_tokens)
    })

    fact_definitions: tuple[FactDefinition, ...] = default_facts

    
    def encode_grid(self, grid: types.Canvas) -> str:
        """Encode one grid as a ``grid`` section listing it row-wise and column-wise.

        Args:
            grid: a canvas (unwrapped through its ``image`` attribute) or an
                object that directly exposes ``shape`` and ``_data``.

        Returns:
            The section text: a ``height×width`` header, the colour tokens of
            each row, a ``---`` separator, then the same for the transposed data.
        """
        if isinstance(grid, types.Canvas):
            grid = grid.image
        h,w = grid.shape
        # First entry: the data as stored (rows); second: its transpose (columns).
        egrids = [
            "\n".join("".join(self.colour_tokens[v] for v in row) for row in d)
            for d in [grid._data,grid._data.T]
        ]
        o = self.open_tokens
        c = self.close_tokens
        return f"""
{o.grid}{h}×{w}
rows:
{egrids[0]}
---
cols:
{egrids[1]}
{c.grid}
        """.strip()

    def encode_example(self, example: types.IOPair) -> str:
        """Encode an example as an ``example`` section holding its input and output grids.

        A side whose attribute is ``None`` (e.g. the unknown output of a test
        example) is left out.
        """
        o = self.open_tokens
        c = self.close_tokens
        body = "\n".join(
            f"{k}:{self.encode_grid(v)}"
            for k in ["input","output"]
            if (v:=getattr(example,k)) is not None
        )
        return f"""
{o.example}
{body}
{c.example}
        """.strip()

    def encode_inputs(self, challenge: dataset.Challenge) -> str:
        """Encode all ``train`` and ``test`` examples of a challenge as one ``input`` section.

        A group that is empty (falsy) is left out together with its heading.
        """
        o = self.open_tokens
        c = self.close_tokens
        body = "\n".join(
            f"{k}:\n"+"\n".join(self.encode_example(e) for e in v)
            for k in ["train","test"]
            if (v:=getattr(challenge,k))
        )
        return f"""
{o.input}
{body}
{c.input}
        """.strip()

    def encode_facts(self, challenge: dataset.Challenge) -> str:
        """Encode the applicable facts as a bullet list inside a ``facts`` section.

        Each entry of ``fact_definitions`` is called with the challenge; only
        truthy results are listed.
        """
        o = self.open_tokens
        c = self.close_tokens
        # TODO: handle colour replacement!
        body = "\n".join(
            f"- {descr}"
            for fd in self.fact_definitions
            if (descr:=fd(challenge))
        )
        return f"""
{o.facts}
{body}
{c.facts}
        """.strip()

    def encode_prompt(self, challenge: dataset.Challenge) -> SimpleNamespace:
        """Build the prompt for a challenge.

        Returns:
            A namespace with ``system`` (the system message) and ``user``
            (task description, encoded inputs and encoded facts).
        """
        user_msg = f"""
{self.task_descr}

{self.encode_inputs(challenge)}

{self.encode_facts(challenge)}
""".strip()
        return SimpleNamespace(
            system = self.system_msg,
            user = user_msg,
        )

    def encode_response(self, response: ReasonedSolution) -> str:
        """Encode a solution as consecutive ``descr``/``rule``/``plan``/``impl`` sections.

        Sections whose text is ``None`` or whitespace-only are omitted.

        Returns:
            The remaining sections joined by newlines; an empty string if
            every section was omitted.
        """
        o = self.open_tokens
        c = self.close_tokens
        ret = []
        for k,v in dict(
            descr = response.input_descr,
            rule = response.rule_descr,
            plan = response.impl_plan_descr,
            impl = response.rule_impl,
        ).items():
            if v is None or not v.strip():
                continue
            ret.append(f"""
{getattr(o, k)}
{v}
{getattr(c, k)}
            """.strip())
        return "\n".join(ret)

In [37]:
# Build an encoder with all default settings.
enc = PromptEncoder()

# Print the user message generated for the selected challenge `chal`.
print(enc.encode_prompt(chal).user)

# Task description
ARC challenges require a rule to be found that transforms input grids into output grids.
You are given a number of pairs of input and output grids,
plus a few more input grids only, for which the rule must generate the corresponding outputs.
The rule must be expressed as a python function `solver`.
Analyse the challenge carefully and in a structured way by:
 1.) Formulate a `description` of the relevant semantic eneitites in the inputs.
 2.) Describe the underlying `rule` in natural language or pseudo-code.
 3.) Create a `plan` on how to implement the rule in python.
 4.) Output an `implemenetation` in python, implementing that rule.


<input>
train:
<example>
input:<grid>6×6
rows:
❲k❳❲k❳❲k❳❲k❳❲k❳❲k❳
❲k❳❲k❳❲k❳❲k❳❲k❳❲m❳
❲y❳❲k❳❲y❳❲k❳❲y❳❲k❳
❲k❳❲k❳❲k❳❲k❳❲k❳❲m❳
❲k❳❲k❳❲k❳❲k❳❲k❳❲k❳
❲k❳❲k❳❲k❳❲k❳❲k❳❲m❳
---
cols:
❲k❳❲k❳❲y❳❲k❳❲k❳❲k❳
❲k❳❲k❳❲k❳❲k❳❲k❳❲k❳
❲k❳❲k❳❲y❳❲k❳❲k❳❲k❳
❲k❳❲k❳❲k❳❲k❳❲k❳❲k❳
❲k❳❲k❳❲y❳❲k❳❲k❳❲k❳
❲k❳❲m❳❲k❳❲m❳❲k❳❲m❳
</grid>
output:<grid>6×6
rows:
❲k❳❲k❳❲k❳❲k❳❲k❳❲m❳
❲k

(10, 10)