In [1]:
from dotenv import load_dotenv
import dspy
import os
# Load variables from a local .env file into the process environment.
load_dotenv()
# None when OPENAI_API_KEY is not set; the same key is also passed to the
# reflection LM created in the GEPA cell further down.
api_key = os.getenv("OPENAI_API_KEY")
# Task model: the LM that the program being evaluated/optimized calls.
lm = dspy.LM("openai/gpt-5-mini", temperature=1, api_key=api_key, max_tokens=32000)
# Register it as the default LM for dspy modules in this notebook.
dspy.configure(lm=lm)

In [None]:
from loader import load_training_problem, list_training_problems, load_evaluation_problem, list_evaluation_problems
import json

def _grid_to_text(grid):
    """Serialize a grid as text: cells joined by single spaces, one newline-terminated line per row."""
    return "".join(" ".join(map(str, row)) + "\n" for row in grid)


def from_data_to_problem(data):
    """
    Render one ARC task dict as a text prompt plus the expected answer.

    Args:
        data: Mapping that may contain:
            - 'train': list of dicts with 'input' and 'output' grids
              (each grid is a list of rows of cell values).
            - 'test': list of dicts with 'input' and 'output' grids.
              Only the FIRST test case is used; any further ones are ignored.

    Returns:
        dict with keys:
            - 'problem': the prompt text (training pairs, then the test input,
              ending with a bare "Output:" label for the model to complete).
            - 'answer': the first test case's output grid as text, rows
              separated by newlines and cells by single spaces.
        None if `data` has no 'test' key or its 'test' list is empty.

    Raises:
        KeyError: if an example or the test case lacks 'input' or 'output'.
    """
    # Collect fragments and join once instead of growing a string with `+=`.
    parts = ["Identify the pattern within the grids and solve the problem.\n\n"]

    if 'train' in data:
        parts.append("Training Examples:\n")
        for i, example in enumerate(data['train'], 1):
            parts.append(f"Example {i}:\n")
            parts.append("Input:\n")
            parts.append(_grid_to_text(example['input']))
            parts.append("Output:\n")
            parts.append(_grid_to_text(example['output']))
            parts.append("\n")

    if 'test' in data and len(data['test']) > 0:
        test_case = data['test'][0]
        parts.append("Test Case:\n")
        parts.append("Input:\n")
        parts.append(_grid_to_text(test_case['input']))
        parts.append("Output:\n")
        # strip() drops the trailing newline so the answer compares cleanly
        # against a stripped model prediction.
        answer = _grid_to_text(test_case['output']).strip()
        return {
            'problem': "".join(parts).strip(),
            'answer': answer
        }
    return None

def load_statistics(threshold=160, stats_path='arc_statistics_detailed.json'):
    """
    Load per-problem statistics and return the IDs whose combined score is below `threshold`.

    Args:
        threshold: Exclusive upper bound; an entry is kept only if its
            'combined_score' is strictly less than this value.
        stats_path: Path to the JSON statistics file. The file must contain a
            list of objects, each with a 'problem' name and a 'combined_score'.
            Defaults to the file the notebook has always read.

    Returns:
        dict mapping 'train' and/or 'eval' to a list of problem IDs, i.e. the
        entry's 'problem' name with its leading 'train_' / 'eval_' prefix
        removed. A split with no matching entries is absent from the dict.
        Entries whose name has neither prefix are ignored.

    Raises:
        FileNotFoundError: if `stats_path` does not exist.
        KeyError: if an entry lacks 'combined_score' or 'problem'.
    """
    with open(stats_path, 'r', encoding='utf-8') as f:
        stats = json.load(f)

    filtered_problems = {}
    for entry in stats:
        if entry['combined_score'] < threshold:
            problem_name = entry['problem']
            for prefix, split in (('train_', 'train'), ('eval_', 'eval')):
                if problem_name.startswith(prefix):
                    # Slice off only the leading prefix. str.replace() would also
                    # delete later occurrences of the prefix inside the ID.
                    problem_id = problem_name[len(prefix):]
                    filtered_problems.setdefault(split, []).append(problem_id)
                    break

    return filtered_problems

def _problems_to_examples(raw_problems):
    """Convert raw ARC task dicts into dspy Examples with 'problem' as the only input field."""
    examples = []
    for data in raw_problems:
        converted = from_data_to_problem(data)
        if converted is None:
            # from_data_to_problem returns None for a task without a test case;
            # skip it instead of failing on converted['problem'].
            continue
        examples.append(
            dspy.Example({
                "problem": converted['problem'],
                'answer': converted['answer'],
            }).with_inputs("problem")
        )
    return examples


def init_arc_dataset(max_combined_score=160):
    """
    Initialize ARC dataset with optional filtering by combined I/O score.

    The training problems are split in half, in loader/statistics order and
    without shuffling: the first half becomes the train set, the second half
    the validation set. All evaluation problems form the test set. Problems
    that have no test case are skipped.

    Args:
        max_combined_score: Maximum combined score (0.25*input + 0.75*output).
                           Only problems scoring strictly below it are kept.
                           Set to None to disable filtering and load every
                           problem listed by the loader.

    Returns:
        Tuple (train_set, val_set, test_set) of lists of dspy.Example, each
        with fields 'problem' (input) and 'answer'.
    """
    # Load statistics if filtering is enabled
    if max_combined_score is not None:
        filtered = load_statistics(threshold=max_combined_score)
        train_problems = filtered.get('train', [])
        test_problems = filtered.get('eval', [])
        # NOTE(review): the "400" totals below are hard-coded, not taken from the loader.
        print(f"Filtering problems with combined score < {max_combined_score}")
        print(f"  Training problems: {len(train_problems)} (filtered from 400)")
        print(f"  Evaluation problems: {len(test_problems)} (filtered from 400)")
    else:
        train_problems = list_training_problems()
        test_problems = list_evaluation_problems()

    # Load and convert training data, then evaluation/test data
    train_split = _problems_to_examples([load_training_problem(pb) for pb in train_problems])
    test_set = _problems_to_examples([load_evaluation_problem(pb) for pb in test_problems])

    # Split training data into train and validation sets
    split_at = int(0.5 * len(train_split))
    train_set = train_split[:split_at]
    val_set = train_split[split_at:]

    print(f"Dataset sizes: train={len(train_set)}, val={len(val_set)}, test={len(test_set)}")

    return train_set, val_set, test_set
    

In [3]:
import dspy
from datasets import load_dataset

def init_dataset():
    """
    Build train/val/test splits from two Hugging Face datasets.

    The 'train' split of "AI-MO/aimo-validation-aime" is shuffled with a fixed
    seed (0) and cut in half into the train and validation sets; its examples
    carry 'problem', 'solution' and 'answer'. The 'train' split of
    "MathArena/aime_2025" becomes the test set, with examples carrying
    'problem' and 'answer'; the list is repeated 5 times.

    Returns:
        Tuple (train_set, val_set, test_set) of lists of dspy.Example with
        'problem' marked as the input field.
    """
    import random

    labelled = []
    for row in load_dataset("AI-MO/aimo-validation-aime")['train']:
        record = dspy.Example({
            "problem": row['problem'],
            'solution': row['solution'],
            'answer': row['answer'],
        })
        labelled.append(record.with_inputs("problem"))

    # Fixed seed keeps the train/val partition identical across runs.
    random.Random(0).shuffle(labelled)
    halfway = int(0.5 * len(labelled))

    held_out = []
    for row in load_dataset("MathArena/aime_2025")['train']:
        record = dspy.Example({
            "problem": row['problem'],
            'answer': row['answer'],
        })
        held_out.append(record.with_inputs("problem"))

    # Every test example appears 5 times in the returned test set.
    return labelled[:halfway], labelled[halfway:], held_out * 5

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Build the ARC splits, keeping only problems whose combined score is below 160
# (the threshold is forwarded to load_statistics, which applies a strict `<`).
train_set, val_set, test_set = init_arc_dataset(max_combined_score=160)

# Last expression of the cell: displays the (train, val, test) split sizes.
len(train_set), len(val_set), len(test_set)

(200, 200, 400)

In [None]:
# Summarize how many problems ended up in each split of the filtered dataset.
split_sizes = {
    "Training": len(train_set),
    "Validation": len(val_set),
    "Test": len(test_set),
}
print(f"Total problems: {sum(split_sizes.values())}")
for label, size in split_sizes.items():
    print(f"{label} set: {size} problems")
print("\nNote: All problems have combined score (0.25*input + 0.75*output) < 160")


In [5]:
# Show the first training example: the prompt text, then its expected answer.
first_example = train_set[0]
print("Problem:", first_example['problem'], sep="\n")
print("\n\nAnswer:", first_example['answer'], sep="\n")

Problem:
Identify the pattern within the grids and solve the problem.

Training Examples:
Example 1:
Input:
0 7 7
7 7 7
0 7 7
Output:
0 0 0 0 7 7 0 7 7
0 0 0 7 7 7 7 7 7
0 0 0 0 7 7 0 7 7
0 7 7 0 7 7 0 7 7
7 7 7 7 7 7 7 7 7
0 7 7 0 7 7 0 7 7
0 0 0 0 7 7 0 7 7
0 0 0 7 7 7 7 7 7
0 0 0 0 7 7 0 7 7

Example 2:
Input:
4 0 4
0 0 0
0 4 0
Output:
4 0 4 0 0 0 4 0 4
0 0 0 0 0 0 0 0 0
0 4 0 0 0 0 0 4 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 4 0 4 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 4 0 0 0 0

Example 3:
Input:
0 0 0
0 0 2
2 0 2
Output:
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 2
0 0 0 0 0 0 2 0 2
0 0 0 0 0 0 0 0 0
0 0 2 0 0 0 0 0 2
2 0 2 0 0 0 2 0 2

Example 4:
Input:
6 6 0
6 0 0
0 6 6
Output:
6 6 0 6 6 0 0 0 0
6 0 0 6 0 0 0 0 0
0 6 6 0 6 6 0 0 0
6 6 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
0 6 6 0 0 0 0 0 0
0 0 0 6 6 0 6 6 0
0 0 0 6 0 0 6 0 0
0 0 0 0 6 6 0 6 6

Example 5:
Input:
2 2 2
0 0 0
0 2 2
Output:
2 2 2 2 2 2 2 2 2
0 0 0 0 0 0 0 0 0
0 

In [6]:
class GenerateResponse(dspy.Signature):
    """Solve the problem and provide the answer in the correct format."""
    # NOTE: DSPy uses a Signature's docstring as the instruction text given to the
    # LM, so rewording the docstring above changes what the program does. It is
    # also the starting instruction that GEPA rewrites in the optimization cell.
    # Input: the full text prompt (see from_data_to_problem's 'problem' value).
    problem = dspy.InputField()
    # Output: compared against the example's 'answer' by the metrics below.
    answer = dspy.OutputField()

# Baseline program: a single ChainOfThought predictor over the signature. The
# evaluation table further down shows a 'reasoning' column alongside 'answer'.
program = dspy.ChainOfThought(GenerateResponse)

In [7]:
def arc_metric(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """
    Exact-match metric for ARC (Abstraction and Reasoning Corpus) grid outputs.

    Both texts are stripped of leading/trailing whitespace and then compared
    as plain strings, so spacing and newlines inside the grid are significant.

    Args:
        example: Ground truth; indexed with "answer" to get the expected grid text.
        prediction: Model output. Its `.answer` attribute is used when present,
            otherwise `str(prediction)`.
        trace: Unused; accepted for metric-signature compatibility.
        pred_name: Unused; accepted for metric-signature compatibility.
        pred_trace: Unused; accepted for metric-signature compatibility.

    Returns:
        int: 1 if the prediction matches the expected output exactly, 0 otherwise.
        Any exception raised while reading either side also yields 0.
    """
    try:
        target = example["answer"].strip()
        if hasattr(prediction, 'answer'):
            guess = prediction.answer.strip()
        else:
            guess = str(prediction).strip()
        return 1 if target == guess else 0
    except Exception:
        # Malformed example/prediction (missing key, non-string answer, ...) scores 0.
        return 0

In [9]:
import dspy
# Evaluator over the test split using the exact-match metric defined above.
evaluate = dspy.Evaluate(
    devset=test_set,
    metric=arc_metric,
    num_threads=32,
    display_table=True,
    display_progress=True
)

# Baseline: score the program before any GEPA optimization is applied.
evaluate(program)

Average Metric: 156.00 / 400 (39.0%): : 402it [31:44,  4.74s/it]                       

2025/09/05 20:36:50 INFO dspy.evaluate.evaluate: Average Metric: 156 / 400 (39.0%)





Unnamed: 0,problem,example_answer,reasoning,pred_answer,arc_metric
0,Identify the pattern within the grids and solve the problem. Train...,3 2 3 2 3 2 7 8 7 8 7 8 2 3 2 3 2 3 8 7 8 7 8 7 3...,"From the examples, the 2x2 input matrix a b c d is expanded to a 6...",3 2 3 2 3 2 7 8 7 8 7 8 2 3 2 3 2 3 8 7 8 7 8 7 3...,✔️ [1]
1,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7...,From the training examples: all cells with value 8 are replaced by...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1...,
2,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 0 0 0 0 2 2 2 2...,We must detect closed square frames made of 2 and fill each frame'...,0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 0 0 0 0 2 2 2 2...,✔️ [1]
3,Identify the pattern within the grids and solve the problem. Train...,7 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 7 0 0 0 0...,From the training examples the rule is: - Identify each colored bl...,7 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 7 0 0 0 0...,✔️ [1]
4,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 8 0 0 0 0 0...,I was unable to confidently deduce the intended transformation fro...,0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 8 0 0 0 0 0...,
...,...,...,...,...,...
395,Identify the pattern within the grids and solve the problem. Train...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,From the training examples the rule is: for each non-1 color prese...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,
396,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0...,"We identify the non-zero block in the input (its bounding box, sha...",0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 3 3 0 6...,
397,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4...,We need to generalize the pattern from the training examples. Obse...,0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4...,
398,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 2 2 0 0 0 2 0...,I parsed the 16x11 grid into the 3x2 array of 4x4 tiles (block row...,0 0 0 0 0 0 0 0 0 0 0 0 0 8 8 0 0 0 8 8 0 0 0 8 0...,


EvaluationResult(score=39.0, results=<list of 400 results>)

In [None]:
def arc_metric_with_feedback(example, prediction, trace=None, pred_name=None, pred_trace=None):
    """
    Exact-match ARC metric that also returns textual feedback.

    Scoring is the same comparison as `arc_metric`: both texts are stripped of
    surrounding whitespace and compared as strings.

    Args:
        example: Ground truth; indexed with "answer" to get the expected grid text.
        prediction: Model output. Its `.answer` attribute is used when present,
            otherwise `str(prediction)`.
        trace: Unused; accepted for metric-signature compatibility.
        pred_name: Unused; accepted for metric-signature compatibility.
        pred_trace: Unused; accepted for metric-signature compatibility.

    Returns:
        dspy.Prediction with:
            - score: 1 on an exact match, 0 otherwise (including on any error).
            - feedback: a confirmation, a message showing the expected and the
              predicted grid, or a description of the error encountered.
    """
    try:
        target = example["answer"].strip()
        if hasattr(prediction, 'answer'):
            guess = prediction.answer.strip()
        else:
            guess = str(prediction).strip()

        if target == guess:
            return dspy.Prediction(
                score=1,
                feedback="Your answer is correct. The expected output matches your prediction exactly.",
            )

        # On a mismatch, show both grids in the feedback message.
        mismatch_report = "".join([
            "Your answer is incorrect. The expected output does not match your prediction.\n",
            f"Expected:\n{target}\n\nYour output:\n{guess}\n",
            "Please ensure your output grid matches the expected format exactly, including spacing and newlines.",
        ])
        return dspy.Prediction(score=0, feedback=mismatch_report)

    except Exception as exc:
        return dspy.Prediction(
            score=0,
            feedback=f"Error processing your answer: {str(exc)}. Please ensure your answer follows the correct format for ARC grid outputs.",
        )


In [13]:
from dspy import GEPA

# GEPA optimizer driven by the feedback-returning metric defined above.
# In the recorded run below, auto="light" amounted to about 1180 metric calls,
# and each reflection step evaluated a 3-example minibatch.
# NOTE(review): the reflection model is named "gpt-5" without the "openai/"
# provider prefix used for the task LM in the first cell — confirm this is intended.
optimizer = GEPA(
    metric=arc_metric_with_feedback,
    auto="light",
    num_threads=32,
    track_stats=True,
    reflection_minibatch_size=3,
    reflection_lm=dspy.LM(model="gpt-5", temperature=1.0, max_tokens=32000, api_key=api_key)
)

# Optimize the baseline program. The recorded log reports that all 200 valset
# examples are used for tracking Pareto scores and suggests a smaller sample.
optimized_program = optimizer.compile(
    program,
    trainset=train_set,
    valset=val_set,
)

2025/09/05 20:38:27 INFO dspy.teleprompt.gepa.gepa: Running GEPA for approx 1180 metric calls of the program. This amounts to 2.95 full evals on the train+val set.
2025/09/05 20:38:27 INFO dspy.teleprompt.gepa.gepa: Using 200 examples for tracking Pareto scores. You can consider using a smaller sample of the valset to allow GEPA to explore more diverse solutions within the same budget.
GEPA Optimization:   0%|          | 0/1180 [00:00<?, ?rollouts/s]

2025/09/05 20:49:04 INFO dspy.evaluate.evaluate: Average Metric: 134.0 / 200 (67.0%)
2025/09/05 20:49:04 INFO dspy.teleprompt.gepa.gepa: Iteration 0: Base program full valset score: 0.67
GEPA Optimization:  17%|█▋        | 200/1180 [10:36<51:57,  3.18s/rollouts]2025/09/05 20:49:04 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Selected program 0 score: 0.67


Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [03:47<00:00, 75.86s/it] 

2025/09/05 20:52:51 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 20:54:35 INFO dspy.teleprompt.gepa.gepa: Iteration 1: Proposed new text for predict: You are given ARC-style grid puzzles. Each puzzle contains several training examples (input grid -> output grid) and one test case (input grid only). Your task is to infer the transformation rule from the training pairs and apply it to the test input. Then output ONLY the resulting grid for the test case in the exact required formatting.

Domain facts and conventions:
- Grids are 2D arrays of integers 0–9 separated by single spaces. Rows are separated by newlines.
- 0 denotes background/empty. Non-zero digits denote colored cells; each digit is a separate color.
- Connected components are defined with 4-neighborhood (up, down, left, right).
- Bounding box of a component is the minimal axis-aligned rectangle enclosing all its cells.
- Many tasks use one or more of these transformations:
  - Cropping: output is a cropped rectangle (often a component’s bounding box) rather than the full input s

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [03:31<00:00, 70.54s/it] 

2025/09/05 21:00:28 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 21:03:19 INFO dspy.teleprompt.gepa.gepa: Iteration 2: Proposed new text for predict: Task: Given one or more training example pairs (Input grid → Output grid) and a test Input grid, infer the transformation rule(s) from the training examples and apply them to the test Input to produce its Output grid.

Input/Output format:
- Grids are rectangular arrays of integers separated by single spaces, each row on its own line.
- 0 typically denotes background. Other nonzero integers denote colors/values.
- Your answer must be ONLY the test Output grid, formatted exactly like the examples: same dimensions as the test Input, single spaces between numbers, one line per row, no extra text, no surrounding quotes/code blocks.

General approach:
1) Study all training Input/Output pairs and identify a rule that, when applied to each training Input, reproduces its training Output exactly.
2) Prefer the simplest rule that fits every training example; avoid overgeneralization.
3) Validate your 

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [03:13<00:00, 64.50s/it] 

2025/09/05 21:09:09 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 21:10:01 INFO dspy.teleprompt.gepa.gepa: Iteration 3: Proposed new text for predict: You will be given an ARC-style puzzle: a grid of integers (colors 0–9), several training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation that maps each training input to its output, then apply the same rule to the Test Case Input and return the resulting grid.

Follow these guidelines:

- Understand the format:
  - Each grid is a rectangle of space-separated integers, one row per line.
  - Training Examples are labeled “Input:” and “Output:” and define the rule.
  - The Test Case provides only “Input:”. You must produce the corresponding “Output:” grid.
  - Output must match the required dimensions implied by the rule. The output may have the same size as the input or a different size (e.g., stacking/concatenation).

- Infer the rule using all training examples:
  - Propose a hypothesis and verify it against every training pair. If any example contradicts

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [04:49<00:00, 96.51s/it] 

2025/09/05 21:27:17 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 21:29:06 INFO dspy.teleprompt.gepa.gepa: Iteration 4: Proposed new text for predict: You will be given an ARC-style puzzle: a grid of integers (colors 0–9), several training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation that maps each training input to its output, then apply the same rule to the Test Case Input and return the resulting grid.

Follow these guidelines carefully.

1) Input/output format
- Each grid is a rectangle of space-separated integers, one row per line.
- Training Examples are labeled “Input:” and “Output:” and define the rule.
- The Test Case provides only “Input:”. You must produce the corresponding “Output:” grid.
- Output must match the required dimensions implied by the rule (same size as input unless the examples show otherwise).

2) General approach
- Identify connected components (4-neighbor connectivity) of nonzero colors. For each component, compute:
  - Its color.
  - Its shape (the exact set of cells).
  

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [02:28<00:00, 49.50s/it] 

2025/09/05 21:35:40 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 21:36:56 INFO dspy.teleprompt.gepa.gepa: Iteration 5: Proposed new text for predict: You will be given an ARC-style puzzle: a grid of integers (colors 0–9), several training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation that maps each training input to its output, then apply the same rule to the Test Case Input and return the resulting grid.

Core approach
- Parse the format:
  - Each grid is a rectangle of space-separated integers, one row per line.
  - Training Examples are labeled “Input:” and “Output:”.
  - The Test Case provides only “Input:”. You must produce the corresponding “Output:” grid.
- Infer a single rule that explains every training pair exactly, then apply it to the Test Case.
- Validate your hypothesis against every training example; if any mismatch appears, revise the rule.

What to look for (common ARC patterns observed in prior tasks)
- Background vs. structure:
  - 0 is often background. Non-zero colors are often s

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:35<00:00, 31.83s/it] 

2025/09/05 21:55:15 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 21:56:54 INFO dspy.teleprompt.gepa.gepa: Iteration 6: Proposed new text for predict: You are solving ARC-style grid transformation puzzles.

Goal:
- Infer the transformation rule from the given training example(s) (each with an Input grid and its Output grid).
- Apply that same rule to the Test Case Input to produce the Test Case Output.
- Return the final grid in the exact required format.

Input format:
- Grids are rectangular arrays of integers. Each row is on its own line; numbers in a row are separated by single spaces.
- 0 denotes background; nonzero integers denote colors.
- Training data contains one or more pairs: “Input:” grid -> “Output:” grid.
- The Test Case contains only “Input:”; you must produce its “Output:”.
- Output grids may be the same size as their inputs or a different size, depending on the inferred rule.

What to infer:
- Derive a single rule that explains all training pairs. Validate the rule by checking that applying it to each training Input repro

Average Metric: 3.00 / 3 (100.0%): 100%|██████████| 3/3 [00:42<00:00, 14.31s/it]

2025/09/05 21:59:20 INFO dspy.evaluate.evaluate: Average Metric: 3.0 / 3 (100.0%)
2025/09/05 21:59:20 INFO dspy.teleprompt.gepa.gepa: Iteration 7: All subsample scores perfect. Skipping.
2025/09/05 21:59:20 INFO dspy.teleprompt.gepa.gepa: Iteration 7: Reflective mutation did not propose a new candidate
GEPA Optimization:  54%|█████▍    | 639/1180 [1:20:52<1:16:36,  8.50s/rollouts]2025/09/05 21:59:20 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Selected program 2 score: 0.655



Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [02:14<00:00, 44.93s/it] 

2025/09/05 22:01:35 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 22:02:22 INFO dspy.teleprompt.gepa.gepa: Iteration 8: Proposed new text for predict: You will be given an ARC-style puzzle: a grid of integers (colors 0–9), several training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation that maps each training input to its output, then apply the same rule to the Test Case and print the resulting grid.

Input format
- Each grid is a rectangle of space-separated integers, one row per line.
- Training Examples are labeled “Input:” and “Output:”.
- The Test Case provides only “Input:”. You must produce the corresponding “Output:” grid.

General principles
- Infer a single deterministic rule that explains every training pair exactly; then apply it to the Test Case.
- 0 is often background. Non-zero colors are often structural or movable. Added elements usually write into zeros only, unless examples show overwriting.
- Treat each colored object/region independently if appropriate (e.g., multiple separate bloc

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [03:24<00:00, 68.14s/it] 

2025/09/05 22:07:38 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 22:09:07 INFO dspy.teleprompt.gepa.gepa: Iteration 9: Proposed new text for predict: You will be given an ARC-style puzzle consisting of integer grids (colors 0–9), several Training Input→Output pairs, and one Test Case Input. Your job is to infer the exact transformation rule that maps each training input to its output, verify it against all given training pairs, and then apply the same rule to the Test Case Input to produce the Output grid.

Core format and output requirements:
- Each grid is a rectangle of space-separated integers, one row per line.
- Training pairs are labeled “Input:” and “Output:”.
- The Test Case provides only “Input:”; you must output the corresponding “Output:” grid.
- Output only the resulting grid, nothing else:
  - No headings, labels, or explanations.
  - Exactly one space between numbers in a row.
  - No leading/trailing spaces on any line.
  - Correct number of rows and columns for the inferred rule.
  - Preserve row order; each row ends with 

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [01:58<00:00, 39.56s/it]

2025/09/05 22:14:32 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 22:16:09 INFO dspy.teleprompt.gepa.gepa: Iteration 10: Proposed new text for predict: You will be given one “problem” at a time consisting of:
- Several Training Examples, each with an Input grid and its corresponding Output grid.
- One Test Case with an Input grid and an empty Output to fill.

Your task is to infer a single deterministic rule from the Training Examples and apply it to the Test Case input to produce the correct Output grid.

Input/Output format:
- Grids are rectangular, represented as rows of space-separated integers (typically 0–9).
- Rows are separated by newlines. Keep grid dimensions unchanged.
- Your final reply must contain only the transformed grid for the Test Case, formatted exactly like the examples: single spaces between numbers, one row per line, no extra text, no prefixes, no blank lines.

Core concepts and definitions:
- Colors: the integer values in the grid.
- Connected component: cells of the same color connected via 4-neighborhood (up/down/

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [02:28<00:00, 49.66s/it] 

2025/09/05 22:31:19 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 22:32:15 INFO dspy.teleprompt.gepa.gepa: Iteration 11: Proposed new text for predict: You will be given a puzzle of the form “Identify the pattern within the grids and solve the problem.” The prompt contains one or more training examples (each with an Input grid and its corresponding Output grid), and a Test Case with an Input grid. Your job is to infer the transformation rule from the training examples and apply it to the Test Case input to produce its Output grid.

Follow these instructions carefully:

1) Understand the data format
- Grids are rectangular arrays of small nonnegative integers (0–9). Interpret each distinct nonzero integer as a color; 0 is background.
- Training data consists of one or more pairs: Input grid → Output grid. The Test Case section includes only an Input grid; you must produce its Output.
- Output dimensions may be the same as the input or may change, depending on the pattern. Infer the correct output size from the training pairs.

2) Derive the

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [02:40<00:00, 53.44s/it]

2025/09/05 22:39:12 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 22:40:44 INFO dspy.teleprompt.gepa.gepa: Iteration 12: Proposed new text for predict: You will be given ARC-style puzzles consisting of grids of integers (colors 0–9), several Training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation from the training pairs and apply it to the Test Case.

Follow this process:

1) Understand the format
- Each grid is a rectangle of space-separated integers, one row per line.
- Training Examples are labeled “Input:” and “Output:”. The Test Case provides only “Input:”.
- The output grid’s size may match the input or differ if the rule implies resizing or stacking.

2) Build a hypothesis that explains ALL training pairs
- Propose a concrete rule, then verify it against every training pair. If any example contradicts your hypothesis, revise it.
- Use the full set of training pairs to constrain orientation, alignment, and collision/overwrite rules.

3) Recognize common ARC motifs and how to apply them
- Backgrou

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [04:46<00:00, 95.43s/it] 

2025/09/05 22:59:02 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 23:00:08 INFO dspy.teleprompt.gepa.gepa: Iteration 13: Proposed new text for predict: You will solve “grid transformation” puzzles (ARC-style) by inferring a single deterministic rule from multiple Training pairs (Input → Output) and applying it to the Test Input to produce its Output.

Input/Output format and constraints:
- Grids are rectangular, rows of space-separated integers (0–9), one row per line.
- The Output grid may be the same size as the Input or a different size (cropping/extraction tasks occur). Do not assume dimensions are preserved.
- Your final reply must contain only the Output grid, formatted exactly with single spaces between numbers, one row per line, no extra text or blank lines.

Core concepts:
- Colors are integers 0–9; 0 commonly denotes background/unknown and may need filling, but treat 0 as a real color unless the learned rule dictates otherwise.
- Connected component: cells of the same color connected via 4-neighborhood (up/down/left/right) only. 

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:32<00:00, 30.80s/it] 

2025/09/05 23:04:33 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 23:05:38 INFO dspy.teleprompt.gepa.gepa: Iteration 14: Proposed new text for predict: Task: Given one or more training example pairs (Input -> Output) and a single Test Case Input, infer the transformation rule from the training examples and apply it to the Test Case to produce its Output. Output only the final grid for the Test Case in the exact format described below.

Input format:
- You will receive a prompt containing “Training Examples:” followed by one or more training pairs. Each pair lists an Input grid and its corresponding Output grid.
- Then you will receive “Test Case:” with a single Input grid and no Output.
- Grids are rectangular arrays of small non-negative integers (typically 0–9), with rows separated by newlines and numbers in a row separated by single spaces.

Output format:
- Print ONLY the Test Case Output grid as rows of integers separated by single spaces.
- Each grid row must be on its own line. Do not include any extra text, labels, headings, or bla

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [01:21<00:00, 27.23s/it] 

2025/09/05 23:12:08 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 23:12:55 INFO dspy.teleprompt.gepa.gepa: Iteration 15: Proposed new text for predict: You are given a pattern-induction puzzle with one or more training example pairs and a single test input. Each grid is a 2D matrix of small non-negative integers, where 0 typically denotes background and nonzero integers denote colored cells.

Your task:
- Infer the transformation rule from the training example(s).
- Apply that exact rule to the test input.
- Output only the resulting grid, with rows on separate lines and single spaces between integers. Do not include any explanatory text, labels, or extra lines.

How to approach the problem:
1) Parse the format:
   - The prompt contains “Training Examples:” with one or more Example blocks, each with an Input grid and its corresponding Output grid.
   - A “Test Case:” section provides the Input grid you must transform.
   - Grids are rectangular; rows are line-separated; cells in a row are space-separated integers.

2) Induce the rule:
   -

Average Metric: 2.00 / 3 (66.7%): 100%|██████████| 3/3 [02:21<00:00, 47.20s/it] 

2025/09/05 23:17:17 INFO dspy.evaluate.evaluate: Average Metric: 2.0 / 3 (66.7%)





2025/09/05 23:18:27 INFO dspy.teleprompt.gepa.gepa: Iteration 16: Proposed new text for predict: You will be given an ARC-style puzzle: a grid of integers (colors 0–9), several training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation that maps each training input to its output, then apply the same rule to the Test Case and return the resulting grid.

Input format
- Each grid is a rectangle of space-separated integers, one row per line.
- Training examples are labeled “Input:” and “Output:”.
- The Test Case provides only “Input:”.

Output formatting (strict)
- Output only the resulting grid, nothing else (no explanations, labels, or code fences).
- Use exactly one space between numbers in a row.
- No leading/trailing spaces on any line.
- The output must have exactly the intended number of rows and columns.
- Preserve row order; each row ends with a newline.

Core process
1) Parse and measure:
   - Record H×W for each input and output in training.
  

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [03:16<00:00, 65.63s/it] 

2025/09/05 23:23:04 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 23:24:58 INFO dspy.teleprompt.gepa.gepa: Iteration 17: Proposed new text for predict: You are given one ARC-style puzzle at a time. Each puzzle contains:
- A few “Training Examples” (each with an Input grid and an Output grid).
- One “Test Case” with an Input grid (you must produce the corresponding Output grid).
- The grids are rectangular arrays of integers (colors). 0 is background. Other digits represent colors, objects, or markers. The same digit does not carry meaning across different puzzles; infer its role per puzzle.

Your task:
- Infer a single transformation rule that explains all Training Example Input→Output pairs exactly.
- Apply that rule to the Test Case Input to produce the Test Case Output.
- Format the final answer exactly as a space-separated grid with identical dimensions to the Test Case Input, one row per line, no extra spaces, blank lines, or commentary.

General approach:
1) Read and parse all training Input→Output pairs and the test Input.
2) Identi

Average Metric: 1.00 / 3 (33.3%): 100%|██████████| 3/3 [02:02<00:00, 40.96s/it]

2025/09/05 23:31:14 INFO dspy.evaluate.evaluate: Average Metric: 1.0 / 3 (33.3%)





2025/09/05 23:32:12 INFO dspy.teleprompt.gepa.gepa: Iteration 18: Proposed new text for predict: You will be given an ARC-style puzzle: a grid of integers (colors 0–9), several training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation that maps each training input to its output, then apply the same rule to the Test Case and return the resulting grid.

Input format
- Each grid is a rectangle of space-separated integers, one row per line.
- Training examples are labeled “Input:” and “Output:”.
- The Test Case provides only “Input:”. You must produce the corresponding “Output:” grid.

Core method
1) Carefully compare each training Input→Output pair.
   - What changes? Which cells, colors, locations? Are dimensions preserved or reduced?
   - Do the changes respect zeros as background (no overwrite) or do they overwrite non-zero cells (e.g., interior fills)?
2) Propose a single rule that explains every training pair exactly.
3) Validate the rule against a

In [14]:
# Show the instruction text held on the signature of `optimized_program`'s `predict`
# module. print() is used instead of a bare last expression so the multi-line prompt
# renders with real line breaks rather than as an escaped string repr.
# NOTE(review): `optimized_program` is defined outside this cell; presumably it is the
# program produced by the GEPA run whose log precedes this cell — confirm.
print(optimized_program.predict.signature.instructions)

You will be given ARC-style puzzles consisting of grids of integers (colors 0–9), several Training Input→Output pairs, and one Test Case Input. Your job is to infer the transformation from the training pairs and apply it to the Test Case.

Follow this process:

1) Understand the format
- Each grid is a rectangle of space-separated integers, one row per line.
- Training Examples are labeled “Input:” and “Output:”. The Test Case provides only “Input:”.
- The output grid’s size may match the input or differ if the rule implies resizing or stacking.

2) Build a hypothesis that explains ALL training pairs
- Propose a concrete rule, then verify it against every training pair. If any example contradicts your hypothesis, revise it.
- Use the full set of training pairs to constrain orientation, alignment, and collision/overwrite rules.

3) Recognize common ARC motifs and how to apply them
- Background and frames:
  - 0 is often background; large uniform regions or stable frames/bands (e.g., col

In [15]:
# Score the optimized program with `evaluate`, which is defined outside this cell
# (presumably a dspy.Evaluate instance, judging by the dspy.evaluate log line — confirm).
# The call is left as the cell's last expression so its return value is displayed.
# Cost note: the saved run covered 400 problems in roughly 28 minutes, so avoid
# re-running this cell casually.
evaluate(optimized_program)

Average Metric: 163.00 / 400 (40.8%): 100%|██████████| 400/400 [28:07<00:00,  4.22s/it]

2025/09/06 00:27:28 INFO dspy.evaluate.evaluate: Average Metric: 163 / 400 (40.8%)





Unnamed: 0,problem,example_answer,reasoning,pred_answer,arc_metric
0,Identify the pattern within the grids and solve the problem. Train...,3 2 3 2 3 2 7 8 7 8 7 8 2 3 2 3 2 3 8 7 8 7 8 7 3...,The 2x2 input is tiled into a 6x6 by repeating each 2-row stripe t...,3 2 3 2 3 2 7 8 7 8 7 8 2 3 2 3 2 3 8 7 8 7 8 7 3...,✔️ [1]
1,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7...,"Observing the training pairs, the transformation always removes th...",0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 3 3 3...,
2,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 0 0 0 0 2 2 2 2...,We observed that each training input contains one or more square f...,0 0 0 0 0 0 0 0 0 0 0 2 2 2 2 2 0 0 0 0 0 2 2 2 2...,✔️ [1]
3,Identify the pattern within the grids and solve the problem. Train...,7 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 7 0 0 0 0...,We identify distinct colored blocks (connected nonzero regions) ar...,7 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 7 0 0 0 0...,✔️ [1]
4,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 8 0 0 0 0 0...,The grids all contain a full band of color 8 (a complete row or a ...,0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 8 0 0 0 0 0...,
...,...,...,...,...,...
395,Identify the pattern within the grids and solve the problem. Train...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,I was unable to confidently infer a consistent transformation from...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,
396,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0...,We need to infer the tiling/transformation rule from the three tra...,0 3 3 0 3 3 0 3 3 0 3 3 0 3 3 0 0 0 6 0 0 6 0 0 6...,
397,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4...,The training examples show a 3-cell cross of color 2 (a central pl...,0 0 0 0 0 0 0 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4...,
398,Identify the pattern within the grids and solve the problem. Train...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 2 2 0 0 0 2 0...,I analyzed the training pairs and found repeating 4x4 tile-like st...,0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 8 8 0 0 0 2 0...,


EvaluationResult(score=40.75, results=<list of 400 results>)