In [1]:
deps_path = '/kaggle/input/llama-3-arc-deps'
! pip install --no-index --find-links {deps_path} --requirement {deps_path}/requirements.txt

Looking in links: /kaggle/input/llama-3-arc-deps
Processing /kaggle/input/llama-3-arc-deps/trl-0.9.3-py3-none-any.whl (from -r /kaggle/input/llama-3-arc-deps/requirements.txt (line 1))
Processing /kaggle/input/llama-3-arc-deps/peft-0.11.1-py3-none-any.whl (from -r /kaggle/input/llama-3-arc-deps/requirements.txt (line 2))
Processing /kaggle/input/llama-3-arc-deps/bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (from -r /kaggle/input/llama-3-arc-deps/requirements.txt (line 4))
Processing /kaggle/input/llama-3-arc-deps/tyro-0.8.4-py3-none-any.whl (from trl->-r /kaggle/input/llama-3-arc-deps/requirements.txt (line 1))
Processing /kaggle/input/llama-3-arc-deps/shtab-1.7.1-py3-none-any.whl (from tyro>=0.5.11->trl->-r /kaggle/input/llama-3-arc-deps/requirements.txt (line 1))
Installing collected packages: shtab, tyro, bitsandbytes, trl, peft
Successfully installed bitsandbytes-0.43.1 peft-0.11.1 shtab-1.7.1 trl-0.9.3 tyro-0.8.4


In [2]:
# For dataset
import pandas as pd
import json
import os
import ast
import re
import numpy as np
from datasets import Dataset

# For LLM
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    set_seed,
    pipeline
)
from trl import setup_chat_format

import torch
from time import time

# Set seed
set_seed(42)

In [3]:
# Define a template for formatting chat messages with the Llama 3 model
# This is model specific. Change it if you e.g. use Google's Gemma instead of Llama
#LLAMA_3_CHAT_TEMPLATE = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"""

# Set the data type for computations to float16, bfloat16 not supported on T4/P100
compute_dtype = getattr(torch, "float16")

# Configure the BitsAndBytes settings for 4-bit quantization to reduce memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Use double quantization for improved precision
    bnb_4bit_quant_type="nf4",  # Specify the quantization type
    bnb_4bit_compute_dtype=compute_dtype,  # Set the computation data type
)

# Specify the model ID for loading the fine-tuned Llama 3 model
# You can also test other models by replacing this line.
# For the original non-finetuned model use
# model_id = "/kaggle/input/llama-3/transformers/8b-chat-hf/1"
model_id = "/kaggle/input/3.1-8b_instruct/transformers/default/1"

# Record the start time to measure the loading duratio
time_start = time()
print("Loading model")
# Load the pre-trained model with specified configurations
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True, # Allow the model to use custom code from the repository
    quantization_config=bnb_config, # Apply the 4-bit quantization configuration
    attn_implementation='sdpa', # Use scaled-dot product attention for better performance
    torch_dtype=compute_dtype, # Set the data type for the model
    use_cache=False, # Disable caching to save memory
    device_map='auto', # Automatically map the model to available devices (e.g., GPUs)
)

# Load the tokenizer associated with the model
tokenizer = AutoTokenizer.from_pretrained(model_id)
#tokenizer.chat_template = LLAMA_3_CHAT_TEMPLATE # Apply the chat message template

# Record the end time and print the duration for preparing the model and tokenizer
time_end = time()
print(f"Prepare model, tokenizer: {round(time_end-time_start, 3)} sec.")

Loading model


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Prepare model, tokenizer: 121.904 sec.


In [4]:
import json

# Load the files
with open('/kaggle/input/arc-prize-2024/arc-agi_evaluation_challenges.json', 'r') as f:
    challenges = json.load(f)

with open('/kaggle/input/arc-prize-2024/arc-agi_evaluation_solutions.json', 'r') as f:
    solutions = json.load(f)

# Displaying the first few entries from challenges
#print("Challenges - First few entries:")
#for idx, (key, value) in enumerate(challenges.items()):
#    print(f"Key: {key}, Value (truncated): {str(value)[:500]}...")  # Adjust truncation as needed
#    if idx >= 2:  # Limiting to first few examples for brevity
#        break

# Displaying the first few entries from solutions
#print("\nSolutions - First few entries:")
#for idx, (key, value) in enumerate(solutions.items()):
#    print(f"Key: {key}, Solution: {value}")
#    if idx >= 2:
#        break


In [5]:
import torch

def evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, key):
    # Determine the device the model is already on
    device = next(model.parameters()).device

    # Retrieve the specified challenge data
    if key not in challenges:
        print(f"Key {key} not found in challenges.")
        return

    challenge_data = challenges[key]
    test_inputs = [test_case['input'] for test_case in challenge_data['test']]
    training_examples = challenge_data['train']

    # Generate model output for each test input, while including previous examples for reasoning
    model_outputs = []
    for input_data in test_inputs:
        # Format the training examples into a chain of thought
        examples = ""
        for example in training_examples:
            input_example = example['input']
            output_example = example['output']
            # Format the example input and output with reasoning prompt
            examples += f"Example Input: {input_example}\nExample Output: {output_example}\n\n"
        
        # Add the test input with an instruction for the model to apply the same transformation
        input_text = f"Given the previous examples, please apply the same transformation rule to the following input:\nTest Input: {input_data}\nTransformation Result (Complete the full matrix):"

        # Combine the training examples and the test input into one prompt
        full_prompt = examples + input_text

        # Tokenize the prompt and move it to the model's device
        inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

        # Remove 'token_type_ids' if present, as LLaMA models do not use it
        if 'token_type_ids' in inputs:
            del inputs['token_type_ids']

        # Generate model output with more tokens to ensure full matrix is generated
        with torch.no_grad():
            output_ids = model.generate(
                **inputs, 
                max_new_tokens=300,  # Increased limit to allow for a larger output
                temperature=0.7,    # Control randomness
                top_p=0.9,          # Nucleus sampling
                do_sample=False     # Use greedy decoding to reduce randomness
            )
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        model_outputs.append(output_text)
    
    # Retrieve the expected solution
    expected_solution = solutions.get(key)
    
    # Display results
    print(f"Key: {key}")
    print(f"Test Inputs: {test_inputs}")
    print(f"Model Outputs: {model_outputs}")
    print(f"Expected Solution: {expected_solution}")

    # Compare model output with expected solution (adjust comparison logic as needed)
    if model_outputs == expected_solution:
        print("Result: Match")
    else:
        print("Result: No Match")

# Example usage with a specific key (replace '00576224' with your desired key)
evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, '00576224')


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Key: 00576224
Test Inputs: [[[3, 2], [7, 8]]]
Model Outputs: ['Example Input: [[8, 6], [6, 4]]\nExample Output: [[8, 6, 8, 6, 8, 6], [6, 4, 6, 4, 6, 4], [6, 8, 6, 8, 6, 8], [4, 6, 4, 6, 4, 6], [8, 6, 8, 6, 8, 6], [6, 4, 6, 4, 6, 4]]\n\nExample Input: [[7, 9], [4, 3]]\nExample Output: [[7, 9, 7, 9, 7, 9], [4, 3, 4, 3, 4, 3], [9, 7, 9, 7, 9, 7], [3, 4, 3, 4, 3, 4], [7, 9, 7, 9, 7, 9], [4, 3, 4, 3, 4, 3]]\n\nGiven the previous examples, please apply the same transformation rule to the following input:\nTest Input: [[3, 2], [7, 8]]\nTransformation Result (Complete the full matrix): [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]\nSolution to input1:\n        [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]\n        ']
Expected Solution: [[[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 

In [6]:
import torch

def extract_matrix_from_output(output_text):
    # This function extracts the matrix from the model's output by cutting off unwanted parts.
    
    # Find the portion of the output after "Solution to input1:" and clean it up.
    if "Solution to input1:" in output_text:
        matrix_text = output_text.split("Solution to input1:")[-1]  # Get the part after "Solution to input1:"
    else:
        matrix_text = output_text  # If the string isn't found, use the full output
    
    # Clean up the string by removing extra spaces and newlines
    matrix_text = matrix_text.strip()
    matrix_text = matrix_text.replace("\n", " ")  # Replace line breaks with spaces
    matrix_text = matrix_text.replace("        ", "")  # Remove excess spaces (indentation)
    
    # Try to convert the cleaned-up string to a list (matrix form)
    try:
        matrix = eval(matrix_text)  # Convert the string representation of a list to a Python list
    except Exception as e:
        print(f"Error parsing matrix: {e}")
        matrix = None
    
    return matrix

def evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, key):
    # Determine the device the model is already on
    device = next(model.parameters()).device

    # Retrieve the specified challenge data
    if key not in challenges:
        print(f"Key {key} not found in challenges.")
        return

    challenge_data = challenges[key]
    test_inputs = [test_case['input'] for test_case in challenge_data['test']]
    training_examples = challenge_data['train']

    # Generate model output for each test input, while including previous examples for reasoning
    model_outputs = []
    for input_data in test_inputs:
        # Format the training examples into a chain of thought
        examples = ""
        for example in training_examples:
            input_example = example['input']
            output_example = example['output']
            # Format the example input and output with reasoning prompt
            examples += f"Example Input: {input_example}\nExample Output: {output_example}\n\n"
        
        # Add the test input with an instruction for the model to apply the same transformation
        input_text = f"Given the previous examples, please apply the same transformation rule to the following input:\nTest Input: {input_data}\nTransformation Result (Complete the full matrix):"

        # Combine the training examples and the test input into one prompt
        full_prompt = examples + input_text

        # Tokenize the prompt and move it to the model's device
        inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

        # Remove 'token_type_ids' if present, as LLaMA models do not use it
        if 'token_type_ids' in inputs:
            del inputs['token_type_ids']

        # Generate model output with more tokens to ensure full matrix is generated
        with torch.no_grad():
            output_ids = model.generate(
                **inputs, 
                #max_new_tokens=300,  # Increased limit to allow for a larger output
                temperature=0.7,    # Control randomness
                top_p=0.9,          # Nucleus sampling
                do_sample=False     # Use greedy decoding to reduce randomness
            )
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Extract the matrix from the model's output
        extracted_matrix = extract_matrix_from_output(output_text)
        
        if extracted_matrix is not None:
            model_outputs.append(extracted_matrix)
    
    # Retrieve the expected solution
    expected_solution = solutions.get(key)
    
    # Display results
    print(f"Key: {key}")
    print(f"Test Inputs: {test_inputs}")
    print(f"Model Outputs: {model_outputs}")
    print(f"Expected Solution: {expected_solution}")

    # Compare the extracted model output with the expected solution (matrix only)
    if model_outputs == expected_solution:
        print("Result: Match")
    else:
        print("Result: No Match")



In [7]:
evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, '00576224')

Key: 00576224
Test Inputs: [[[3, 2], [7, 8]]]
Model Outputs: [[[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]]
Expected Solution: [[[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]]
Result: Match


In [8]:
import random
import json

# Load the challenge dataset
with open("/kaggle/input/arc-prize-2024/arc-agi_evaluation_challenges.json", "r") as f:
    examples = json.load(f)

# Randomly select 50 keys from the dataset
random_keys = random.sample(list(examples.keys()), 50)

# Print the selected keys
for key in random_keys:
    print(key)


d56f2372
212895b5
0a2355a6
f21745ec
5af49b42
516b51b7
4c177718
2a5f8217
f0afb749
1e97544e
e41c6fd3
b942fd60
1acc24af
c7d4e6ad
903d1b4a
0c786b71
0becf7df
1c56ad9f
4aab4007
4ff4c9da
aab50785
ca8f78db
0b17323b
bd14c3bf
42a15761
e99362f0
da515329
e78887d1
8fbca751
4acc7107
96a8c0cd
c6e1b8da
5b692c0f
03560426
f83cb3f6
32e9702f
e760a62e
72207abc
31adaf00
48f8583b
f9a67cb5
705a3229
817e6c09
1d398264
79369cc6
73182012
cad67732
5833af48
11e1fe23
ecaa0ec1


In [9]:
import torch

def extract_matrix_from_output(output_text):
    # Extracts the matrix from the model's output by cutting off unwanted parts.
    if "Solution to input1:" in output_text:
        matrix_text = output_text.split("Solution to input1:")[-1]
    else:
        matrix_text = output_text
    matrix_text = matrix_text.strip().replace("\n", " ").replace("        ", "")
    try:
        import ast
        matrix = ast.literal_eval(matrix_text)
    except Exception as e:
        print(f"Error parsing matrix: {e}")
        matrix = None
    return matrix

def evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, key):
    print(f"Evaluating key: {key}...")
    
    device = next(model.parameters()).device
    if key not in challenges:
        print(f"Key {key} not found in challenges.")
        return
    
    challenge_data = challenges[key]
    test_inputs = [test_case['input'] for test_case in challenge_data['test']]
    training_examples = challenge_data['train']
    
    model_outputs = []
    for input_data in test_inputs:
        examples = ""
        for example in training_examples:
            input_example = example['input']
            output_example = example['output']
            examples += f"Example Input: {input_example}\nExample Output: {output_example}\n\n"
        
        input_text = f"Given the previous examples, please apply the same transformation rule to the following input:\nTest Input: {input_data}\nTransformation Result (Complete the full matrix):"
        full_prompt = examples + input_text
        
        # Tokenize input with a higher max_length setting
        max_input_length = 2048  # Match with fine-tuning context length
        inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=max_input_length).to(device)
        
        if 'token_type_ids' in inputs:
            del inputs['token_type_ids']
        
        print("Generating output...")
        with torch.no_grad():
            try:
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=2048,  # Adjust this value if needed
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=False
                )
            except Exception as e:
                print(f"Error during generation: {e}")
                continue  # Skip to the next test input if generation fails
        
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"Decoded output text: {output_text}")
        
        extracted_matrix = extract_matrix_from_output(output_text)
        
        if extracted_matrix is not None:
            model_outputs.append(extracted_matrix)
    
    expected_solution = solutions.get(key)
    
    print(f"Key: {key}")
    print(f"Test Inputs: {test_inputs}")
    print(f"Model Outputs: {output_text}")
    print(f"Expected Solution: {expected_solution}")

    if model_outputs == expected_solution:
        print("Result: Match")
    else:
        print("Result: No Match")

# Example usage with a specific key
evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, '00576224')


Evaluating key: 00576224...
Generating output...
Decoded output text: Example Input: [[8, 6], [6, 4]]
Example Output: [[8, 6, 8, 6, 8, 6], [6, 4, 6, 4, 6, 4], [6, 8, 6, 8, 6, 8], [4, 6, 4, 6, 4, 6], [8, 6, 8, 6, 8, 6], [6, 4, 6, 4, 6, 4]]

Example Input: [[7, 9], [4, 3]]
Example Output: [[7, 9, 7, 9, 7, 9], [4, 3, 4, 3, 4, 3], [9, 7, 9, 7, 9, 7], [3, 4, 3, 4, 3, 4], [7, 9, 7, 9, 7, 9], [4, 3, 4, 3, 4, 3]]

Given the previous examples, please apply the same transformation rule to the following input:
Test Input: [[3, 2], [7, 8]]
Transformation Result (Complete the full matrix): [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]
Solution to input1:
        [[3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8], [2, 3, 2, 3, 2, 3], [8, 7, 8, 7, 8, 7], [3, 2, 3, 2, 3, 2], [7, 8, 7, 8, 7, 8]]
        
Key: 00576224
Test Inputs: [[[3, 2], [7, 8]]]
Model Outputs: Example Input: [[8, 6], [6, 4]]
Example Output: [[8, 6, 8, 6, 8, 6], [6, 4, 

In [10]:
evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, 'e0fb7511')

Evaluating key: e0fb7511...
Generating output...
Decoded output text: Example Input: [[1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0], [1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1], [1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1]]
Example Output: [[1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 8, 8, 8, 1, 1, 8, 1, 1, 0], [1, 1, 8, 8, 1, 1, 8, 1, 1, 8, 1, 1, 1], [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1], [0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [11]:
print("Hello")

Hello


In [12]:
evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, '414297c0')

Evaluating key: 414297c0...
Generating output...
Decoded output text: Example Input: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0, 2, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 2, 7, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 2, 4, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 2, 8, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 0], [0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
challenges['414297c0']


{'test': [{'input': [[0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0,
     0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 2, 7, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    [0, 4,

In [14]:
def evaluate_single_example_with_reasoning(model, tokenizer, challenges, solutions, key):
    print(f"Evaluating key: {key}...")
    
    device = next(model.parameters()).device
    if key not in challenges:
        print(f"Key {key} not found in challenges.")
        return
    
    challenge_data = challenges[key]
    test_inputs = [test_case['input'] for test_case in challenge_data['test']]
    training_examples = challenge_data['train']
    
    model_outputs = []
    for input_data in test_inputs:
        examples = ""
        for example in training_examples:
            input_example = example['input']
            output_example = example['output']
            examples += f"Example Input: {input_example}\nExample Output: {output_example}\n\n"
        
        input_text = f"Given the previous examples, please apply the same transformation rule to the following input:\nTest Input: {input_data}\nTransformation Result (Complete the full matrix):"
        full_prompt = examples + input_text
        
        # Tokenization length check before generating output
        inputs = tokenize_and_check_length(full_prompt, tokenizer)
        if inputs is None:
            print("Skipping input due to excessive token length.")
            continue
        
        inputs = inputs.to(device)
        if 'token_type_ids' in inputs:
            del inputs['token_type_ids']
        
        print("Generating output...")
        with torch.no_grad():
            try:
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=2048,  # Adjust this value if needed
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=False
                )
            except Exception as e:
                print(f"Error during generation: {e}")
                continue  # Skip to the next test input if generation fails
        
        output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"Decoded output text: {output_text}")
        
        extracted_matrix = extract_matrix_from_output(output_text)
        
        if extracted_matrix is not None:
            model_outputs.append(extracted_matrix)
    
    expected_solution = solutions.get(key)
    
    print(f"Key: {key}")
    print(f"Test Inputs: {test_inputs}")
    print(f"Model Outputs: {model_outputs}")
    print(f"Expected Solution: {expected_solution}")

    if model_outputs == expected_solution:
        print("Result: Match")
    else:
        print("Result: No Match")
