## Importing packages and data

In [17]:
# Importing libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from   matplotlib import colors
import seaborn as sns

import json
import os
from pathlib import Path

from subprocess import Popen, PIPE, STDOUT
from glob import glob

import langchain # Main LangChain import
from langchain_openai import ChatOpenAI # To work with OpenAI
# from langchain_anthropic import ChatAnthropic # To work with Anthropic (optional)
# from langchain_google_genai import ChatGoogleGenerativeAI # To work with Gemini (optional)
from langchain_core.output_parsers import JsonOutputParser # To help with structured output
from langchain_core.prompts import PromptTemplate # To help create our prompt
from langchain_core.pydantic_v1 import BaseModel, Field # To help with defining what output structure we want
from pydantic import BaseModel

from typing import List, Tuple


In [None]:
# ==========================
# Loading Files
# ==========================

# `load_json` is called throughout this cell but is not defined in any visible
# cell of the notebook; it is defined here so the cell runs on a fresh kernel.
def load_json(file_path):
    """Read the JSON file at `file_path` and return the parsed object."""
    with open(file_path) as json_file:
        return json.load(json_file)


base_path = 'data/challenges/'

# Subset of challenges for testing (containers of task IDs used for filtering below)
subset_ids_challenges = load_json(base_path + '28_8x10_challenges.json')
subset_ids_evaluation = load_json(base_path + '28_15x15_evaluation.json')

# Load evaluation dataset
evaluation_challenges = load_json(base_path + 'arc-agi_evaluation_challenges.json')
evaluation_solutions = load_json(base_path + 'arc-agi_evaluation_solutions.json')

# Load training dataset
training_challenges = load_json(base_path + 'arc-agi_training_challenges.json')
training_solutions = load_json(base_path + 'arc-agi_training_solutions.json')

# Filter training challenges and solutions to only include the subset IDs
training_challenges = {k: v for k, v in training_challenges.items() if k in subset_ids_challenges}
training_solutions = {k: v for k, v in training_solutions.items() if k in subset_ids_challenges}

# Filter evaluation challenges and solutions to only include the subset IDs
evaluation_challenges = {k: v for k, v in evaluation_challenges.items() if k in subset_ids_evaluation}
evaluation_solutions = {k: v for k, v in evaluation_solutions.items() if k in subset_ids_evaluation}

# Loaded for completeness; no cell in this notebook reads it afterwards
test_challenges = load_json(base_path + 'arc-agi_test_challenges.json')

# Registry of the filtered splits, keyed by the name that main() accepts as `task_set`
task_sets = {
    'training': {
        'challenges': training_challenges,
        'solutions': training_solutions,
    },
    'evaluation': {
        'challenges': evaluation_challenges,
        'solutions': evaluation_solutions,
    }
}

In [None]:
# ==========================
# Function to load tasks from a pre-loaded task set
# ==========================

def load_tasks_from_file(task_set):
    """
    Unpack an already-loaded task set into its two parts.

    task_set: dict holding a 'challenges' entry and a 'solutions' entry
    Returns a (challenges, solutions) tuple. Despite the name, nothing is read from disk here.
    """
    return task_set['challenges'], task_set['solutions']

In [19]:
# Sanity check: the number of challenges and the number of solutions should agree
print(
    f'Number of training challenges = {len(training_challenges)}',
    f'Number of solutions of training challenges = {len(training_solutions)}',
    sep='\n',
)

Number of training challenges = 400
Number of solutions of training challenges = 400


In [20]:
# Loading tasks from the 'training' task set
# Pull the training split out of the task-set registry
challenges, solutions = load_tasks_from_file(task_sets['training'])
print(challenges['0520fde7'])  # Look at one raw task by its ID


{'test': [{'input': [[1, 0, 1, 5, 1, 0, 1], [0, 1, 0, 5, 1, 0, 1], [1, 0, 1, 5, 0, 1, 0]]}], 'train': [{'input': [[1, 0, 0, 5, 0, 1, 0], [0, 1, 0, 5, 1, 1, 1], [1, 0, 0, 5, 0, 0, 0]], 'output': [[0, 0, 0], [0, 2, 0], [0, 0, 0]]}, {'input': [[1, 1, 0, 5, 0, 1, 0], [0, 0, 1, 5, 1, 1, 1], [1, 1, 0, 5, 0, 1, 0]], 'output': [[0, 2, 0], [0, 0, 2], [0, 2, 0]]}, {'input': [[0, 0, 1, 5, 0, 0, 0], [1, 1, 0, 5, 1, 0, 1], [0, 1, 1, 5, 1, 0, 1]], 'output': [[0, 0, 0], [2, 0, 0], [0, 0, 2]]}]}


#### Initializing the LLM client

In [None]:
import os
from dotenv import load_dotenv

# Read the key/value pairs in api.env into the process environment
load_dotenv('api.env')

# Fetch the OpenAI API key from the environment
openai_api_key = os.getenv('OPENAI_API_KEY')

# Build the chat client used by every prediction call below
#llm = ChatOpenAI(model='ft:gpt-4o-mini-2024-07-18:personal:arc-agi-4omini:AVHPjOLN', openai_api_key=openai_api_key, max_tokens=3000)
llm = ChatOpenAI(
    model='ft:gpt-4o-2024-08-06:personal:4o-arc-agi:AXmR1Dqz',
    openai_api_key=openai_api_key,
    max_tokens=3000,
)

## And in case you want to try Anthropic or Gemini
# llm = ChatAnthropic(model='claude-3-5-sonnet-20240620', api_key=UserSecretsClient().get_secret("ANTHROPIC_API_KEY"), max_tokens=3000)
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=UserSecretsClient().get_secret("GOOGLE_API_KEY"), max_tokens=3000)

## Building an MVP: an OpenAI model that directly predicts the test output

In [22]:
# converting train and test pairs to a string format ideal for LLMs
def json_task_to_string(challenge_tasks: dict, task_id: str, test_input_index: int) -> str:
    """
    Render one task as the plain-text block that gets embedded in the LLM prompt.

    challenge_tasks: dict mapping task ids to tasks (each with 'train' and 'test' lists)
    task_id: str the id of the task to render
    test_input_index: int which entry of the task's 'test' list to render as the test grid

    Returns the training pairs (input and output grids), followed by the chosen
    test input and a trailing "Your Response:" cue.

    NOTE(review): training rows are written with a trailing comma but test rows are
    not. That asymmetry is kept on purpose here, since changing the prompt format
    may change model behaviour.
    """
    task = challenge_tasks[task_id]
    train_pairs = task['train']
    test_inputs = task['test']

    def render_rows(grid, row_suffix):
        # One line per grid row, each preceded by a newline
        return "".join(f"\n{str(row)}{row_suffix}" for row in grid)

    pieces = ["Training Examples\n"]

    for number, pair in enumerate(train_pairs, start=1):
        pieces.append(f"Example {number}: Input\n[")
        pieces.append(render_rows(pair['input'], ","))
        pieces.append("]\n\n")
        pieces.append(f"Example {number}: Output\n[")
        pieces.append(render_rows(pair['output'], ","))
        pieces.append("]\n\n")

    pieces.append("Test\n[")
    pieces.append(render_rows(test_inputs[test_input_index]['input'], ""))
    pieces.append("]\n\nYour Response:")

    return "".join(pieces)

In [23]:
# an example of how the function works
# Render the first test input of task '0520fde7' to show the text the LLM will receive
task_string = json_task_to_string(challenge_tasks=challenges, task_id='0520fde7', test_input_index=0)
print(task_string)

Training Examples
Example 1: Input
[
[1, 0, 0, 5, 0, 1, 0],
[0, 1, 0, 5, 1, 1, 1],
[1, 0, 0, 5, 0, 0, 0],]

Example 1: Output
[
[0, 0, 0],
[0, 2, 0],
[0, 0, 0],]

Example 2: Input
[
[1, 1, 0, 5, 0, 1, 0],
[0, 0, 1, 5, 1, 1, 1],
[1, 1, 0, 5, 0, 1, 0],]

Example 2: Output
[
[0, 2, 0],
[0, 0, 2],
[0, 2, 0],]

Example 3: Input
[
[0, 0, 1, 5, 0, 0, 0],
[1, 1, 0, 5, 1, 0, 1],
[0, 1, 1, 5, 1, 0, 1],]

Example 3: Output
[
[0, 0, 0],
[2, 0, 0],
[0, 0, 2],]

Test
[
[1, 0, 1, 5, 1, 0, 1]
[0, 1, 0, 5, 1, 0, 1]
[1, 0, 1, 5, 0, 1, 0]]

Your Response:


In [24]:
# Using a json output parser to parse the output, since LLMs aren't perfect at generating valid json
# Defining a prediction as a list of lists
# NOTE(review): the import cell imports BaseModel twice (from langchain_core.pydantic_v1, then
# from pydantic), so the later pydantic import is the one bound here, while Field still comes
# from langchain_core.pydantic_v1 -- confirm that mixing the two is intended.
class ARCPrediction(BaseModel):
    # The predicted output grid as a list of rows; the element type of each row is left
    # unconstrained here and is checked separately in get_task_prediction.
    prediction: List[List] = Field(..., description="A prediction for a task")

### Creating functions to build the prompt, call the model, parse its output, and retry on failure

In [25]:
def _is_valid_grid(candidate) -> bool:
    """Return True when `candidate` is a non-empty, rectangular list of lists of ints."""
    if not isinstance(candidate, list) or not candidate:
        return False
    # Every row must itself be a non-empty list
    if not all(isinstance(row, list) and row for row in candidate):
        return False
    width = len(candidate[0])
    # bool is a subclass of int, so exclude it explicitly
    return all(
        len(row) == width
        and all(isinstance(cell, int) and not isinstance(cell, bool) for cell in row)
        for row in candidate
    )


def get_task_prediction(challenge_tasks, task_id, test_input_index) -> List[List]:
    """
    challenge_tasks: dict a list of tasks
    task_id: str the id of the task we want to get a prediction for
    test_input_index: the index of your test input. 96% of tests only have 1 input.

    Given a task, predict the test output.

    Returns the predicted grid as a list of rows of ints.
    Raises ValueError when the model output is not a non-empty rectangular list of
    lists of ints (empty grids, empty rows, ragged rows, nested grids and booleans
    are all rejected). The caller treats any exception as a signal to retry.
    """

    # Get the string representation of your task
    task_string = json_task_to_string(challenge_tasks, task_id, test_input_index)

    # Set up a parser to inject instructions into the prompt template.
    parser = JsonOutputParser(pydantic_object=ARCPrediction)

    # Create your prompt template. This is very rudimentary! You should edit this to do much better.
    # For example, we don't tell the model what it's first attempt was (so it can do a different one), that might help!
    # NOTE(review): the adjacent string literals below are concatenated without separating
    # spaces or newlines ("...pattern.Identify...", "...outputJust..."). The text is left
    # byte-identical because changing the prompt may change model behaviour.
    prompt = PromptTemplate(
        template="You are a bot that is very good at solving puzzles. Below is a list of input and output pairs with a pattern." 
                    "Identify the pattern, then apply that pattern to the test input to give a final output"
                    "Just give valid json list of lists response back, nothing else. Do not explain your thoughts."
                    "{format_instructions}\n{task_string}\n",
        input_variables=["task_string"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # Wrap up your chain with LCEL
    chain = prompt | llm | parser

    # Optional, print out the prompt if you want to see it. If you use LangSmith you could view this there as well.
    # print (f"Prompt:\n\n{prompt.format(task_string=task_string)}")

    # Finally, go get your prediction from your LLM. This will make the API call.
    output = chain.invoke({"task_string": task_string})

    # Because the output is structured, get the prediction key. If it isn't there, then just get the output
    if isinstance(output, dict):
        prediction = output.get('prediction', output)
    else:
        prediction = output

    # Safety measure to error out if you don't get a usable grid back. This will spark a retry later.
    # The previous check accepted [] and [[]] (all() over an empty iterable is True), ragged
    # rows and booleans, none of which can ever match a solution grid.
    if not _is_valid_grid(prediction):
        print("Warning: Output must be a list of lists of integers.")
        print (f"Errored Output: {prediction}")
        raise ValueError("Output must be a non-empty rectangular list of lists of integers.")

    # Let's find the shape of our prediction (validation guarantees at least one row)
    num_rows = len(prediction)
    num_cols = len(prediction[0])
    print(f"    Prediction Grid Size: {num_rows}x{num_cols}\n")

    return prediction

In [26]:
def run_model(challenges, NUM_ATTEMPTS=2, RETRY_ATTEMPTS=3, NUM_TASKS=None):
    """
    challenges: dict a list of challenges. This should come directly from your _challenges file
    NUM_ATTEMPTS: int the number of times to attempt a prediction. The official competition has 2 attempts.
    RETRY_ATTEMPTS: int the number of times to retry a prediction if it fails
    NUM_TASKS: int, if set, the number of tasks you'd like to test. If None then all
               challenges will be tested. A value of 0 (or less) runs no tasks.

    Loop through your challenges and build the submission dict:
        {task_id: [{"attempt_1": grid, "attempt_2": grid, ...}, ...one dict per test pair...]}
    An attempt whose retries all fail is recorded as an empty list.
    """

    # A dict to hold your submissions that you'll return after all predictions are made
    submission = {}

    # Run through each task in your challenge set
    for i, task_id in enumerate(challenges):
        # Enforce the task limit before doing any work. Checking it after the task
        # (with ==) meant NUM_TASKS=0 never matched and every task was processed.
        if NUM_TASKS is not None and i >= NUM_TASKS:
            break

        task_attempts = []  # List to store all attempts for the current task

        # Go through each test pair to get a prediction. 96% of challenges have 1 pair.
        for t, _pair in enumerate(challenges[task_id]['test']):
            print(f"Starting task #{i + 1} ({task_id}), pair #{t+1}")

            # Dictionary to store attempts for the current test pair
            pair_attempts = {}

            # Run through each prediction attempt
            for attempt in range(1, NUM_ATTEMPTS + 1):
                attempt_key = f"attempt_{attempt}"
                # Start empty; this value is what remains if every retry fails
                pair_attempts[attempt_key] = []

                # Try to get a prediction, with retries in case of failure
                for retry in range(RETRY_ATTEMPTS):
                    try:
                        print(f"    Predicting attempt #{attempt}, retry #{retry + 1}")
                        prediction = get_task_prediction(challenge_tasks=challenges,
                                                         task_id=task_id,
                                                         test_input_index=t)

                        # If you get a valid prediction with no error, then log the attempt
                        pair_attempts[attempt_key] = prediction
                        break  # Break the retry loop if prediction is successful
                    except Exception as e:
                        # Any failure (API error, parse error, validation error) triggers a retry
                        print(f"Retrying: {e}")

            # After you get your attempts, append them to the task attempts
            task_attempts.append(pair_attempts)

        # Store the task attempts in the submission with the task_id as the key
        submission[task_id] = task_attempts

    return submission

### Creating the submission file and comparing it with the solutions file

In [27]:
# create submission file
def create_submission_file(submission, file_name='submission.json'):
    """
    Write `submission` as JSON to `file_name` (overwriting any existing file)
    and print a confirmation line.
    """
    with open(file_name, "w") as out_file:
        json.dump(obj=submission, fp=out_file)

    print (f"Submission saved to {file_name}")

In [28]:
# create function to compare submission with solutions
def score_submission(submission_file_name, solutions) -> dict:
    """
    submission_file_name: str, the file name of your submission file
    solutions: dict, the ground truth solutions you'd like to test against
               (task_id -> list of expected output grids, one per test pair)

    Read a submission from file and score it. Each task is worth the fraction of
    its test pairs for which at least one attempt equals the solution exactly.
    A task with no test pairs scores 0 but is still counted as a scored task.

    Returns a dict with keys 'total_score' (sum of per-task scores) and
    'total_tasks_scored' (number of tasks in the submission).
    Raises KeyError/IndexError if a submitted task or pair has no matching solution.
    """
    print (f"Scoring {submission_file_name}\n")

    # Open your submission file
    with open(submission_file_name, "r") as file:
        submission = json.load(file)

    total_score = 0
    total_tasks = 0

    # Loop through each task in your submission to grade it
    for task_id, task_submission in submission.items():
        total_tasks += 1
        num_pairs = len(task_submission)
        pairs_correct = 0

        # Go through each test pair. Most tasks will only have 1
        for pair_index, pair_attempts in enumerate(task_submission):
            print(f"Scoring Task {task_id} pair #{pair_index+1}")
            expected = solutions[task_id][pair_index]

            # A pair counts as soon as any one of its attempts is an exact match
            for attempt_key, attempt in pair_attempts.items():
                if attempt == expected:
                    print(f"Task Id {task_id} pair {pair_index+1} {attempt_key} matches solution")
                    pairs_correct += 1
                    break

        # Guard the division: an empty pair list used to raise ZeroDivisionError
        if num_pairs:
            total_score += pairs_correct / num_pairs

    return {
        'total_score': total_score,
        'total_tasks_scored': total_tasks
    }

#### The main function to bring everything together

In [None]:
def main(task_set='training', NUM_TASKS=None, submission_file_name='submission_finetune4o.json'):
    """
    Run the full pipeline for one task set: predict, save, score, and print the result.

    task_set: str, key into the module-level `task_sets` dict ('training' or 'evaluation')
    NUM_TASKS: int or None, forwarded to run_model to cap how many tasks are attempted
    submission_file_name: str, path the submission JSON is written to and then scored from
    """
    # Load datasets
    challenges, solutions = load_tasks_from_file(task_set=task_sets[task_set])

    # Run the model
    submission = run_model(challenges, NUM_TASKS=NUM_TASKS)

    # Create (and overwrite) a submission file
    create_submission_file(submission, file_name=submission_file_name)

    # Score the submission
    score_result = score_submission(solutions = solutions, submission_file_name=submission_file_name)

    total_score = score_result['total_score']
    total_tasks = score_result['total_tasks_scored']
    # Guard the percentage: with zero scored tasks the division used to raise ZeroDivisionError
    percent = round(total_score / total_tasks * 100, 2) if total_tasks else 0.0

    print(f"Final score: {total_score} of {total_tasks} ({percent}%)")

# RUNNING THE MODEL

In [30]:
# Predict and score the first 10 tasks of the evaluation task set
main(NUM_TASKS=10, task_set='evaluation')

Starting task #1 (00576224), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 6x6

    Predicting attempt #2, retry #1




    Prediction Grid Size: 6x6

Starting task #2 (009d5c81), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 14x14

    Predicting attempt #2, retry #1




    Prediction Grid Size: 14x14

Starting task #3 (00dbd492), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 24x20

    Predicting attempt #2, retry #1




    Prediction Grid Size: 18x20

Starting task #4 (03560426), pair #1
    Predicting attempt #1, retry #1
    Prediction Grid Size: 10x10

    Predicting attempt #2, retry #1




    Prediction Grid Size: 10x10

Starting task #5 (05a7bcf2), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 36x30

    Predicting attempt #2, retry #1




    Prediction Grid Size: 26x28

Starting task #6 (0607ce86), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 27x22

    Predicting attempt #2, retry #1




    Prediction Grid Size: 24x22

Starting task #7 (0692e18c), pair #1
    Predicting attempt #1, retry #1




Errored Output: [[[0, 0, 0, 3, 0, 0, 0, 0, 0], [0, 0, 3, 3, 0, 0, 0, 0, 0], [0, 3, 3, 0, 0, 0, 0, 0, 0], [0, 3, 0, 0, 3, 0, 0, 0, 0], [0, 0, 0, 0, 3, 3, 0, 0, 0], [0, 0, 0, 0, 0, 3, 3, 0, 0], [0, 0, 0, 0, 0, 0, 0, 3, 3], [0, 0, 0, 0, 0, 0, 3, 0, 3], [0, 0, 0, 0, 0, 0, 0, 3, 0]]]
Retrying: Output must be a list of lists of integers.
    Predicting attempt #1, retry #2




    Prediction Grid Size: 9x9

    Predicting attempt #2, retry #1




    Prediction Grid Size: 9x9

Starting task #8 (070dd51e), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 20x20

    Predicting attempt #2, retry #1




    Prediction Grid Size: 21x20

Starting task #9 (08573cc6), pair #1
    Predicting attempt #1, retry #1




    Prediction Grid Size: 13x13

    Predicting attempt #2, retry #1




    Prediction Grid Size: 14x13

Starting task #10 (0934a4d8), pair #1
    Predicting attempt #1, retry #1




Errored Output: [[[6, 6, 5, 1], [4, 4, 7, 7], [7, 7, 7, 2]], [[2, 2, 6, 4], [6, 2, 2, 2], [2, 6, 6, 6]], [[7, 7, 7, 4], [4, 4, 1, 5], [4, 4, 5, 1], [2, 6, 4, 6]]]
Retrying: Output must be a list of lists of integers.
    Predicting attempt #1, retry #2




    Prediction Grid Size: 4x3

    Predicting attempt #2, retry #1




    Prediction Grid Size: 5x6

Submission saved to submission.json
Scoring submission.json

Scoring Task 00576224 pair #1
Task Id 00576224 pair 1 attempt_1 matches solution
Scoring Task 009d5c81 pair #1
Scoring Task 00dbd492 pair #1
Scoring Task 03560426 pair #1
Scoring Task 05a7bcf2 pair #1
Scoring Task 0607ce86 pair #1
Scoring Task 0692e18c pair #1
Scoring Task 070dd51e pair #1
Scoring Task 08573cc6 pair #1
Scoring Task 0934a4d8 pair #1
Final score: 1.0 of 10 (10.0%)
