In [149]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import seaborn as sns
import json
import os
from pathlib import Path
from subprocess import Popen, PIPE, STDOUT
from glob import glob
import langchain  # Main LangChain import
from langchain.chat_models import ChatOpenAI  # Updated import path for ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser
from langchain.prompts import PromptTemplate, ChatPromptTemplate  # Updated import path
from langchain.schema import AIMessage  # Updated import path
from pydantic import BaseModel, Field  # Use pydantic directly
from typing import List, Tuple, Dict
from typing_extensions import TypedDict
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langgraph.checkpoint.memory import MemorySaver
import logging
import sys
from dotenv import load_dotenv

In [150]:
# ==========================
# Configure Logging
# ==========================

# Create a logger
logger = logging.getLogger('ReWOO_LangGraph')
logger.setLevel(logging.DEBUG)

# Create handlers
c_handler = logging.StreamHandler(sys.stdout)
f_handler = logging.FileHandler('rewoolanggraph.log', mode='w')
c_handler.setLevel(logging.INFO)
f_handler.setLevel(logging.DEBUG)

# Create formatters and add to handlers
c_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)


In [151]:
# ==========================
# Function to load JSON files
# ==========================

def load_json(filepath):
    with open(filepath, 'r') as file:
        return json.load(file)

In [152]:
# ==========================
# Loading Files
# ==========================

base_path = 'arc-agi-genesis/data/challenges/'
training_challenges = load_json(base_path + 'arc-agi_training_challenges.json')
training_solutions = load_json(base_path + 'arc-agi_training_solutions.json')

evaluation_challenges = load_json(base_path + 'arc-agi_evaluation_challenges.json')
evaluation_solutions = load_json(base_path + 'arc-agi_evaluation_solutions.json')

test_challenges = load_json(base_path + 'arc-agi_test_challenges.json')

task_sets = {
    'training': {
        'challenges': training_challenges,
        'solutions': training_solutions,
    },
    'evaluation': {
        'challenges': evaluation_challenges,
        'solutions': evaluation_solutions,
    }
}

In [153]:
# ==========================
# Function to load tasks from a pre-loaded task set
# ==========================

def load_tasks_from_file(task_set):
    """
    Loads the tasks from the pre-loaded JSON data and returns the challenges and solutions tasks.
    """
    challenges = task_set['challenges']
    solutions = task_set['solutions']

    return challenges, solutions

In [154]:
# ==========================
# Print Dataset Information
# ==========================

print(f'Number of training challenges = {len(training_challenges)}')
print(f'Number of solutions of training challenges = {len(training_solutions)}')
# Loading tasks from the 'training' task set
challenges, solutions = load_tasks_from_file(task_set=task_sets['training'])
print(json.dumps(challenges['0520fde7'], indent=2))  # Accessing a specific challenge

Number of training challenges = 400
Number of solutions of training challenges = 400
{
  "test": [
    {
      "input": [
        [
          1,
          0,
          1,
          5,
          1,
          0,
          1
        ],
        [
          0,
          1,
          0,
          5,
          1,
          0,
          1
        ],
        [
          1,
          0,
          1,
          5,
          0,
          1,
          0
        ]
      ]
    }
  ],
  "train": [
    {
      "input": [
        [
          1,
          0,
          0,
          5,
          0,
          1,
          0
        ],
        [
          0,
          1,
          0,
          5,
          1,
          1,
          1
        ],
        [
          1,
          0,
          0,
          5,
          0,
          0,
          0
        ]
      ],
      "output": [
        [
          0,
          0,
          0
        ],
        [
          0,
          2,
          0
        ],
        [
    

In [155]:
# ==========================
# Initializing LLM Client to Use
# ==========================

from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv('api.env')

# Get the OpenAI API key from environment variables
openai_api_key = os.getenv('OPENAI_API_KEY')

# Initialize the ChatOpenAI model with the API key
llm = ChatOpenAI(model='gpt-4o-mini', openai_api_key=openai_api_key, max_tokens=3000)

## And in case you want to try Anthropic
# from langchain_anthropic import ChatAnthropic
# llm = ChatAnthropic(model='claude-3-5-sonnet-20240620', api_key=os.getenv("ANTHROPIC_API_KEY"), max_tokens=3000)
# from langchain_google_genai import ChatGoogleGenerativeAI
# llm = ChatGoogleGenerativeAI(model="gemini-1.5-pro", google_api_key=os.getenv("GOOGLE_API_KEY"), max_tokens=3000)



  llm = ChatOpenAI(model='gpt-4o-mini', openai_api_key=openai_api_key, max_tokens=3000)


In [156]:
# ==========================
# Converting Train and Test Pairs to a String Format Ideal for LLMs
# ==========================

def json_task_to_string(challenge_tasks: dict, task_id: str, test_input_index: int) -> str:
    """
    challenge_tasks: dict a list of tasks
    task_id: str the id of the task we want to convert to a string

    Convert your json task into a string so you can pass it to your LLM.
    This is a crucial step where you can use your creativity to edit how tasks are represented.
    """
    json_task = challenge_tasks[task_id]

    final_output = ""

    train_tasks = json_task['train']
    test_task = json_task['test']

    final_output = "Training Examples\n"

    for i, task in enumerate(train_tasks):
        final_output += f"Example {i + 1}: Input\n["
        for row in task['input']:
            final_output += f"\n{str(row)},"

        final_output += "]\n\n"
        final_output += f"Example {i + 1}: Output\n["

        for row in task['output']:
            final_output += f"\n{str(row)},"

        final_output += "]\n\n"

    final_output += "Test\n["
    for row in test_task[test_input_index]['input']:
        final_output += f"\n{str(row)}"

    final_output += "]\n\nYour Response:"

    return final_output

# Example of how the function works

# Wrap the example_task in a task_id
example_tasks = {
    'example_id': example_task
}

# Convert to string using the correct task_id
task_string = json_task_to_string(example_tasks, 'example_id', 0)
print(task_string)

Training Examples
Example 1: Input
[
[8, 6],
[6, 4],]

Example 1: Output
[
[8, 6, 8, 6, 8, 6],
[6, 4, 6, 4, 6, 4],
[6, 8, 6, 8, 6, 8],
[4, 6, 4, 6, 4, 6],
[8, 6, 8, 6, 8, 6],
[6, 4, 6, 4, 6, 4],]

Test
[
[3, 2]
[7, 8]]

Your Response:


In [157]:
# ==========================
# Using a Structured Output Parser to Parse the Output
# ==========================

# Defining a prediction as a list of lists
class ARCPrediction(BaseModel):
    prediction: List[List[int]] = Field(..., description="A prediction for a task")

# Define the planner output model
class PlannerStep(BaseModel):
    plan: str
    step_id: str
    tool: str
    tool_input: str

class PlannerOutput(BaseModel):
    steps: List[PlannerStep]

In [158]:
# ==========================
# Setting Up LangGraph for ReWOO Framework
# ==========================

# Define the ReWOO State
class ReWOO(TypedDict):
    task: str
    plan_string: str
    steps: List[Tuple[str, str, str, str]]
    results: dict
    result: List[List[int]]  # Changed from 'str' to 'List[List[int]]'

# Initialize the StateGraph
graph_builder = StateGraph(ReWOO)

# Define the Planner Node
import re
from langchain_core.prompts import ChatPromptTemplate

# Define the planner prompt with structured output instructions
planner_prompt = """For the following task, create a detailed step-by-step plan to solve the problem. For each step, specify the tool to use and the exact input for that tool. Each step should follow the format: 

Plan: <Detailed plan description>
<Step Identifier> = <Tool>[<Tool Input>]

Only use the tools provided below:

1. **LLM[input]**: A pretrained language model like yourself. Use this when you need to leverage general world knowledge or perform reasoning. The input can be any instruction or question.

**Example:**

Task: Calculate the number of hours Rebecca worked if Thomas worked x hours, Toby worked 10 hours less than twice what Thomas worked, and Rebecca worked 8 hours less than Toby.

Plan: 
Plan: Translate the problem into algebraic equations.
#E1 = LLM[Translate the problem into algebraic equations based on the given information.]
Plan: Solve for Thomas's hours.
#E2 = LLM[Solve the equation derived from #E1 to find the value of x.]
Plan: Calculate Toby's hours using the value of x from #E2.
#E3 = LLM[Calculate Toby's hours as 2*x - 10 using the value of x obtained from #E2.]
Plan: Determine Rebecca's hours based on Toby's hours from #E3.
#E4 = LLM[Calculate Rebecca's hours as #E3 - 8.]

Begin!
Describe your plans with rich details. Each Plan should be followed by only one #E."""

# Initialize the structured output parser for the planner
planner_parser = StructuredOutputParser.from_pydantic(PlannerOutput)
planner_format_instructions = planner_parser.get_format_instructions()

# Update the planner prompt to include format instructions
planner_prompt_full = f"""For the following task, create a detailed step-by-step plan to solve the problem. For each step, specify the tool to use and the exact input for that tool. Each step should follow the format: 

Plan: <Detailed plan description>
<Step Identifier> = <Tool>[<Tool Input>]

Only use the tools provided below:

1. **LLM[input]**: A pretrained language model like yourself. Use this when you need to leverage general world knowledge or perform reasoning. The input can be any instruction or question.

**Example:**

Task: Calculate the number of hours Rebecca worked if Thomas worked x hours, Toby worked 10 hours less than twice what Thomas worked, and Rebecca worked 8 hours less than Toby.

Plan: 
Plan: Translate the problem into algebraic equations.
#E1 = LLM[Translate the problem into algebraic equations based on the given information.]
Plan: Solve for Thomas's hours.
#E2 = LLM[Solve the equation derived from #E1 to find the value of x.]
Plan: Calculate Toby's hours using the value of x from #E2.
#E3 = LLM[Calculate Toby's hours as 2*x - 10 using the value of x obtained from #E2.]
Plan: Determine Rebecca's hours based on Toby's hours from #E3.
#E4 = LLM[Calculate Rebecca's hours as #E3 - 8.]

Begin!
Describe your plans with rich details. Each Plan should be followed by only one #E.

{planner_format_instructions}

Task: {{task}}"""

planner_chat_prompt = ChatPromptTemplate.from_messages([("user", planner_prompt_full)])

# Define the planner with the parser
planner = (planner_chat_prompt | llm | planner_parser)

def get_plan(state: ReWOO):
    task = state["task"]
    logger.debug(f"Planner Node: Generating plan for task: {task}")
    result = planner.invoke({"task": task})
    logger.debug(f"Planner Node: Received plan: {result}")
    
    # Extract the steps from the structured output
    steps = [(step.plan, step.step_id, step.tool, step.tool_input) for step in result.steps]
    logger.debug(f"Planner Node: Extracted steps: {steps}")
    
    return {"steps": steps, "plan_string": result.json()}

# Define the Tool Execution Node (using only LLM, no Google search)
def _get_current_task(state: ReWOO):
    if "results" not in state or state["results"] is None:
        return 1
    if len(state["results"]) == len(state["steps"]):
        return None
    else:
        return len(state["results"]) + 1

def tool_execution(state: ReWOO):
    """Worker node that executes the tools of a given plan."""
    _step = _get_current_task(state)
    if _step is None:
        logger.debug("Tool Execution Node: No more steps to execute.")
        return {}  # No more steps to execute

    step = state["steps"][_step - 1]
    _plan, step_name, tool, tool_input = step
    logger.debug(f"Tool Execution Node: Executing Step {_step}: {step_name} = {tool}[{tool_input}]")

    _results = state.get("results", {})
    # Substitute any variables in the tool_input
    for k, v in _results.items():
        tool_input = tool_input.replace(k, v)

    try:
        if tool.lower() == "llm":
            result = llm.invoke(tool_input)
            logger.debug(f"Tool Execution Node: LLM response: {result.content}")
            # Parse the LLM's JSON response
            parsed_result = json.loads(result.content.strip())
            response = parsed_result.get("prediction", [])
        else:
            raise ValueError(f"Unknown tool: {tool}")
    except Exception as e:
        logger.error(f"Tool Execution Node: Error executing tool {tool}: {e}")
        response = f"Error: {e}"

    _results[step_name] = response  # Assign the parsed response directly
    logger.debug(f"Tool Execution Node: Updated results: {_results}")

    return {"results": _results}

# Define the Solver Node
solve_prompt = """Solve the following task or problem. To solve the problem, we have made step-by-step Plan and retrieved corresponding Evidence to each Plan. Use them with caution since long evidence might contain irrelevant information.

{plan}

Now solve the question or task according to provided Evidence above. Respond with the answer directly in valid JSON format without any extra words.

Task: {task}
Response:"""

def solve(state: ReWOO):
    plan = ""
    for _plan, step_name, tool, tool_input in state["steps"]:
        _results = state.get("results", {})
        for k, v in _results.items():
            tool_input = tool_input.replace(k, v)
            step_name = step_name.replace(k, v)
        plan += f"Plan: {_plan}\n{step_name} = {tool}[{tool_input}]\n"

    prompt = solve_prompt.format(plan=plan, task=state["task"])
    logger.debug(f"Solver Node: Generating final response with prompt:\n{prompt}")
    result = llm.invoke(prompt)
    logger.debug(f"Solver Node: Solver response: {result.content}")
    
    try:
        parsed_result = json.loads(result.content.strip())
        prediction = parsed_result.get("prediction", [])
    except json.JSONDecodeError as e:
        logger.error(f"JSON parsing error: {e}")
        prediction = []

    return {"result": prediction}


NameError: name 'StructuredOutputParser' is not defined

In [None]:
# ==========================
# Define Routing Function
# ==========================

def _route(state: ReWOO):
    _step = _get_current_task(state)
    if _step is None:
        logger.debug("Routing Function: All tasks executed, routing to 'solve' node.")
        return "solve"
    else:
        logger.debug(f"Routing Function: Continuing with tool execution node.")
        return "tool"

In [None]:
# ==========================
# Building the Graph
# ==========================

graph_builder.add_node("plan", get_plan)
graph_builder.add_node("tool", tool_execution)
graph_builder.add_node("solve", solve)

graph_builder.add_edge("plan", "tool")
graph_builder.add_conditional_edges("tool", _route)
graph_builder.add_edge("solve", END)
graph_builder.add_edge(START, "plan")

graph = graph_builder.compile()

from IPython.display import Image, display

try:
    display(Image(graph.get_graph(xray=True).draw_mermaid_png()))
except Exception as e:
    logger.warning(f"Visualization failed: {e}")

In [127]:
# ==========================
# Creating Function to Get Task Prediction
# ==========================

def get_task_prediction(challenge_tasks, task_id, test_input_index) -> List[List[int]]:
    task_string = json_task_to_string(challenge_tasks, task_id, test_input_index)
    task_data = challenge_tasks[task_id]

    initial_state: ReWOO = {
        'task': task_string,
        'plan_string': '',
        'steps': [],
        'results': {},
        'result': []
    }

    logger.info(f"Starting task prediction for task_id: {task_id}, test_input_index: {test_input_index}")
    logger.debug(f"Initial State: {initial_state}")

    final_state = graph.invoke(initial_state)
    logger.debug(f"Final State after graph execution: {final_state}")

    prediction = final_state.get('result', [])

    # Safety check
    if not all(isinstance(sublist, list) and all(isinstance(item, int) for item in sublist) for sublist in prediction):
        logger.warning("Output must be a list of lists of integers.")
        logger.warning(f"Errored Output: {prediction}")
        raise ValueError("Output must be a list of lists of integers.")

    num_rows = len(prediction)
    num_cols = len(prediction[0]) if num_rows > 0 else 0
    logger.info(f"Prediction Grid Size: {num_rows}x{num_cols}")

    return prediction


In [128]:
# ==========================
# Run Model Function
# ==========================

def run_model(challenges, solutions, NUM_ATTEMPTS=2, RETRY_ATTEMPTS=3, NUM_TASKS=None):
    """
    challenges: dict a list of challenges. This should come directly from your _challenges file
    solutions: dict, the ground truth solutions you'd like to test against
    NUM_ATTEMPTS: int the number of times to attempt a prediction. The official competition has 2 attempts.
    RETRY_ATTEMPTS: int the number of times to retry a prediction if it fails
    NUM_TASKS: int, If set, this represents the the number of tasks you'd like to test. If None then the all challenges will be tested

    Loop through your challenges and produce a submission.json file you can submit for a score.
    """

    # A dict to hold your submissions that you'll return after all predictions are made
    submission = {}

    # Run through each task in your challenge set
    for i, task_id in enumerate(challenges):
        task_attempts = []  # List to store all attempts for the current task

        # Go through each test pair to get a prediction. 96% of challenges have 1 pair.
        for t, pair in enumerate(challenges[task_id]['test']):
            logger.info(f"Starting task #{i + 1} ({task_id}), pair #{t+1}")

            # Dictionary to store attempts for the current test pair
            pair_attempts = {}

            # Run through each prediction attempt
            for attempt in range(1, NUM_ATTEMPTS + 1):
                attempt_key = f"attempt_{attempt}"
                pair_attempts[attempt_key] = []  # Init your attempt

                # Run through retries
                for retry in range(RETRY_ATTEMPTS):
                    try:
                        logger.info(f"    Predicting attempt #{attempt}, retry #{retry + 1}")
                        prediction = get_task_prediction(challenge_tasks=challenges,
                                                         task_id=task_id,
                                                         test_input_index=t)

                        # If you get a valid prediction (list of lists of ints) with no error, then log the attempt
                        pair_attempts[attempt_key] = prediction
                        logger.debug(f"    Prediction successful: {prediction}")
                        break  # Break the retry loop if prediction is successful
                    except Exception as e:
                        logger.error(f"    Retrying: {e}")
                        if retry == RETRY_ATTEMPTS - 1:
                            pair_attempts[attempt_key] = []  # Assign empty list if all retries fail
                            logger.error(f"    All retries failed for attempt #{attempt}")

            # After you get your attempts, append them to the task attempts
            task_attempts.append(pair_attempts)

        # Append the task attempts to the submission with the task_id as the key
        submission[task_id] = task_attempts

        # If you want to stop after N tasks, uncomment the below
        if NUM_TASKS is not None and i + 1 == NUM_TASKS:
            logger.info(f"Reached the limit of {NUM_TASKS} tasks.")
            break

    return submission

In [129]:
# ==========================
# Creating Submission Files and Comparing with Solutions
# ==========================

# Create submission file
def create_submission_file(submission, file_name='submission.json'):
    """
    Save a submission file to the specified file name
    """
    with open(file_name, "w") as file:
        json.dump(submission, file, indent=2)

    logger.info(f"Submission saved to {file_name}")

# Create function to compare submission with solutions
def score_submission(submission_file_name, solutions) -> Tuple[float, int]:
    """
    submission_file_name: str, the file name of your submission file
    solutions: dict, the ground truth solutions you'd like to test against

    Read a submission from file, score it, then return the score
    """
    logger.info(f"Scoring {submission_file_name}\n")

    # Open your submission file
    with open(submission_file_name, "r") as file:
        submission = json.load(file)

    total_score = 0
    total_tasks = 0

    # Loop through each task in your submission to grade it
    for task_id, task_submission in submission.items():
        total_tasks += 1
        task_score = 0
        num_pairs = len(task_submission)

        # Go through each task. Most will only have 1
        for pair_index, pair_attempts in enumerate(task_submission):
            logger.info(f"Scoring Task {task_id} pair #{pair_index+1}")
            pair_correct = False

            # Look at both of your attempts
            for attempt_key, attempt in pair_attempts.items():
                # check to see if one is correct
                if attempt == solutions[task_id][pair_index]:
                    logger.info(f"Task Id {task_id} pair {pair_index+1} {attempt_key} matches solution")
                    pair_correct = True
                    break  # If it is correct, log it and break the loop

            if pair_correct:
                task_score += 1
                logger.debug(f"Task {task_id} pair {pair_index+1} scored 1 point.")
            else:
                logger.debug(f"Task {task_id} pair {pair_index+1} scored 0 points.")

        task_score /= num_pairs
        total_score += task_score

    return {
        'total_score': total_score,
        'total_tasks_scored': total_tasks
    }


In [130]:
# ==========================
# The Main Function to Bring Everything Together
# ==========================

def main(task_set='training', NUM_TASKS=None, submission_file_name='submission.json'):
    # Load datasets
    challenges, solutions = load_tasks_from_file(task_set=task_sets[task_set])

    # Run the model
    submission = run_model(challenges, solutions, NUM_TASKS=NUM_TASKS)

    # Create (and overwrite) a submission file
    create_submission_file(submission, file_name=submission_file_name)

    # Score the submission
    score_result = score_submission(submission_file_name=submission_file_name, solutions=solutions)

    final_percentage = (score_result['total_score'] / score_result['total_tasks_scored'] * 100) if score_result['total_tasks_scored'] > 0 else 0
    logger.info(f"Final score: {score_result['total_score']} of {score_result['total_tasks_scored']} ({round(final_percentage, 2)}%)")


In [72]:
# ==========================
# Running the Model
# ==========================

if __name__ == "__main__":
    main(task_set='evaluation', NUM_TASKS=1, submission_file_name='submission.json')


2024-10-16 11:11:53,207 - INFO - Starting task #1 (00576224), pair #1
2024-10-16 11:11:53,207 - INFO - Starting task #1 (00576224), pair #1
2024-10-16 11:11:53,207 - INFO - Starting task #1 (00576224), pair #1
2024-10-16 11:11:53,207 - INFO - Starting task #1 (00576224), pair #1
2024-10-16 11:11:53,207 - INFO - Starting task #1 (00576224), pair #1
2024-10-16 11:11:53,214 - INFO -     Predicting attempt #1, retry #1
2024-10-16 11:11:53,214 - INFO -     Predicting attempt #1, retry #1
2024-10-16 11:11:53,214 - INFO -     Predicting attempt #1, retry #1
2024-10-16 11:11:53,214 - INFO -     Predicting attempt #1, retry #1
2024-10-16 11:11:53,214 - INFO -     Predicting attempt #1, retry #1
2024-10-16 11:11:53,220 - INFO - Starting task prediction for task_id: 00576224, test_input_index: 0
2024-10-16 11:11:53,220 - INFO - Starting task prediction for task_id: 00576224, test_input_index: 0
2024-10-16 11:11:53,220 - INFO - Starting task prediction for task_id: 00576224, test_input_index: 0
20