In [1]:
# Setup
import re
import os
import json
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import bootstrap
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter

# Number of decimals shown by pandas; the same value is reused when rounding
# confidence intervals for the summary tables.
precision = 3
pd.options.display.precision = precision

# Metric reported for each task; the task list follows the same order.
metrics = {
    "gameof24": "success_rate",
    "crosswords": "r_letter",
}
tasks = list(metrics)

#############################################
# Just some util and plot styling functions #
#############################################
def get_files_in_folder(folder_path):
    """Return the sorted paths of the regular files directly inside a folder.

    Sub-directories (and anything else that is not a file) are skipped.
    Each returned path is ``folder_path`` followed by "/" and the entry name.
    """
    return sorted(
        folder_path + "/" + entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

def get_number(string):
    """Return the first run of decimal digits found in ``string`` as an int.

    Raises IndexError when ``string`` contains no digit at all.
    """
    digit_runs = re.findall(r'\d+', string)
    return [int(run) for run in digit_runs][0]

def get_params(file_path):
    """Parse the experiment parameters encoded in a log file name.

    The last "/"-separated component of ``file_path`` is split on "_" and the
    resulting fields are read by position:

        0: set           (text before the first "-")
        1: n_agents      (first integer in the field)
        2: n_steps       (first integer in the field)
        3: k             (first integer in the field)
        4: origin_value  (first integer in the field)
        5: backtrack     (decimal number in the field)
        6: resampling    (text before the first "-")

    Returns a dict with those keys plus ``file_path`` (the input, unchanged)
    and ``name``, a multi-line label built from n_agents, n_steps, k and
    backtrack.

    Raises IndexError when the file name has fewer than seven fields or a
    numeric field contains no digits.
    """
    data = {}

    file_name = file_path.split('/')[-1]
    params = file_name.split('_')

    data["set"] = params[0].split("-")[0]
    data["n_agents"] = get_number(params[1])
    data["n_steps"] = get_number(params[2])
    data["k"] = get_number(params[3])
    data["origin_value"] = get_number(params[4])

    # Read the backtrack value as a literal decimal. Rebuilding it as
    # "0." + int(fractional digits) would drop leading zeros ("0.05" -> 0.5)
    # and ignore the integer part ("1.0" -> 0.0).
    decimal = re.search(r'\d+\.\d+', params[5])
    if decimal:
        data["backtrack"] = float(decimal.group())
    else:
        # No decimal point in the field: fall back to its first integer.
        data["backtrack"] = float(get_number(params[5]))

    data["resampling"] = params[6].split("-")[0]
    data["file_path"] = file_path
    data["name"] = f"{data['n_agents']}agents\n{data['n_steps']}steps\n{data['k']}k\n{data['backtrack']}b"

    return data

def _bootstrap_mean(values, random_state=None):
    """Return ``(mean, (low, high))`` for the bootstrapped mean of ``values``."""
    sample = np.asarray(values, dtype=float).ravel()
    if sample.size == 0:
        raise ValueError("cannot compute statistics of an empty sample")

    # A single observation or a constant sample has no spread to resample:
    # report the value itself with a zero-width interval instead of asking
    # the bootstrap for an interval it cannot define.
    if sample.size == 1 or np.all(sample == sample[0]):
        constant = float(sample[0])
        return constant, (constant, constant)

    # Only forward the seed when one was given so the default call is unchanged.
    options = {} if random_state is None else {"random_state": random_state}
    result = bootstrap((sample,), np.mean, **options)
    return result.bootstrap_distribution.mean(), result.confidence_interval


def compute_stats(puzzle_results, costs, random_state=None):
    """Bootstrap the mean performance and the mean cost of a set of runs.

    Parameters
    ----------
    puzzle_results : sequence of numbers
        One score per puzzle (pooled over runs).
    costs : sequence of numbers
        One total cost per run.
    random_state : optional
        Seed / generator forwarded to ``scipy.stats.bootstrap`` to make the
        resampling reproducible. ``None`` (default) keeps it unseeded.

    Returns
    -------
    dict with keys ``performance_mean``, ``performance_ci``, ``cost_mean`` and
    ``cost_ci``. Each ``*_mean`` is the mean of the bootstrap distribution and
    each ``*_ci`` is a ``(low, high)`` pair computed with SciPy's default
    bootstrap settings. Samples with a single value or with all values equal
    yield that value and a zero-width interval.

    Raises
    ------
    ValueError
        If ``puzzle_results`` or ``costs`` is empty.
    """
    # Named `summary` rather than `stats` to avoid shadowing `scipy.stats`.
    summary = {}

    performance_mean, performance_ci = _bootstrap_mean(puzzle_results, random_state)
    summary["performance_mean"] = performance_mean
    summary["performance_ci"] = performance_ci

    cost_mean, cost_ci = _bootstrap_mean(costs, random_state)
    summary["cost_mean"] = cost_mean
    summary["cost_ci"] = cost_ci

    return summary

def get_gameof24_puzzle_results_foa(file_path):
    """Load one FoA Game-of-24 log and return ``(per-puzzle results, cost)``.

    The JSON file holds an "Info" entry (from which the total cost is read)
    next to one entry per puzzle. A puzzle scores 1 when its "Verifications"
    list contains ``{"r": 1}`` and 0 otherwise.
    """
    with open(file_path, "r") as experiment_file:
        log = json.load(experiment_file)

    # Everything left after removing "Info" is a puzzle entry.
    total_cost = log.pop("Info")["Cost"]["Total cost"]["total_cost"]

    solved_marker = {"r": 1}
    outcomes = [int(solved_marker in puzzle["Verifications"]) for puzzle in log.values()]
    return outcomes, total_cost

def get_crosswords_puzzle_results_foa(file_path, metric="r_letter"):
    """Load one FoA crosswords log and return ``(per-puzzle scores, cost)``.

    For every puzzle, the step with the most actions across all agents is
    selected (the first one encountered wins ties) and its "metrics" dict is
    kept. The number of actions is the number of " -> "-separated parts of
    the step's "Step" string. ``metric`` picks which of "r_letter", "r_word"
    or "r_all" is returned, one value per puzzle.

    Raises AssertionError when a puzzle contains no step at all.
    """
    with open(file_path, "r") as experiment_file:
        log = json.load(experiment_file)
    total_cost = log.pop("Info")["Cost"]["Total cost"]["total_cost"]

    # Per-puzzle entries that are not agents and must be ignored.
    bookkeeping_keys = ("puzzle", "Verifications")

    best_per_puzzle = {}
    for puzzle_id, puzzle_log in log.items():
        best_per_puzzle[puzzle_id] = {"r_letter": None, "r_word": None, "r_all": None}
        longest = 0
        agent_logs = (agent for key, agent in puzzle_log.items() if key not in bookkeeping_keys)
        for agent_log in agent_logs:
            for step_log in agent_log.values():
                n_actions = step_log["Step"].count(" -> ") + 1
                if n_actions <= longest:
                    continue
                longest = n_actions
                best_per_puzzle[puzzle_id] = step_log["metrics"]
        assert longest > 0, f"No actions found for {puzzle_id}"

    per_metric = {
        name: [best[name] for best in best_per_puzzle.values()]
        for name in ("r_letter", "r_word", "r_all")
    }
    return per_metric[metric], total_cost


# Dispatch table: task name -> loader for that task's FoA experiment logs.
get_task_puzzle_results_foa = dict(
    gameof24=get_gameof24_puzzle_results_foa,
    crosswords=get_crosswords_puzzle_results_foa,
)

In [12]:
# Loading the data
#
# FoA experiment logs are grouped per task and per optimization target:
#   cost        : optimizing for cost
#   performance : optimizing for performance
#   both        : optimizing for both
#   gpt4        : gpt-4 powered evaluation
data = {
    "gameof24": {"cost": {}, "performance": {}, "both": {}, "gpt4": {}},
    "crosswords": {"cost": {}, "performance": {}, "both": {}, "gpt4": {}},
}

for task, optimizations in data.items():
    for optimization in optimizations:
        folder = f"../arxiv/logs/{task}/{optimization}"
        files = get_files_in_folder(folder)
        experiments = [get_params(file) for file in files]

        # Group the runs that share a configuration under its display name.
        configurations = {}
        for experiment in experiments:
            configurations.setdefault(experiment["name"], []).append(experiment)

        for configuration, runs in configurations.items():
            # Puzzle results are pooled over the runs; one cost is kept per run.
            puzzle_results = []
            costs = []
            for run in runs:
                result, cost = get_task_puzzle_results_foa[task](run["file_path"])
                costs.append(cost)
                puzzle_results.extend(result)

            # Compute stats. Stored under its own name so the module-level
            # `stats` imported from scipy is not rebound to a dict.
            configuration_stats = compute_stats(puzzle_results, costs)

            data[task][optimization].update(
                {configuration: {"experiments": runs, "stats": configuration_stats}}
            )

## Benchmarks

In [7]:
def get_gameof24_puzzle_results_tot(file_path, method="ToT"):
    """Load one benchmark Game-of-24 log and return ``(per-puzzle results, cost)``.

    The JSON file is a list with one record per puzzle. The cost is read from
    the "usage_so_far" entry of the last record. With ``method == "ToT"`` a
    puzzle scores 1 when its "infos" list contains ``{"r": 1}`` and 0
    otherwise; for any other method the score is the mean of "r" over the
    puzzle's "infos".
    """
    with open(file_path, "r") as experiment_file:
        records = json.load(experiment_file)

    # Get cost
    total_cost = records[-1]["usage_so_far"]["cost"]

    # Get result per puzzle
    if method == "ToT":
        outcomes = [int({"r": 1} in record["infos"]) for record in records]
    else:
        outcomes = [np.mean([info["r"] for info in record["infos"]]) for record in records]
    return outcomes, total_cost

def get_crosswords_puzzle_results_tot(file_path, metric="r_letter", method="ToT"):
    """Load one benchmark crosswords log and return ``(per-game scores, cost)``.

    ``method == "ToT"``: the last list element carries the cost and every
    other element is a game, i.e. a list of steps. For each game the step
    with the most "actions" is selected (first one on ties) and its "info"
    supplies "r_letter" and "r_word"; "r_game" is 1 when "r_word" equals 1
    and 0 otherwise. A game without steps scores 0 everywhere.

    Any other method: the cost is read from "usage_so_far" of the last
    record, and each game's score is the mean of the metric over its "infos".

    ``metric`` selects one of "r_letter", "r_word" or "r_game".
    """
    with open(file_path) as log_file:
        games = json.load(log_file)

    if method == "ToT":
        # The trailing element is not a game: it only holds the cost.
        cost = games.pop(-1)["cost"]

        def deepest_step(steps):
            depths = [len(step["actions"]) for step in steps]
            if not depths:
                # Empty game -> No suggestions at root node
                return {"total_step":0, "env_step":0, "actions":[], 'info': {'r_letter': 0, 'r_word': 0},}
            return steps[depths.index(max(depths))]

        # Get results per puzzle
        best_steps = [deepest_step(steps) for steps in games]
        r_letters = [best["info"]["r_letter"] for best in best_steps]
        r_words = [best["info"]["r_word"] for best in best_steps]
        r_games = [int(best["info"]["r_word"] == 1) for best in best_steps]
    else:
        # ie. naive + cot
        cost = games[-1]["usage_so_far"]["cost"]
        r_letters, r_words, r_games = [], [], []
        for game in games:
            r_letters.append(np.mean([sample["r_letter"] for sample in game["infos"]]))
            r_words.append(np.mean([sample["r_word"] for sample in game["infos"]]))
            r_games.append(np.mean([sample["r_game"] for sample in game["infos"]]))

    scores = {"r_letter": r_letters, "r_word": r_words, "r_game": r_games}
    return scores[metric], cost

# Dispatch table: task name -> loader for that task's benchmark logs.
get_task_puzzle_results_tot = dict(
    gameof24=get_gameof24_puzzle_results_tot,
    crosswords=get_crosswords_puzzle_results_tot,
)

In [39]:
# ChatGPT benchmarks: one stats dict per task and per baseline method.
benchmarks = {"gameof24": {"ToT": {}}, "crosswords": {"ToT": {}}}

methods = ["ToT"] # propose goes for ToT

for task in benchmarks.keys():
    for method in methods:
        folder = f"../arxiv/benchmarks/{task}/ChatGPT/{method}"
        files = get_files_in_folder(folder)

        # Puzzle results are pooled over the log files; one cost is kept per file.
        puzzle_results = []
        costs = []

        for file in files:
            result, cost = get_task_puzzle_results_tot[task](file, method=method)
            costs.append(cost)
            puzzle_results.extend(result)

        # Compute stats. Stored under its own name so the module-level
        # `stats` imported from scipy is not rebound to a dict.
        benchmark_stats = compute_stats(puzzle_results, costs)

        benchmarks[task][method] = benchmark_stats

## FoA beating ToT

In [40]:
def group_data_per_optimization(data, benchmarks, optimization, task=None):
    """Build a comparison table of FoA configurations and benchmark methods.

    For every task in ``data`` the table gets one row per FoA configuration
    stored under ``optimization`` (method "FoA") followed by one row per
    entry of ``benchmarks[task]`` (configuration "default", optimization
    ``None``). Each row also carries the entries of the corresponding stats
    dict. The "performance_ci" and "cost_ci" columns are rounded to the
    notebook-level ``precision``.

    When ``task`` is given, only the rows of that task are returned.
    """
    rows = []
    for task_name, per_optimization in data.items():
        # Get FoA results
        for configuration, entry in per_optimization[optimization].items():
            rows.append({
                "task": task_name,
                "method": "FoA",
                "configuration": configuration,
                **entry["stats"],
                "optimization": optimization,
            })

        # Get ToT results
        for method, method_stats in benchmarks[task_name].items():
            rows.append({
                "task": task_name,
                "method": method,
                "configuration": "default",
                **method_stats,
                "optimization": None,
            })

    table = pd.DataFrame(rows)

    def rounded(interval):
        low, high = interval[0], interval[1]
        return (round(low, precision), round(high, precision))

    for column in ("performance_ci", "cost_ci"):
        table[column] = table[column].apply(rounded)

    # If a task is specified filter the df
    if not task:
        return table
    return table[table.task == task]

# FoA configurations tuned for both cost and performance, next to the benchmarks.
df_both = group_data_per_optimization(data=data, benchmarks=benchmarks, optimization="both")
display(df_both)

Unnamed: 0,task,method,configuration,performance_mean,performance_ci,cost_mean,cost_ci,optimization
0,gameof24,FoA,9agents\n9steps\n1k\n0.5b,0.252,"(0.214, 0.29)",1.548,"(1.522, 1.58)",both
1,gameof24,ToT,default,0.136,"(0.108, 0.168)",1.712,"(1.636, 1.75)",
2,crosswords,FoA,2agents\n6steps\n3k\n0.5b,0.362,"(0.325, 0.4)",0.246,"(0.238, 0.252)",both
3,crosswords,ToT,default,0.333,"(0.276, 0.392)",0.481,"(0.376, 0.585)",


## Pareto optimals

### A. Optimizing for cost

In [41]:
# FoA configurations tuned for cost, next to the benchmarks.
df_cost = group_data_per_optimization(data=data, benchmarks=benchmarks, optimization="cost")
display(df_cost)

Unnamed: 0,task,method,configuration,performance_mean,performance_ci,cost_mean,cost_ci,optimization
0,gameof24,FoA,7agents\n6steps\n1k\n0.5b,0.154,"(0.124, 0.188)",1.042,"(1.032, 1.05)",cost
1,gameof24,ToT,default,0.136,"(0.108, 0.168)",1.712,"(1.636, 1.75)",
2,crosswords,FoA,4agents\n4steps\n3k\n0.5b,0.318,"(0.284, 0.352)",0.173,"(0.164, 0.176)",cost
3,crosswords,ToT,default,0.333,"(0.276, 0.392)",0.481,"(0.376, 0.585)",


### B. Optimizing for performance

In [42]:
# FoA configurations tuned for performance, next to the benchmarks.
df_performance = group_data_per_optimization(data=data, benchmarks=benchmarks, optimization="performance")
display(df_performance)

Unnamed: 0,task,method,configuration,performance_mean,performance_ci,cost_mean,cost_ci,optimization
0,gameof24,FoA,15agents\n6steps\n1k\n0.2b,0.264,"(0.226, 0.304)",1.793,"(1.771, 1.807)",performance
1,gameof24,ToT,default,0.136,"(0.108, 0.168)",1.712,"(1.636, 1.75)",
2,crosswords,FoA,4agents\n6steps\n3k\n0.5b,0.402,"(0.36, 0.442)",0.481,"(0.47, 0.499)",performance
3,crosswords,ToT,default,0.333,"(0.276, 0.392)",0.481,"(0.376, 0.585)",


### Summary ChatGPT

In [51]:
# Stack the three tables, order them by task / method / optimization and drop
# the benchmark rows that are repeated in each of them.
df = (
    pd.concat([df_both, df_cost, df_performance])
    .sort_values(by=["task", "method", "optimization"], ascending=[False, True, True])
    .drop_duplicates()
    .reset_index(drop=True)
)
df

Unnamed: 0,task,method,configuration,performance_mean,performance_ci,cost_mean,cost_ci,optimization
0,gameof24,FoA,9agents\n9steps\n1k\n0.5b,0.252,"(0.214, 0.29)",1.548,"(1.522, 1.58)",both
1,gameof24,FoA,7agents\n6steps\n1k\n0.5b,0.154,"(0.124, 0.188)",1.042,"(1.032, 1.05)",cost
2,gameof24,FoA,15agents\n6steps\n1k\n0.2b,0.264,"(0.226, 0.304)",1.793,"(1.771, 1.807)",performance
3,gameof24,ToT,default,0.136,"(0.108, 0.168)",1.712,"(1.636, 1.75)",
4,crosswords,FoA,2agents\n6steps\n3k\n0.5b,0.362,"(0.325, 0.4)",0.246,"(0.238, 0.252)",both
5,crosswords,FoA,4agents\n4steps\n3k\n0.5b,0.318,"(0.284, 0.352)",0.173,"(0.164, 0.176)",cost
6,crosswords,FoA,4agents\n6steps\n3k\n0.5b,0.402,"(0.36, 0.442)",0.481,"(0.47, 0.499)",performance
7,crosswords,ToT,default,0.333,"(0.276, 0.392)",0.481,"(0.376, 0.585)",


## GPT-4

In [36]:
# GPT4 Benchmarks
# NOTE: this rebinds `benchmarks`, replacing the results read from the
# ChatGPT benchmark folders earlier in the notebook.
benchmarks = {"gameof24": {"ToT": {}}, "crosswords": {"ToT": {}}}

methods = ["ToT"] # propose goes for ToT

for task in benchmarks.keys():
    for method in methods:
        folder = f"../arxiv/benchmarks/{task}/GPT4/{method}"
        files = get_files_in_folder(folder)

        # Puzzle results are pooled over the log files; one cost is kept per file.
        puzzle_results = []
        costs = []

        for file in files:
            result, cost = get_task_puzzle_results_tot[task](file, method=method)
            costs.append(cost)
            puzzle_results.extend(result)

        # Compute stats. Stored under its own name so the module-level
        # `stats` imported from scipy is not rebound to a dict.
        benchmark_stats = compute_stats(puzzle_results, costs)

        benchmarks[task][method] = benchmark_stats

In [37]:
# FoA runs stored under the "gpt4" key, next to the current benchmarks.
df = group_data_per_optimization(data=data, benchmarks=benchmarks, optimization="gpt4")
display(df)

Unnamed: 0,task,method,configuration,performance_mean,performance_ci,cost_mean,cost_ci,optimization
0,gameof24,FoA,9agents\n9steps\n1k\n0.5b,0.76,"(0.67, 0.84)",62.93,"(62.93, 62.93)",gpt4
1,gameof24,ToT,default,0.75,"(0.66, 0.83)",75.024,"(75.024, 75.024)",
2,crosswords,FoA,2agents\n6steps\n3k\n0.5b,0.46,"(0.342, 0.566)",12.938,"(12.938, 12.938)",gpt4
3,crosswords,ToT,default,0.394,"(0.256, 0.544)",48.989,"(48.989, 48.989)",
